diff --git a/bsd/crypto/aes/i386/aes_modes_hw.s b/bsd/crypto/aes/i386/aes_modes_hw.s index 574ee3c72..c9702eaec 100644 --- a/bsd/crypto/aes/i386/aes_modes_hw.s +++ b/bsd/crypto/aes/i386/aes_modes_hw.s @@ -1,1669 +1,1622 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 31/01/2006 - - These subroutines implement multiple block AES modes for ECB, CBC, CFB, - OFB and CTR encryption, The code provides support for the VIA Advanced - Cryptography Engine (ACE). 
- - NOTE: In the following subroutines, the AES contexts (ctx) must be - 16 byte aligned if VIA ACE is being used -*/ - -/* modified 3/5/10 cclee */ -/* Clean up those related to VIA ACE and hand optimize aes_cbc_encrypt and aes_cbc_decrypt */ -/* move the xmm registers save/restore originally inside the callee functions into these 2 caller functions */ - -/* HW-AES specific implementation cclee 3-12-10 */ -/* In aes_encrypt_cbc and aes_decrypt_cbc, __cpu_capabilities is polled, - and if kHasAES is detected, branch to the hw-specific functions here */ - - -/* - This file defines _aes_encrypt_cbc_hw and _aes_decrypt_cbc_hw --- Intel Westmere HW AES-based implementation - of _aes_encrypt_cbc and _aes_decrypt_cbc. - - These 2 functions SHOULD BE entered ONLY after the AES HW is verified to be available. - They SHOULD NOT be called without AES HW detection. It might cause xnu to crash. - - The AES HW is detected 1st thing in - _aes_encrypt_cbc (aes_modes_asm.s) - _aes_decrypt_cbc (aes_modes_asm.s) - and, if AES HW is detected, branch without link (i.e., jump) to the functions here. - - The implementation here follows the examples in an Intel White Paper - "Intel Advanced Encryption Standard (AES) Instruction Set" Rev.2 01 - - Note: Rev. 03 Final 2010 01 26 is available. Looks like some code change from Rev.2 01 - - cclee 3-13-10 -*/ - -/* - The function _aes_decrypt_cbc_hw previously decrypted serially, block by block. - In our group meeting, Eric/Ali suggested that I perhaps should take a look at combining multiple blocks - in a loop and interleaving multiple aesdec instructions to absorb/hide stalls to improve the decrypt throughput. - - The idea was actually described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55) - - This modification interleaves the aesdec/aesdeclast instructions for 4 blocks in cbc mode. 
- On a 2.4GHz core-i5/2.66GHz core-i7, the x86_64 decrypt throughput (in xnu-iokit) has been improved - from 1180/1332 to 1667/1858 MBytes/sec. This is approximately a 1.40x speedup in decryption. - The encrypt throughput is not changed. - - I also enhanced the assembly code comments. - - cclee-4-30-10 (Do you know 4-30 is National Honesty Day in the US? No need to know. I've been honest all the time.) - -*/ - -/* ---------------------------------------------------------------------------------------------------------------- - - aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) : - - For simplicity, I am assuming all variables are of a 128-bit data type. - - aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx) - { - while(num_blk--) { - *iv ^= *ibuf++; - aes_encrypt(iv, iv, ctx); - *obuf++ = *iv; - } - return 0; - } - - The following is an implementation of this function using Intel AESNI. - This function _aes_encrypt_cbc_hw SHOULD NOT be called directly. - Developers should still call _aes_encrypt_cbc (in aes_modes_asm.s) which will poll cpu_capabilities and branch - to this aesni-based function should it detect that aesni is available. - Blindly calling this function will SURELY cause a CRASH on systems with no aesni support. - - Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks - are serially chained. This prevents us from arranging several blocks for encryption in parallel. 
- - ----------------------------------------------------------------------------------------------------------------*/ - - .text - .align 4,0x90 - .globl _aes_encrypt_cbc_hw -_aes_encrypt_cbc_hw: - - // set up the frame pointer and save the callee-saved registers used below -#if defined __i386__ - - push %ebp - movl %esp, %ebp - push %ebx - push %edi - - #define sp %esp - -#else // __x86_64__ - - push %rbp - mov %rsp, %rbp - push %rbx - push %r13 - push %r14 - push %r15 - - #define sp %rsp - -#endif - - // KERNEL builds: spill xmm0-xmm7 (xmm0-xmm15 on x86_64) to the stack; they are reloaded at L_error. NOTE(review): movaps needs a 16-byte aligned address, so sp is assumed to be 16-byte aligned here (confirm against the callers) -#ifdef KERNEL - -#if defined __i386__ - sub $(8*16), %esp // 8*16 bytes of save area for xmm0-xmm7 -#else - sub $(16*16), %rsp // 16*16 bytes of save area for xmm0-xmm15 -#endif - - movaps %xmm0, (sp) - movaps %xmm1, 16(sp) - movaps %xmm2, 32(sp) - movaps %xmm3, 48(sp) - movaps %xmm4, 64(sp) - movaps %xmm5, 80(sp) - movaps %xmm6, 96(sp) - movaps %xmm7, 112(sp) -#if defined __x86_64__ - movaps %xmm8, 16*8(sp) - movaps %xmm9, 16*9(sp) - movaps %xmm10, 16*10(sp) - movaps %xmm11, 16*11(sp) - movaps %xmm12, 16*12(sp) - movaps %xmm13, 16*13(sp) - movaps %xmm14, 16*14(sp) - movaps %xmm15, 16*15(sp) -#endif // __x86_64__ - -#endif // KERNEL - - #define iv %xmm0 - -#ifdef __i386__ - - mov 12(%ebp), %eax // in_iv - mov 24(%ebp), %edx // ctx - movups (%eax), iv // iv = *in_iv (kept in xmm0; this routine never stores it back to in_iv) - mov 8(%ebp), %ebx // ibuf - mov 16(%ebp), %ecx // num_blk - mov 20(%ebp), %edi // obuf - - #define ibuf %ebx - #define obuf %edi - #define num_blk %ecx - #define ctx %edx - -#else - - mov %rdi, %rbx // ibuf - movups (%rsi), iv // iv = *in_iv (kept in xmm0; this routine never stores it back to in_iv) - mov %rdx, %r13 // num_blk (only the low 32 bits, r13d, are used below) - mov %rcx, %r14 // obuf - mov %r8, %r15 // ctx - - #define ibuf %rbx - #define num_blk %r13d - #define obuf %r14 - #define ctx %r15 - -#endif - - mov 240(ctx), %eax // 32-bit aes length field at ctx+240, used below to dispatch on the key size - cmp $160, %eax // aes-128 encrypt ? - je L_encrypt_128 - cmp $192, %eax // aes-192 encrypt ? - je L_encrypt_192 - cmp $224, %eax // aes-256 encrypt ? 
- je L_encrypt_256 - mov $-1, %eax // unrecognized length value: return -1 - jmp L_error - - // - // aes-128 encrypt_cbc operation, up to L_HW_cbc_done - // - -L_encrypt_128: - - cmp $1, num_blk // check the number of blocks - jl L_HW_cbc_done // fewer than 1 block: nothing to do, return 0 - - movups (ctx), %xmm2 // key0 - movups 16(ctx), %xmm3 // key1 - movups 32(ctx), %xmm4 // key2 - movups 48(ctx), %xmm5 // key3 - movups 64(ctx), %xmm6 // key4 - movups 80(ctx), %xmm7 // key5 -#if defined __x86_64__ - movups 96(ctx), %xmm8 // key6 - movups 112(ctx), %xmm9 // key7 - movups 128(ctx), %xmm10 // key8 - movups 144(ctx), %xmm11 // key9 - movups 160(ctx), %xmm12 // keyA -#endif - - // while (num_blk--) { - // *iv ^= *ibuf++; - // aes_encrypt(iv, iv, ctx); - // *obuf++ = *iv; - // } -0: - movups (ibuf), %xmm1 // *ibuf - pxor %xmm2, iv // key0 whitening, the 1st step of aes_encrypt - pxor %xmm1, iv // *iv ^= *ibuf - - // remaining rounds of aes_encrypt: 9 aesenc (key1..key9), then aesenclast with keyA - aesenc %xmm3, iv - aesenc %xmm4, iv - aesenc %xmm5, iv - aesenc %xmm6, iv - aesenc %xmm7, iv -#if defined __x86_64__ - aesenc %xmm8, iv - aesenc %xmm9, iv - aesenc %xmm10, iv - aesenc %xmm11, iv - aesenclast %xmm12, iv -#else - movups 96(ctx), %xmm1 // key6 (on i386, key6..keyA are reloaded through xmm1 on every iteration) - aesenc %xmm1, iv - movups 112(ctx), %xmm1 // key7 - aesenc %xmm1, iv - movups 128(ctx), %xmm1 // key8 - aesenc %xmm1, iv - movups 144(ctx), %xmm1 // key9 - aesenc %xmm1, iv - movups 160(ctx), %xmm1 // keyA - aesenclast %xmm1, iv -#endif - - movups iv, (obuf) // *obuf = *iv; - add $16, obuf // obuf++; - add $16, ibuf // ibuf++; - sub $1, num_blk // num_blk-- - jg 0b // if num_blk > 0, repeat the loop - - // common exit: the other encrypt paths, and the decrypt paths that follow, branch here when they are done - -L_HW_cbc_done: - - xor %eax, %eax // return value 0 (CRYPT_OK) - -L_error: - - // reached with eax already holding the return value; in KERNEL builds, restore the saved xmm registers -#ifdef KERNEL - movaps 0(sp), %xmm0 - movaps 16(sp), %xmm1 - movaps 32(sp), %xmm2 - movaps 48(sp), %xmm3 - movaps 64(sp), %xmm4 - movaps 80(sp), %xmm5 - movaps 96(sp), %xmm6 - movaps 112(sp), %xmm7 -#if 
defined __x86_64__ - movaps 16*8(sp), %xmm8 - movaps 16*9(sp), %xmm9 - movaps 16*10(sp), %xmm10 - movaps 16*11(sp), %xmm11 - movaps 16*12(sp), %xmm12 - movaps 16*13(sp), %xmm13 - movaps 16*14(sp), %xmm14 - movaps 16*15(sp), %xmm15 -#endif // __x86_64__ -#endif // KERNEL - - // release the xmm save area, restore the callee-saved registers, and return (eax holds the result) -#if defined __i386__ -#ifdef KERNEL - add $(8*16), %esp -#endif - pop %edi - pop %ebx -#else -#ifdef KERNEL - add $(16*16), %rsp -#endif - pop %r15 - pop %r14 - pop %r13 - pop %rbx -#endif - leave - ret - - // - // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_encrypt_192: - - cmp $1, num_blk // check the number of blocks - jl L_HW_cbc_done // fewer than 1 block: nothing to do, return 0 - - movups (ctx), %xmm2 // key0 - movups 16(ctx), %xmm3 // key1 - movups 32(ctx), %xmm4 // key2 - movups 48(ctx), %xmm5 // key3 - movups 64(ctx), %xmm6 // key4 - movups 80(ctx), %xmm7 // key5 -#if defined __x86_64__ - movups 96(ctx), %xmm8 // key6 - movups 112(ctx), %xmm9 // key7 - movups 128(ctx), %xmm10 // key8 - movups 144(ctx), %xmm11 // key9 - movups 160(ctx), %xmm12 // keyA - movups 176(ctx), %xmm13 // keyB - movups 192(ctx), %xmm14 // keyC -#endif - - // while (num_blk--) { - // *iv ^= *ibuf++; - // aes_encrypt(iv, iv, ctx); - // *obuf++ = *iv; - // } -0: - movups (ibuf), %xmm1 // *ibuf - pxor %xmm1, iv // *iv ^= *ibuf - - // aes_encrypt(iv, iv, ctx): key0 whitening, 11 aesenc (key1..keyB), then aesenclast with keyC - - pxor %xmm2, iv - aesenc %xmm3, iv - aesenc %xmm4, iv - aesenc %xmm5, iv - aesenc %xmm6, iv - aesenc %xmm7, iv -#if defined __x86_64__ - aesenc %xmm8, iv - aesenc %xmm9, iv - aesenc %xmm10, iv - aesenc %xmm11, iv - aesenc %xmm12, iv - aesenc %xmm13, iv - aesenclast %xmm14, iv -#else - movups 96(ctx), %xmm1 // key6 (on i386, key6..keyC are reloaded through xmm1 on every iteration) - aesenc %xmm1, iv - movups 112(ctx), %xmm1 // key7 - aesenc %xmm1, iv - movups 128(ctx), %xmm1 // key8 - aesenc %xmm1, iv - movups 144(ctx), %xmm1 // key9 - aesenc %xmm1, iv - movups 160(ctx), %xmm1 // keyA - aesenc %xmm1, iv - movups 176(ctx), %xmm1 // keyB - aesenc %xmm1, iv - movups 
192(ctx), %xmm1 // keyC - aesenclast %xmm1, iv -#endif - - movups iv, (obuf) // *obuf = *iv; - add $16, ibuf // ibuf++ - add $16, obuf // obuf++ - - sub $1, num_blk // num_blk-- - jg 0b // if num_blk > 0, repeat the loop - - jmp L_HW_cbc_done // share the common exit code - - // - // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_encrypt_256: - - cmp $1, num_blk // check the number of blocks - jl L_HW_cbc_done // fewer than 1 block: nothing to do, return 0 - - movups (ctx), %xmm2 // key0 - movups 16(ctx), %xmm3 // key1 - movups 32(ctx), %xmm4 // key2 - movups 48(ctx), %xmm5 // key3 - movups 64(ctx), %xmm6 // key4 - movups 80(ctx), %xmm7 // key5 -#if defined __x86_64__ - movups 96(ctx), %xmm8 // key6 - movups 112(ctx), %xmm9 // key7 - movups 128(ctx), %xmm10 // key8 - movups 144(ctx), %xmm11 // key9 - movups 160(ctx), %xmm12 // keyA - movups 176(ctx), %xmm13 // keyB - movups 192(ctx), %xmm14 // keyC - movups 208(ctx), %xmm15 // keyD - // keyE is not preloaded here: xmm1 is also the *ibuf temporary, so 224(ctx) is reloaded inside the loop -#endif - - // while (num_blk--) { - // *iv ^= *ibuf++; - // aes_encrypt(iv, iv, ctx); - // *obuf++ = *iv; - // } -0: - movups (ibuf), %xmm1 // *ibuf - pxor %xmm1, iv // *iv ^= *ibuf - - // aes_encrypt(iv, iv, ctx): key0 whitening, 13 aesenc (key1..keyD), then aesenclast with keyE - pxor %xmm2, iv - aesenc %xmm3, iv - aesenc %xmm4, iv - aesenc %xmm5, iv - aesenc %xmm6, iv - aesenc %xmm7, iv -#if defined __x86_64__ - movups 224(ctx), %xmm1 // keyE (xmm1 is free again once *ibuf has been folded into iv) - aesenc %xmm8, iv - aesenc %xmm9, iv - aesenc %xmm10, iv - aesenc %xmm11, iv - aesenc %xmm12, iv - aesenc %xmm13, iv - aesenc %xmm14, iv - aesenc %xmm15, iv - aesenclast %xmm1, iv -#else - movups 96(ctx), %xmm1 // key6 (on i386, key6..keyE are reloaded through xmm1 on every iteration) - aesenc %xmm1, iv - movups 112(ctx), %xmm1 // key7 - aesenc %xmm1, iv - movups 128(ctx), %xmm1 // key8 - aesenc %xmm1, iv - movups 144(ctx), %xmm1 // key9 - aesenc %xmm1, iv - movups 160(ctx), %xmm1 // keyA - aesenc %xmm1, iv - movups 176(ctx), %xmm1 // keyB - aesenc %xmm1, iv - movups 192(ctx), %xmm1 // keyC - aesenc %xmm1, iv - movups 208(ctx), %xmm1 // keyD - aesenc %xmm1, iv - 
movups 224(ctx), %xmm1 // keyE - aesenclast %xmm1, iv -#endif - - movups iv, (obuf) // *obuf = *iv; - add $16, ibuf // ibuf++ - add $16, obuf // obuf++ - - sub $1, num_blk // num_blk-- - jg 0b // if num_blk > 0, repeat the loop - - jmp L_HW_cbc_done // share the common exit code - - - - // - // --------- END of aes_encrypt_cbc_hw ------------------- - // - - -/* ---------------------------------------------------------------------------------------------------------------- - - aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) : - - For simplicity, I am assuming all variables are of a 128-bit data type. - - aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx) - { - while(num_blk--) { - aes_decrypt(ibuf, obuf, ctx); - *obuf++ ^= *iv; - *iv = *ibuf++; - } - return 0; - } - - The following is an implementation of this function using Intel AESNI. - This function _aes_decrypt_cbc_hw SHOULD NOT be called directly. - Developers should still call _aes_decrypt_cbc (in aes_modes_asm.s) which will poll cpu_capabilities and branch - to this aesni-based function should it detect that aesni is available. - Blindly calling this function will SURELY cause a CRASH on systems with no aesni support. - - Note that the aes_decrypt of one block does not depend on the result of any other block. - This gives the opportunity to arrange aes_decrypt operations in parallel to speed up the code. - This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55) - The following assembly code exploits this idea to achieve a ~1.4x speedup in aes_decrypt_cbc. 
- - Example C code for packing 4 blocks in an iteration is shown as follows: - - while ((num_blk-=4)>=0) { - - // the following 4 functions can be interleaved to exploit parallelism - aes_decrypt(ibuf, obuf, ctx); - aes_decrypt(ibuf+1, obuf+1, ctx); - aes_decrypt(ibuf+2, obuf+2, ctx); - aes_decrypt(ibuf+3, obuf+3, ctx); - - obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; - *iv = ibuf[3]; ibuf += 4; obuf += 4; - } - num_blk+=4; - - ----------------------------------------------------------------------------------------------------------------*/ - - .text - .align 4,0x90 - .globl _aes_decrypt_cbc_hw -_aes_decrypt_cbc_hw: - - // push/save registers for local use -#if defined __i386__ - - push %ebp - movl %esp, %ebp - push %ebx // ibuf - push %edi // obuf - - #define sp %esp - -#else // __x86_64__ - - push %rbp - mov %rsp, %rbp - push %rbx - push %r13 - push %r14 - push %r15 - - #define sp %rsp - -#endif - - - // if kernel, allocate stack space to save xmm registers -#ifdef KERNEL -#if defined __i386__ - sub $(8*16), %esp -#else - sub $(16*16), %rsp -#endif - movaps %xmm0, (sp) - movaps %xmm1, 16(sp) - movaps %xmm2, 32(sp) - movaps %xmm3, 48(sp) - movaps %xmm4, 64(sp) - movaps %xmm5, 80(sp) - movaps %xmm6, 96(sp) - movaps %xmm7, 112(sp) -#if defined __x86_64__ - movaps %xmm8, 16*8(sp) - movaps %xmm9, 16*9(sp) - movaps %xmm10, 16*10(sp) - movaps %xmm11, 16*11(sp) - movaps %xmm12, 16*12(sp) - movaps %xmm13, 16*13(sp) - movaps %xmm14, 16*14(sp) - movaps %xmm15, 16*15(sp) -#endif // __x86_64__ -#endif - - #undef iv - #define iv %xmm0 - -#if defined __i386__ - mov 12(%ebp), %eax // in_iv - mov 24(%ebp), %edx // ctx - movups (%eax), iv // iv = in_iv - mov 8(%ebp), %ebx // ibuf - mov 16(%ebp), %ecx // num_blk - mov 20(%ebp), %edi // obuf - - #define ibuf %ebx - #define obuf %edi - #define num_blk %ecx - #define ctx %edx - -#else // __x86_64__, rdi/rsi/rdx/rcx/r8 - - mov %rdi, %rbx // ibuf - movups (%rsi), iv // iv = in_iv - mov %rdx, %r13 // 
num_blk - mov %rcx, %r14 // obuf - mov %r8, %r15 // ctx - - #define ibuf %rbx - #define num_blk %r13d - #define obuf %r14 - #define ctx %r15 - -#endif - - mov 240(ctx), %eax // aes length - cmp $160, %eax // aes-128 decrypt - je L_decrypt_128 - cmp $192, %eax // aes-192 decrypt - je L_decrypt_192 - cmp $224, %eax // aes-256 decrypt - je L_decrypt_256 - - mov $-1, %eax // wrong aes length, to return -1 - jmp L_error // early exit due to wrong aes length - - - // - // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_decrypt_128: - - cmp $1, num_blk - jl L_HW_cbc_done // if num_blk < 1, early return - - // aes-128 decrypt expanded keys - movups 160(ctx), %xmm3 - movups 144(ctx), %xmm4 - movups 128(ctx), %xmm5 - movups 112(ctx), %xmm6 - movups 96(ctx), %xmm7 -#if defined __x86_64__ - movups 80(ctx), %xmm8 - movups 64(ctx), %xmm9 - movups 48(ctx), %xmm10 - movups 32(ctx), %xmm11 - movups 16(ctx), %xmm12 - movups 0(ctx), %xmm13 -#endif - - // performs 4 block decryption in an iteration to exploit decrypt in parallel - - // while ((num_blk-=4)>=0) { - // aes_decrypt(ibuf, obuf, ctx); - // aes_decrypt(ibuf+1, obuf+1, ctx); - // aes_decrypt(ibuf+2, obuf+2, ctx); - // aes_decrypt(ibuf+3, obuf+3, ctx); - // obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; - // *iv = ibuf[3]; ibuf += 4; obuf += 4; - // } - - sub $4, num_blk // pre decrement num_blk by 4 - jl 9f // if num_blk < 4, skip the per-4-blocks processing code - -0: - - -#if defined __x86_64__ - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm14 // tmp = 3rd ibuf - movups 48(ibuf), %xmm15 // tmp = 4th ibuf - - // for x86_64, the expanded keys are already stored in xmm3-xmm13 - - // aes-128 decrypt round 0 per 4 blocks - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm14 - pxor %xmm3, %xmm15 - - // aes-128 decrypt round 1 per 4 blocks - aesdec %xmm4, %xmm1 - aesdec %xmm4, %xmm2 - aesdec %xmm4, 
%xmm14 - aesdec %xmm4, %xmm15 - - // aes-128 decrypt round 2 per 4 blocks - aesdec %xmm5, %xmm1 - aesdec %xmm5, %xmm2 - aesdec %xmm5, %xmm14 - aesdec %xmm5, %xmm15 - - // aes-128 decrypt round 3 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm14 - aesdec %xmm6, %xmm15 - - // aes-128 decrypt round 4 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm14 - aesdec %xmm7, %xmm15 - - // aes-128 decrypt round 5 per 4 blocks - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm14 - aesdec %xmm8, %xmm15 - - // aes-128 decrypt round 6 per 4 blocks - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm14 - aesdec %xmm9, %xmm15 - - // aes-128 decrypt round 7 per 4 blocks - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm14 - aesdec %xmm10, %xmm15 - - // aes-128 decrypt round 8 per 4 blocks - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm14 - aesdec %xmm11, %xmm15 - - // aes-128 decrypt round 9 per 4 blocks - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - - // aes-128 decrypt round 10 (last) per 4 blocks - aesdeclast %xmm13, %xmm1 - aesdeclast %xmm13, %xmm2 - aesdeclast %xmm13, %xmm14 - aesdeclast %xmm13, %xmm15 - - pxor iv, %xmm1 // obuf[0] ^= *iv; - movups (ibuf), iv // ibuf[0] - pxor iv, %xmm2 // obuf[1] ^= ibuf[0]; - movups 16(ibuf), iv // ibuf[1] - pxor iv, %xmm14 // obuf[2] ^= ibuf[1]; - movups 32(ibuf), iv // ibuf[2] - pxor iv, %xmm15 // obuf[3] ^= obuf[2]; - movups 48(ibuf), iv // *iv = ibuf[3] - - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm14, 32(obuf) // write 3rd obuf - movups %xmm15, 48(obuf) // write 4th obuf - - -#else - - // aes_decrypt_cbc per 4 blocks using aes-128 for i386 - // xmm1/xmm2/xmm4/xmm5 used for obuf per block - // xmm3 = key0 - // xmm0 = iv - // xmm6/xmm7 dynamically load with other expanded keys - - movups (ibuf), %xmm1 // tmp = 1st ibuf - 
movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm4 // tmp = 3rd ibuf - movups 48(ibuf), %xmm5 // tmp = 4th ibuf - - // aes_decrypt - // for i386, sequentially load expanded keys into xmm6/xmm7 - - movups 144(ctx), %xmm6 // key1 - - // aes-128 decrypt round 0 per 4 blocks - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm4 - pxor %xmm3, %xmm5 - - movups 128(ctx), %xmm7 // key2 - - // aes-128 decrypt round 1 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 112(ctx), %xmm6 // key3 - - // aes-128 decrypt round 2 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 96(ctx), %xmm7 // key4 - - // aes-128 decrypt round 3 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 80(ctx), %xmm6 // key5 - - // aes-128 decrypt round 4 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 64(ctx), %xmm7 // key6 - - // aes-128 decrypt round 5 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 48(ctx), %xmm6 // key7 - - // aes-128 decrypt round 6 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 32(ctx), %xmm7 // key8 - - // aes-128 decrypt round 7 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 16(ctx), %xmm6 // key9 - - // aes-128 decrypt round 8 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 0(ctx), %xmm7 // keyA - - // aes-128 decrypt round 9 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - // aes-128 decrypt round 10 (last) per 4 blocks - aesdeclast %xmm7, %xmm1 - aesdeclast %xmm7, %xmm2 - aesdeclast %xmm7, %xmm4 - 
aesdeclast %xmm7, %xmm5 - - pxor iv, %xmm1 // 1st obuf ^= iv; - movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm2 // 2nd obuf ^= iv; - movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm4 // 3rd obuf ^= iv; - movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm5 // 4th obuf ^= iv; - movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); - - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm4, 32(obuf) // write 3rd obuf - movups %xmm5, 48(obuf) // write 4th obuf -#endif - - add $64, ibuf // ibuf += 4; - add $64, obuf // obuf += 4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk > 0, repeat the loop - -9: add $4, num_blk // post incremtn num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code - -#if defined __i386__ - // updated as they might be needed as expanded keys in the remaining - movups 144(ctx), %xmm4 - movups 128(ctx), %xmm5 - movups 112(ctx), %xmm6 - movups 96(ctx), %xmm7 -#endif - - test $2, num_blk // check whether num_blk has 2 blocks - je 9f // if num_blk & 2 == 0, skip the per-pair processing code - - // do the remaining 2 blocks together - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - - // aes_decrypt - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - aesdec %xmm4, %xmm1 - aesdec %xmm4, %xmm2 - aesdec %xmm5, %xmm1 - aesdec %xmm5, %xmm2 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 -#if defined __x86_64__ - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdeclast %xmm13, %xmm1 - aesdeclast %xmm13, %xmm2 -#else - movups 80(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - movups 64(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, 
%xmm2 - movups 48(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - movups 32(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - movups 16(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - movups 0(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdeclast %xmm7, %xmm1 - aesdeclast %xmm7, %xmm2 - movups 112(ctx), %xmm6 - movups 96(ctx), %xmm7 -#endif - - pxor iv, %xmm1 // obuf[0] ^= *iv; - movups (ibuf), iv // ibuf[0] - pxor iv, %xmm2 // obuf[1] ^= ibuf[0] - movups 16(ibuf), iv // *iv = ibuf[1] - - movups %xmm1, (obuf) // write obuf[0] - movups %xmm2, 16(obuf) // write obuf[1] - - add $32, ibuf // ibuf += 2 - add $32, obuf // obuf += 2 - -9: - test $1, num_blk // check whether num_blk has residual 1 block - je L_HW_cbc_done // if num_blk == 0, no need for residual processing code - - movups (ibuf), %xmm2 // tmp = ibuf - // aes_decrypt - pxor %xmm3, %xmm2 - aesdec %xmm4, %xmm2 - aesdec %xmm5, %xmm2 - aesdec %xmm6, %xmm2 - aesdec %xmm7, %xmm2 -#if defined __x86_64__ - aesdec %xmm8, %xmm2 - aesdec %xmm9, %xmm2 - aesdec %xmm10, %xmm2 - aesdec %xmm11, %xmm2 - aesdec %xmm12, %xmm2 - aesdeclast %xmm13, %xmm2 -#else - movups 80(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 64(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 48(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 32(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 16(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups (ctx), %xmm1 - aesdeclast %xmm1, %xmm2 -#endif - - pxor iv, %xmm2 // *obuf ^= *iv; - movups (ibuf), iv // *iv = *ibuf; - movups %xmm2, (obuf) // write *obuf - - jmp L_HW_cbc_done - - // - // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_decrypt_192: - - cmp $1, num_blk - jl L_HW_cbc_done // if num_blk < 1, early return - - // aes-192 decryp expanded keys - movups 192(ctx), %xmm3 - movups 176(ctx), %xmm4 - movups 160(ctx), %xmm5 - movups 144(ctx), %xmm6 - movups 128(ctx), %xmm7 -#if defined __x86_64__ - movups 112(ctx), %xmm8 - movups 96(ctx), %xmm9 - movups 
80(ctx), %xmm10 - movups 64(ctx), %xmm11 - movups 48(ctx), %xmm12 - movups 32(ctx), %xmm13 - movups 16(ctx), %xmm14 - movups (ctx), %xmm15 -#endif - - // performs 4 block decryption in an iteration to exploit decrypt in parallel - - // while ((num_blk-=4)>=0) { - // aes_decrypt(ibuf, obuf, ctx); - // aes_decrypt(ibuf+1, obuf+1, ctx); - // aes_decrypt(ibuf+2, obuf+2, ctx); - // aes_decrypt(ibuf+3, obuf+3, ctx); - // obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; - // *iv = ibuf[3]; ibuf += 4; obuf += 4; - // } - - sub $4, num_blk // pre decrement num_blk by 4 - jl 9f // if num_blk < 4, skip the per-4-blocks processing code -0: - -#if defined __x86_64__ - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm14 // tmp = 3rd ibuf - movups 48(ibuf), %xmm15 // tmp = 4th ibuf - - // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13 - // use %xmm12/%xmm13 ts dynamic keys in the middle, restored afterwards - - // round 0 for 4 blocks - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm14 - pxor %xmm3, %xmm15 - - // round 1 for 4 blocks - aesdec %xmm4, %xmm1 - aesdec %xmm4, %xmm2 - aesdec %xmm4, %xmm14 - aesdec %xmm4, %xmm15 - - // round 2 for 4 blocks - aesdec %xmm5, %xmm1 - aesdec %xmm5, %xmm2 - aesdec %xmm5, %xmm14 - aesdec %xmm5, %xmm15 - - // round 3 for 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm14 - aesdec %xmm6, %xmm15 - - // round 4 for 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm14 - aesdec %xmm7, %xmm15 - - // round 5 for 4 blocks - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm14 - aesdec %xmm8, %xmm15 - - // round 6 for 4 blocks - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm14 - aesdec %xmm9, %xmm15 - - // round 7 for 4 blocks - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm14 - aesdec %xmm10, %xmm15 - - // round 8 for 4 blocks - aesdec 
%xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm14 - aesdec %xmm11, %xmm15 - - // round 9 for 4 blocks - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - - movups 16(ctx), %xmm12 - - // round A for 4 blocks - aesdec %xmm13, %xmm1 - aesdec %xmm13, %xmm2 - aesdec %xmm13, %xmm14 - aesdec %xmm13, %xmm15 - - movups (ctx), %xmm13 - - // round B for 4 blocks - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - - movups 48(ctx), %xmm12 // restore %xmm12 to its original key - - // round C (last) for 4 blocks - aesdeclast %xmm13, %xmm1 - aesdeclast %xmm13, %xmm2 - aesdeclast %xmm13, %xmm14 - aesdeclast %xmm13, %xmm15 - - movups 32(ctx), %xmm13 // restore %xmm13 to its original key - - pxor iv, %xmm1 // obuf[0] ^= *iv; - movups (ibuf), iv // ibuf[0] - pxor iv, %xmm2 // obuf[1] ^= ibuf[0] - movups 16(ibuf), iv // ibuf[1] - pxor iv, %xmm14 // obuf[2] ^= ibuf[1] - movups 32(ibuf), iv // ibuf[2] - pxor iv, %xmm15 // obuf[3] ^= ibuf[2] - movups 48(ibuf), iv // *iv = ibuf[3] - - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm14, 32(obuf) // write 3rd obuf - movups %xmm15, 48(obuf) // write 4th obuf - - add $64, ibuf // ibuf += 4; - add $64, obuf // obuf += 4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk > 0, repeat the loop - -9: add $4, num_blk // post incremtn num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, prepare to return - - movups 16(ctx), %xmm14 // restore %xmm14 to its key - movups (ctx), %xmm15 // restore %xmm15 to its key - -#else - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm4 // tmp = 3rd ibuf - movups 48(ibuf), %xmm5 // tmp = 4th ibuf - - // aes_decrypt - // for i386, sequentially load expanded keys into xmm6/xmm7 - movups 176(ctx), %xmm6 - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm4 - pxor %xmm3, %xmm5 - - movups 160(ctx), %xmm7 - 
aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 144(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 128(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 112(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 96(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 80(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 64(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 48(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 32(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 16(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 0(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - aesdeclast %xmm7, %xmm1 - aesdeclast %xmm7, %xmm2 - aesdeclast %xmm7, %xmm4 - aesdeclast %xmm7, %xmm5 - - pxor iv, %xmm1 // 1st obuf ^= iv; - movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm2 // 2nd obuf ^= iv; - movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm4 // 3rd obuf ^= iv; - movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm5 // 4th obuf ^= iv; - movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm4, 32(obuf) // write 3rd obuf - movups %xmm5, 48(obuf) // write 4th obuf - - add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; - add $64, obuf // obuf += AES_BLOCK_SIZE * 4; - - sub 
$4, num_blk // num_blk -= 4 - jge 0b // if num_blk > 0, repeat the loop - - -9: add $4, num_blk // post incremtn num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code - - movups 176(ctx), %xmm4 - movups 160(ctx), %xmm5 - movups 144(ctx), %xmm6 - movups 128(ctx), %xmm7 - -#endif - - // per-block aes_decrypt_cbc loop - -0: - movups (ibuf), %xmm2 // tmp = ibuf - - // aes_decrypt - pxor %xmm3, %xmm2 - aesdec %xmm4, %xmm2 - aesdec %xmm5, %xmm2 - aesdec %xmm6, %xmm2 - aesdec %xmm7, %xmm2 -#if defined __x86_64__ - aesdec %xmm8, %xmm2 - aesdec %xmm9, %xmm2 - aesdec %xmm10, %xmm2 - aesdec %xmm11, %xmm2 - aesdec %xmm12, %xmm2 - aesdec %xmm13, %xmm2 - aesdec %xmm14, %xmm2 - aesdeclast %xmm15, %xmm2 -#else - movups 112(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 96(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 80(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 64(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 48(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 32(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 16(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups (ctx), %xmm1 - aesdeclast %xmm1, %xmm2 -#endif - - pxor iv, %xmm2 // obuf ^= iv; - movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - - movups %xmm2, (obuf) // write obuf - - add $16, ibuf // ibuf += AES_BLOCK_SIZE; - add $16, obuf // obuf += AES_BLOCK_SIZE; - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop - - jmp L_HW_cbc_done - - // - // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_decrypt_256: - - cmp $1, num_blk - jl L_HW_cbc_done - - movups 224(ctx), %xmm3 - movups 208(ctx), %xmm4 - movups 192(ctx), %xmm5 - movups 176(ctx), %xmm6 - movups 160(ctx), %xmm7 -#if defined __x86_64__ - movups 144(ctx), %xmm8 - movups 128(ctx), %xmm9 - movups 112(ctx), %xmm10 - movups 96(ctx), %xmm11 - movups 80(ctx), %xmm12 - movups 64(ctx), %xmm13 - movups 48(ctx), %xmm14 - movups 32(ctx), %xmm15 -// movups 16(ctx), %xmm14 -// movups (ctx), %xmm15 -#endif - -#if 
defined __x86_64__ - - sub $4, num_blk // pre decrement num_blk by 4 - jl 9f // if num_blk < 4, skip the per-4-blocks processing code -0: - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm14 // tmp = 3rd ibuf - movups 48(ibuf), %xmm15 // tmp = 4th ibuf - - // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13 - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm14 - pxor %xmm3, %xmm15 - - aesdec %xmm4, %xmm1 - aesdec %xmm4, %xmm2 - aesdec %xmm4, %xmm14 - aesdec %xmm4, %xmm15 - - aesdec %xmm5, %xmm1 - aesdec %xmm5, %xmm2 - aesdec %xmm5, %xmm14 - aesdec %xmm5, %xmm15 - - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm14 - aesdec %xmm6, %xmm15 - - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm14 - aesdec %xmm7, %xmm15 - - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm14 - aesdec %xmm8, %xmm15 - - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm14 - aesdec %xmm9, %xmm15 - - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm14 - aesdec %xmm10, %xmm15 - - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm14 - aesdec %xmm11, %xmm15 - - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - movups 48(ctx), %xmm12 - - aesdec %xmm13, %xmm1 - aesdec %xmm13, %xmm2 - aesdec %xmm13, %xmm14 - aesdec %xmm13, %xmm15 - movups 32(ctx), %xmm13 - - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - movups 16(ctx), %xmm12 - - aesdec %xmm13, %xmm1 - aesdec %xmm13, %xmm2 - aesdec %xmm13, %xmm14 - aesdec %xmm13, %xmm15 - movups (ctx), %xmm13 - - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - movups 80(ctx), %xmm12 - - aesdeclast %xmm13, %xmm1 - aesdeclast %xmm13, %xmm2 - aesdeclast %xmm13, %xmm14 - aesdeclast %xmm13, %xmm15 - movups 64(ctx), %xmm13 - - pxor iv, %xmm1 // obuf ^= iv; - 
movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm2 // obuf ^= iv; - movups 16(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm14 // obuf ^= iv; - movups 32(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm15 // obuf ^= iv; - movups 48(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm14, 32(obuf) // write 3rd obuf - movups %xmm15, 48(obuf) // write 4th obuf - - add $64, ibuf // ibuf += AES_BLOCK_SIZE*4; - add $64, obuf // obuf += AES_BLOCK_SIZE*4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk > 0, repeat the loop - -9: add $4, num_blk // post incremtn num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code - - movups 48(ctx), %xmm14 - movups 32(ctx), %xmm15 - -#else - - sub $4, num_blk // pre decrement num_blk by 4 - jl 9f // if num_blk < 4, skip the per-pair processing code -0: - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm4 // tmp = 3rd ibuf - movups 48(ibuf), %xmm5 // tmp = 4th ibuf - - // aes_decrypt - // for i386, sequentially load expanded keys into xmm6/xmm7 - movups 208(ctx), %xmm6 - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm4 - pxor %xmm3, %xmm5 - - movups 192(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 176(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 160(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 144(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 128(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 112(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 
- aesdec %xmm7, %xmm5 - - movups 96(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 80(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 64(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 48(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 32(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 16(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 0(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - aesdeclast %xmm7, %xmm1 - aesdeclast %xmm7, %xmm2 - aesdeclast %xmm7, %xmm4 - aesdeclast %xmm7, %xmm5 - - pxor iv, %xmm1 // 1st obuf ^= iv; - movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm2 // 2nd obuf ^= iv; - movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm4 // 3rd obuf ^= iv; - movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm5 // 4th obuf ^= iv; - movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm4, 32(obuf) // write 3rd obuf - movups %xmm5, 48(obuf) // write 4th obuf - - add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; - add $64, obuf // obuf += AES_BLOCK_SIZE * 4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk > 0, repeat the loop - - -9: add $4, num_blk // post incremtn num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code - - movups 208(ctx), %xmm4 - movups 192(ctx), %xmm5 - movups 176(ctx), %xmm6 - movups 160(ctx), %xmm7 - -#endif - -0: - movups (ibuf), %xmm2 // tmp = ibuf - - // aes_decrypt - pxor %xmm3, %xmm2 - aesdec 
%xmm4, %xmm2 - aesdec %xmm5, %xmm2 - aesdec %xmm6, %xmm2 - aesdec %xmm7, %xmm2 -#if defined __x86_64__ - aesdec %xmm8, %xmm2 - aesdec %xmm9, %xmm2 - aesdec %xmm10, %xmm2 - aesdec %xmm11, %xmm2 - aesdec %xmm12, %xmm2 - aesdec %xmm13, %xmm2 - aesdec %xmm14, %xmm2 - aesdec %xmm15, %xmm2 -#else - movups 144(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 128(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 112(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 96(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 80(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 64(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 48(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 32(ctx), %xmm1 - aesdec %xmm1, %xmm2 -#endif - movups 16(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups (ctx), %xmm1 - aesdeclast %xmm1, %xmm2 - - pxor iv, %xmm2 // obuf ^= iv; - movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - - movups %xmm2, (obuf) // write obuf - - add $16, ibuf // ibuf += AES_BLOCK_SIZE; - add $16, obuf // obuf += AES_BLOCK_SIZE; - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop - - jmp L_HW_cbc_done - - // - // --------- END of aes_decrypt_cbc_hw ------------------- - // +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. 
+ + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 31/01/2006 + + These subroutines implement multiple block AES modes for ECB, CBC, CFB, + OFB and CTR encryption, The code provides support for the VIA Advanced + Cryptography Engine (ACE). + + NOTE: In the following subroutines, the AES contexts (ctx) must be + 16 byte aligned if VIA ACE is being used +*/ + +/* ---------------------------------------------------------------------------------------------------------------- + + aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) : + + For simplicity, I am assuming all variables are in 128-bit data type. + + aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx) + { + while(num_blk--) { + *iv ^= *ibuf++; + aes_encrypt(iv, iv, ctx); + *obuf++ = *iv; + } + return 0; + } + + The following is an implementation of this function using Intel AESNI. + This function _aes_encrypt_cbc_hw SHOULD NOT be called directly. + Developers should still call _aes_encrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch + to this aesni-based function should it detect that aesni is available. + Calling this function blindly WILL cause a CRASH on systems with no aesni support. + + Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks + are serially chained. This prevents us from arranging several blocks for encryption in parallel. 
+ + ----------------------------------------------------------------------------------------------------------------*/ + + .text + .align 4,0x90 + .globl _aes_encrypt_cbc_hw +_aes_encrypt_cbc_hw: + + // push/save registers for local use +#if defined __i386__ + + push %ebp + movl %esp, %ebp + push %ebx // will hold ibuf + push %edi // will hold obuf + + #define sp %esp + +#else // __x86_64__ + + push %rbp + mov %rsp, %rbp + push %rbx // will hold ibuf + push %r13 // will hold num_blk + push %r14 // will hold obuf + push %r15 // will hold ctx + + #define sp %rsp + +#endif + + // if this is kernel code, need to save used xmm registers +#ifdef KERNEL + +#if defined __i386__ + sub $(8*16), %esp // for possible xmm0-xmm7 save/restore +#else + sub $(16*16), %rsp // xmm0-xmm15 save/restore +#endif + + movaps %xmm0, (sp) // NOTE(review): movaps needs a 16-byte aligned sp here; presumably guaranteed by the ABI stack alignment plus the pushes above -- confirm + movaps %xmm1, 16(sp) + movaps %xmm2, 32(sp) + movaps %xmm3, 48(sp) + movaps %xmm4, 64(sp) + movaps %xmm5, 80(sp) + movaps %xmm6, 96(sp) + movaps %xmm7, 112(sp) +#if defined __x86_64__ + movaps %xmm8, 16*8(sp) + movaps %xmm9, 16*9(sp) + movaps %xmm10, 16*10(sp) + movaps %xmm11, 16*11(sp) + movaps %xmm12, 16*12(sp) + movaps %xmm13, 16*13(sp) + movaps %xmm14, 16*14(sp) + movaps %xmm15, 16*15(sp) +#endif // __x86_64__ + +#endif // KERNEL + + #define iv %xmm0 + +#ifdef __i386__ + + mov 12(%ebp), %eax // in_iv + mov 24(%ebp), %edx // ctx + movups (%eax), iv // iv = in_iv (kept in %xmm0; this function never writes the updated iv back to in_iv) + mov 8(%ebp), %ebx // ibuf + mov 16(%ebp), %ecx // num_blk + mov 20(%ebp), %edi // obuf + + #define ibuf %ebx + #define obuf %edi + #define num_blk %ecx + #define ctx %edx + +#else + + mov %rdi, %rbx // ibuf + movups (%rsi), iv // iv = in_iv (kept in %xmm0; this function never writes the updated iv back to in_iv) + mov %rdx, %r13 // num_blk + mov %rcx, %r14 // obuf + mov %r8, %r15 // ctx + + #define ibuf %rbx + #define num_blk %r13d + #define obuf %r14 + #define ctx %r15 + +#endif + + mov 240(ctx), %eax // aes length field of ctx: 160/192/224 selects aes-128/192/256 below + cmp $160, %eax // aes-128 encrypt ? + je L_encrypt_128 + cmp $192, %eax // aes-192 encrypt ? + je L_encrypt_192 + cmp $224, %eax // aes-256 encrypt ? 
+ je L_encrypt_256 + mov $-1, %eax // unrecognized aes length: return -1 + jmp L_error // L_error is past the xor that sets the return value to 0 + + // + // aes-128 encrypt_cbc operation, up to L_HW_cbc_done + // + +L_encrypt_128: + + cmp $1, num_blk // check number of blocks + jl L_HW_cbc_done // should it be less than 1, nothing to do + + movups (ctx), %xmm2 // key0 + movups 16(ctx), %xmm3 // key1 + movups 32(ctx), %xmm4 // key2 + movups 48(ctx), %xmm5 // key3 + movups 64(ctx), %xmm6 // key4 + movups 80(ctx), %xmm7 // key5 +#if defined __x86_64__ + movups 96(ctx), %xmm8 // key6 + movups 112(ctx), %xmm9 // key7 + movups 128(ctx), %xmm10 // key8 + movups 144(ctx), %xmm11 // key9 + movups 160(ctx), %xmm12 // keyA +#endif + + // while (num_blk--) { + // *iv ^= *ibuf++; + // aes_encrypt(iv, iv, ctx); + // *obuf++ = *iv; + // } +0: + movups (ibuf), %xmm1 // *ibuf + pxor %xmm2, iv // key0 xor, the 1st step of aes_encrypt (commutes with the xor below) + pxor %xmm1, iv // *iv ^= *ibuf + + // finishing up the rest of aes_encrypt + aesenc %xmm3, iv + aesenc %xmm4, iv + aesenc %xmm5, iv + aesenc %xmm6, iv + aesenc %xmm7, iv +#if defined __x86_64__ + aesenc %xmm8, iv + aesenc %xmm9, iv + aesenc %xmm10, iv + aesenc %xmm11, iv + aesenclast %xmm12, iv // last round with keyA +#else + movups 96(ctx), %xmm1 // key6 + aesenc %xmm1, iv + movups 112(ctx), %xmm1 // key7 + aesenc %xmm1, iv + movups 128(ctx), %xmm1 // key8 + aesenc %xmm1, iv + movups 144(ctx), %xmm1 // key9 + aesenc %xmm1, iv + movups 160(ctx), %xmm1 // keyA + aesenclast %xmm1, iv +#endif + + movups iv, (obuf) // *obuf = *iv; + add $16, obuf // obuf++; + add $16, ibuf // ibuf++; + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + // the following will be branched to from all other cases (encrypt/decrypt 128/192/256) + +L_HW_cbc_done: + + xor %eax, %eax // to return CRYPT_OK + +L_error: + + // if kernel, restore xmm registers +#ifdef KERNEL + movaps 0(sp), %xmm0 + movaps 16(sp), %xmm1 + movaps 32(sp), %xmm2 + movaps 48(sp), %xmm3 + movaps 64(sp), %xmm4 + movaps 80(sp), %xmm5 + movaps 96(sp), %xmm6 + movaps 112(sp), %xmm7 +#if 
defined __x86_64__ + movaps 16*8(sp), %xmm8 + movaps 16*9(sp), %xmm9 + movaps 16*10(sp), %xmm10 + movaps 16*11(sp), %xmm11 + movaps 16*12(sp), %xmm12 + movaps 16*13(sp), %xmm13 + movaps 16*14(sp), %xmm14 + movaps 16*15(sp), %xmm15 +#endif // __x86_64__ +#endif // KERNEL + + // release used stack memory, restore used callee-saved registers, and return +#if defined __i386__ +#ifdef KERNEL + add $(8*16), %esp +#endif + pop %edi + pop %ebx +#else +#ifdef KERNEL + add $(16*16), %rsp +#endif + pop %r15 + pop %r14 + pop %r13 + pop %rbx +#endif + leave // restore the stack pointer from the frame pointer and pop the saved frame pointer + ret + + // + // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_encrypt_192: + + cmp $1, num_blk // check number of blocks + jl L_HW_cbc_done // should it be less than 1, nothing to do + + movups (ctx), %xmm2 // key0 + movups 16(ctx), %xmm3 // key1 + movups 32(ctx), %xmm4 // key2 + movups 48(ctx), %xmm5 // key3 + movups 64(ctx), %xmm6 // key4 + movups 80(ctx), %xmm7 // key5 +#if defined __x86_64__ + movups 96(ctx), %xmm8 // key6 + movups 112(ctx), %xmm9 // key7 + movups 128(ctx), %xmm10 // key8 + movups 144(ctx), %xmm11 // key9 + movups 160(ctx), %xmm12 // keyA + movups 176(ctx), %xmm13 // keyB + movups 192(ctx), %xmm14 // keyC +#endif + + // while (num_blk--) { + // *iv ^= *ibuf++; + // aes_encrypt(iv, iv, ctx); + // *obuf++ = *iv; + // } +0: + movups (ibuf), %xmm1 // *ibuf + pxor %xmm1, iv // *iv ^= *ibuf + + // aes_encrypt(iv, iv, ctx); + + pxor %xmm2, iv // key0 xor, the 1st step of aes_encrypt + aesenc %xmm3, iv + aesenc %xmm4, iv + aesenc %xmm5, iv + aesenc %xmm6, iv + aesenc %xmm7, iv +#if defined __x86_64__ + aesenc %xmm8, iv + aesenc %xmm9, iv + aesenc %xmm10, iv + aesenc %xmm11, iv + aesenc %xmm12, iv + aesenc %xmm13, iv + aesenclast %xmm14, iv // last round with keyC +#else + movups 96(ctx), %xmm1 // key6 + aesenc %xmm1, iv + movups 112(ctx), %xmm1 // key7 + aesenc %xmm1, iv + movups 128(ctx), %xmm1 // key8 + aesenc %xmm1, iv + movups 144(ctx), %xmm1 // key9 + aesenc %xmm1, iv + movups 160(ctx), %xmm1 // keyA + aesenc %xmm1, iv + movups 176(ctx), %xmm1 // keyB + aesenc %xmm1, iv + movups 
192(ctx), %xmm1 // keyC + aesenclast %xmm1, iv +#endif + + movups iv, (obuf) // *obuf = *iv; + add $16, ibuf // ibuf++ + add $16, obuf // obuf++ + + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + jmp L_HW_cbc_done // share with the common exit code + + // + // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_encrypt_256: + + cmp $1, num_blk // check number of blocks + jl L_HW_cbc_done // should it be less than 1, nothing to do + + movups (ctx), %xmm2 // key0 + movups 16(ctx), %xmm3 // key1 + movups 32(ctx), %xmm4 // key2 + movups 48(ctx), %xmm5 // key3 + movups 64(ctx), %xmm6 // key4 + movups 80(ctx), %xmm7 // key5 +#if defined __x86_64__ + movups 96(ctx), %xmm8 // key6 + movups 112(ctx), %xmm9 // key7 + movups 128(ctx), %xmm10 // key8 + movups 144(ctx), %xmm11 // key9 + movups 160(ctx), %xmm12 // keyA + movups 176(ctx), %xmm13 // keyB + movups 192(ctx), %xmm14 // keyC + movups 208(ctx), %xmm15 // keyD + // keyE (224(ctx)) is not preloaded here: it is loaded into %xmm1 inside the loop, after %xmm1 has been consumed as *ibuf +#endif + + // while (num_blk--) { + // *iv ^= *ibuf++; + // aes_encrypt(iv, iv, ctx); + // *obuf++ = *iv; + // } +0: + movups (ibuf), %xmm1 // *ibuf + pxor %xmm1, iv // *iv ^= *ibuf + + // aes_encrypt(iv, iv, ctx); + pxor %xmm2, iv // key0 xor, the 1st step of aes_encrypt + aesenc %xmm3, iv + aesenc %xmm4, iv + aesenc %xmm5, iv + aesenc %xmm6, iv + aesenc %xmm7, iv +#if defined __x86_64__ + movups 224(ctx), %xmm1 // keyE + aesenc %xmm8, iv + aesenc %xmm9, iv + aesenc %xmm10, iv + aesenc %xmm11, iv + aesenc %xmm12, iv + aesenc %xmm13, iv + aesenc %xmm14, iv + aesenc %xmm15, iv + aesenclast %xmm1, iv // last round with keyE +#else + movups 96(ctx), %xmm1 // key6 + aesenc %xmm1, iv + movups 112(ctx), %xmm1 // key7 + aesenc %xmm1, iv + movups 128(ctx), %xmm1 // key8 + aesenc %xmm1, iv + movups 144(ctx), %xmm1 // key9 + aesenc %xmm1, iv + movups 160(ctx), %xmm1 // keyA + aesenc %xmm1, iv + movups 176(ctx), %xmm1 // keyB + aesenc %xmm1, iv + movups 192(ctx), %xmm1 // keyC + aesenc %xmm1, iv + movups 208(ctx), %xmm1 // keyD + aesenc %xmm1, iv + 
movups 224(ctx), %xmm1 // keyE + aesenclast %xmm1, iv +#endif + + movups iv, (obuf) // *obuf = *iv; + add $16, ibuf // ibuf++ + add $16, obuf // obuf++ + + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + jmp L_HW_cbc_done // share with the common exit code + + + + // + // --------- END of aes_encrypt_cbc_hw ------------------- + // + + +/* ---------------------------------------------------------------------------------------------------------------- + + aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) : + + For simplicity, I am assuming all variables are in 128-bit data type. + + aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx) + { + while(num_blk--) { + aes_decrypt(ibuf, obuf, ctx); + *obuf++ ^= *iv; + *iv = *ibuf++; + } + return 0; + } + + The following is an implementation of this function using Intel AESNI. + This function _aes_decrypt_cbc_hw SHOULD NOT be called directly. + Developers should still call _aes_decrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch + to this aesni-based function should it detect that aesni is available. + Calling this function blindly WILL cause a CRASH on systems with no aesni support. + + Note that the decryption operation is not related over blocks. + This gives opportunity of arranging aes_decrypt operations in parallel to speed up code. + This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55) + The following assembly code exploits this idea to achieve ~ 1.4 speed up in aes_decrypt_cbc. 
+ + Example C code for packing 4 blocks in an iteration is shown as follows: + + while ((num_blk-=4)>=0) { + + // the following 4 functions can be interleaved to exploit parallelism + aes_decrypt(ibuf, obuf, ctx); + aes_decrypt(ibuf+1, obuf+1, ctx); + aes_decrypt(ibuf+2, obuf+2, ctx); + aes_decrypt(ibuf+3, obuf+3, ctx); + + obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; + *iv = ibuf[3]; ibuf += 4; obuf += 4; + } + num_blk+=4; + + ----------------------------------------------------------------------------------------------------------------*/ + + .text + .align 4,0x90 + .globl _aes_decrypt_cbc_hw +_aes_decrypt_cbc_hw: + + // push/save registers for local use +#if defined __i386__ + + push %ebp + movl %esp, %ebp + push %ebx // ibuf + push %edi // obuf + + #define sp %esp + +#else // __x86_64__ + + push %rbp + mov %rsp, %rbp + push %rbx + push %r13 + push %r14 + push %r15 + + #define sp %rsp + +#endif + + + // if kernel, allocate stack space to save xmm registers +#ifdef KERNEL +#if defined __i386__ + sub $(8*16), %esp +#else + sub $(16*16), %rsp +#endif + movaps %xmm0, (sp) + movaps %xmm1, 16(sp) + movaps %xmm2, 32(sp) + movaps %xmm3, 48(sp) + movaps %xmm4, 64(sp) + movaps %xmm5, 80(sp) + movaps %xmm6, 96(sp) + movaps %xmm7, 112(sp) +#if defined __x86_64__ + movaps %xmm8, 16*8(sp) + movaps %xmm9, 16*9(sp) + movaps %xmm10, 16*10(sp) + movaps %xmm11, 16*11(sp) + movaps %xmm12, 16*12(sp) + movaps %xmm13, 16*13(sp) + movaps %xmm14, 16*14(sp) + movaps %xmm15, 16*15(sp) +#endif // __x86_64__ +#endif + + #undef iv + #define iv %xmm0 + +#if defined __i386__ + mov 12(%ebp), %eax // in_iv + mov 24(%ebp), %edx // ctx + movups (%eax), iv // iv = in_iv + mov 8(%ebp), %ebx // ibuf + mov 16(%ebp), %ecx // num_blk + mov 20(%ebp), %edi // obuf + + #define ibuf %ebx + #define obuf %edi + #define num_blk %ecx + #define ctx %edx + +#else // __x86_64__, rdi/rsi/rdx/rcx/r8 + + mov %rdi, %rbx // ibuf + movups (%rsi), iv // iv = in_iv + mov %rdx, %r13 // 
num_blk + mov %rcx, %r14 // obuf + mov %r8, %r15 // ctx + + #define ibuf %rbx + #define num_blk %r13d + #define obuf %r14 + #define ctx %r15 + +#endif + + mov 240(ctx), %eax // aes length + cmp $160, %eax // aes-128 decrypt + je L_decrypt_128 + cmp $192, %eax // aes-192 decrypt + je L_decrypt_192 + cmp $224, %eax // aes-256 decrypt + je L_decrypt_256 + + mov $-1, %eax // wrong aes length, to return -1 + jmp L_error // early exit due to wrong aes length + + + // + // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_decrypt_128: + + cmp $1, num_blk + jl L_HW_cbc_done // if num_blk < 1, early return + + // aes-128 decrypt expanded keys + movups 160(ctx), %xmm3 + movups 144(ctx), %xmm4 + movups 128(ctx), %xmm5 + movups 112(ctx), %xmm6 + movups 96(ctx), %xmm7 +#if defined __x86_64__ + movups 80(ctx), %xmm8 + movups 64(ctx), %xmm9 + movups 48(ctx), %xmm10 + movups 32(ctx), %xmm11 + movups 16(ctx), %xmm12 + movups 0(ctx), %xmm13 +#endif + + // performs 4 block decryption in an iteration to exploit decrypt in parallel + + // while ((num_blk-=4)>=0) { + // aes_decrypt(ibuf, obuf, ctx); + // aes_decrypt(ibuf+1, obuf+1, ctx); + // aes_decrypt(ibuf+2, obuf+2, ctx); + // aes_decrypt(ibuf+3, obuf+3, ctx); + // obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; + // *iv = ibuf[3]; ibuf += 4; obuf += 4; + // } + + sub $4, num_blk // pre decrement num_blk by 4 + jl 9f // if num_blk < 4, skip the per-4-blocks processing code + +0: + + +#if defined __x86_64__ + + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm14 // tmp = 3rd ibuf + movups 48(ibuf), %xmm15 // tmp = 4th ibuf + + // for x86_64, the expanded keys are already stored in xmm3-xmm13 + + // aes-128 decrypt round 0 per 4 blocks + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm14 + pxor %xmm3, %xmm15 + + // aes-128 decrypt round 1 per 4 blocks + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, 
%xmm14 + aesdec %xmm4, %xmm15 + + // aes-128 decrypt round 2 per 4 blocks + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm14 + aesdec %xmm5, %xmm15 + + // aes-128 decrypt round 3 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm14 + aesdec %xmm6, %xmm15 + + // aes-128 decrypt round 4 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm14 + aesdec %xmm7, %xmm15 + + // aes-128 decrypt round 5 per 4 blocks + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm14 + aesdec %xmm8, %xmm15 + + // aes-128 decrypt round 6 per 4 blocks + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm14 + aesdec %xmm9, %xmm15 + + // aes-128 decrypt round 7 per 4 blocks + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm14 + aesdec %xmm10, %xmm15 + + // aes-128 decrypt round 8 per 4 blocks + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm14 + aesdec %xmm11, %xmm15 + + // aes-128 decrypt round 9 per 4 blocks + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + + // aes-128 decrypt round 10 (last) per 4 blocks + aesdeclast %xmm13, %xmm1 + aesdeclast %xmm13, %xmm2 + aesdeclast %xmm13, %xmm14 + aesdeclast %xmm13, %xmm15 + + pxor iv, %xmm1 // obuf[0] ^= *iv; + movups (ibuf), iv // ibuf[0] + pxor iv, %xmm2 // obuf[1] ^= ibuf[0]; + movups 16(ibuf), iv // ibuf[1] + pxor iv, %xmm14 // obuf[2] ^= ibuf[1]; + movups 32(ibuf), iv // ibuf[2] + pxor iv, %xmm15 // obuf[3] ^= obuf[2]; + movups 48(ibuf), iv // *iv = ibuf[3] + + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm14, 32(obuf) // write 3rd obuf + movups %xmm15, 48(obuf) // write 4th obuf + + +#else + + // aes_decrypt_cbc per 4 blocks using aes-128 for i386 + // xmm1/xmm2/xmm4/xmm5 used for obuf per block + // xmm3 = key0 + // xmm0 = iv + // xmm6/xmm7 dynamically load with other expanded keys + + movups (ibuf), %xmm1 // tmp = 1st ibuf + 
movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm4 // tmp = 3rd ibuf + movups 48(ibuf), %xmm5 // tmp = 4th ibuf + + // aes_decrypt + // for i386, sequentially load expanded keys into xmm6/xmm7 + + movups 144(ctx), %xmm6 // key1 + + // aes-128 decrypt round 0 per 4 blocks + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm4 + pxor %xmm3, %xmm5 + + movups 128(ctx), %xmm7 // key2 + + // aes-128 decrypt round 1 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 112(ctx), %xmm6 // key3 + + // aes-128 decrypt round 2 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 96(ctx), %xmm7 // key4 + + // aes-128 decrypt round 3 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 80(ctx), %xmm6 // key5 + + // aes-128 decrypt round 4 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 64(ctx), %xmm7 // key6 + + // aes-128 decrypt round 5 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 48(ctx), %xmm6 // key7 + + // aes-128 decrypt round 6 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 32(ctx), %xmm7 // key8 + + // aes-128 decrypt round 7 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 16(ctx), %xmm6 // key9 + + // aes-128 decrypt round 8 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 0(ctx), %xmm7 // keyA + + // aes-128 decrypt round 9 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + // aes-128 decrypt round 10 (last) per 4 blocks + aesdeclast %xmm7, %xmm1 + aesdeclast %xmm7, %xmm2 + aesdeclast %xmm7, %xmm4 + 
aesdeclast %xmm7, %xmm5 + + pxor iv, %xmm1 // 1st obuf ^= iv; + movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm2 // 2nd obuf ^= iv; + movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm4 // 3rd obuf ^= iv; + movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm5 // 4th obuf ^= iv; + movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); + + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm4, 32(obuf) // write 3rd obuf + movups %xmm5, 48(obuf) // write 4th obuf +#endif + + add $64, ibuf // ibuf += 4; + add $64, obuf // obuf += 4; + + sub $4, num_blk // num_blk -= 4 + jge 0b // if num_blk > 0, repeat the loop + +9: add $4, num_blk // post incremtn num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code + +#if defined __i386__ + // updated as they might be needed as expanded keys in the remaining + movups 144(ctx), %xmm4 + movups 128(ctx), %xmm5 + movups 112(ctx), %xmm6 + movups 96(ctx), %xmm7 +#endif + + test $2, num_blk // check whether num_blk has 2 blocks + je 9f // if num_blk & 2 == 0, skip the per-pair processing code + + // do the remaining 2 blocks together + + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + + // aes_decrypt + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 +#if defined __x86_64__ + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdeclast %xmm13, %xmm1 + aesdeclast %xmm13, %xmm2 +#else + movups 80(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + movups 64(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, 
%xmm2 + movups 48(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + movups 32(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + movups 16(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + movups 0(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdeclast %xmm7, %xmm1 + aesdeclast %xmm7, %xmm2 + movups 112(ctx), %xmm6 + movups 96(ctx), %xmm7 +#endif + + pxor iv, %xmm1 // obuf[0] ^= *iv; + movups (ibuf), iv // ibuf[0] + pxor iv, %xmm2 // obuf[1] ^= ibuf[0] + movups 16(ibuf), iv // *iv = ibuf[1] + + movups %xmm1, (obuf) // write obuf[0] + movups %xmm2, 16(obuf) // write obuf[1] + + add $32, ibuf // ibuf += 2 + add $32, obuf // obuf += 2 + +9: + test $1, num_blk // check whether num_blk has residual 1 block + je L_HW_cbc_done // if num_blk == 0, no need for residual processing code + + movups (ibuf), %xmm2 // tmp = ibuf + // aes_decrypt + pxor %xmm3, %xmm2 + aesdec %xmm4, %xmm2 + aesdec %xmm5, %xmm2 + aesdec %xmm6, %xmm2 + aesdec %xmm7, %xmm2 +#if defined __x86_64__ + aesdec %xmm8, %xmm2 + aesdec %xmm9, %xmm2 + aesdec %xmm10, %xmm2 + aesdec %xmm11, %xmm2 + aesdec %xmm12, %xmm2 + aesdeclast %xmm13, %xmm2 +#else + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 48(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm2 +#endif + + pxor iv, %xmm2 // *obuf ^= *iv; + movups (ibuf), iv // *iv = *ibuf; + movups %xmm2, (obuf) // write *obuf + + jmp L_HW_cbc_done + + // + // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_decrypt_192: + + cmp $1, num_blk + jl L_HW_cbc_done // if num_blk < 1, early return + + // aes-192 decryp expanded keys + movups 192(ctx), %xmm3 + movups 176(ctx), %xmm4 + movups 160(ctx), %xmm5 + movups 144(ctx), %xmm6 + movups 128(ctx), %xmm7 +#if defined __x86_64__ + movups 112(ctx), %xmm8 + movups 96(ctx), %xmm9 + movups 
80(ctx), %xmm10 + movups 64(ctx), %xmm11 + movups 48(ctx), %xmm12 + movups 32(ctx), %xmm13 + movups 16(ctx), %xmm14 + movups (ctx), %xmm15 +#endif + + // performs 4 block decryption in an iteration to exploit decrypt in parallel + + // while ((num_blk-=4)>=0) { + // aes_decrypt(ibuf, obuf, ctx); + // aes_decrypt(ibuf+1, obuf+1, ctx); + // aes_decrypt(ibuf+2, obuf+2, ctx); + // aes_decrypt(ibuf+3, obuf+3, ctx); + // obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; + // *iv = ibuf[3]; ibuf += 4; obuf += 4; + // } + + sub $4, num_blk // pre decrement num_blk by 4 + jl 9f // if num_blk < 4, skip the per-4-blocks processing code +0: + +#if defined __x86_64__ + + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm14 // tmp = 3rd ibuf + movups 48(ibuf), %xmm15 // tmp = 4th ibuf + + // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13 + // use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards + + // round 0 for 4 blocks + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm14 + pxor %xmm3, %xmm15 + + // round 1 for 4 blocks + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm14 + aesdec %xmm4, %xmm15 + + // round 2 for 4 blocks + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm14 + aesdec %xmm5, %xmm15 + + // round 3 for 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm14 + aesdec %xmm6, %xmm15 + + // round 4 for 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm14 + aesdec %xmm7, %xmm15 + + // round 5 for 4 blocks + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm14 + aesdec %xmm8, %xmm15 + + // round 6 for 4 blocks + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm14 + aesdec %xmm9, %xmm15 + + // round 7 for 4 blocks + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm14 + aesdec %xmm10, %xmm15 + + // round 8 for 4 blocks + aesdec
%xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm14 + aesdec %xmm11, %xmm15 + + // round 9 for 4 blocks + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + + movups 16(ctx), %xmm12 + + // round A for 4 blocks + aesdec %xmm13, %xmm1 + aesdec %xmm13, %xmm2 + aesdec %xmm13, %xmm14 + aesdec %xmm13, %xmm15 + + movups (ctx), %xmm13 + + // round B for 4 blocks + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + + movups 48(ctx), %xmm12 // restore %xmm12 to its original key + + // round C (last) for 4 blocks + aesdeclast %xmm13, %xmm1 + aesdeclast %xmm13, %xmm2 + aesdeclast %xmm13, %xmm14 + aesdeclast %xmm13, %xmm15 + + movups 32(ctx), %xmm13 // restore %xmm13 to its original key + + pxor iv, %xmm1 // obuf[0] ^= *iv; + movups (ibuf), iv // ibuf[0] + pxor iv, %xmm2 // obuf[1] ^= ibuf[0] + movups 16(ibuf), iv // ibuf[1] + pxor iv, %xmm14 // obuf[2] ^= ibuf[1] + movups 32(ibuf), iv // ibuf[2] + pxor iv, %xmm15 // obuf[3] ^= ibuf[2] + movups 48(ibuf), iv // *iv = ibuf[3] + + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm14, 32(obuf) // write 3rd obuf + movups %xmm15, 48(obuf) // write 4th obuf + + add $64, ibuf // ibuf += 4; + add $64, obuf // obuf += 4; + + sub $4, num_blk // num_blk -= 4 + jge 0b // if num_blk > 0, repeat the loop + +9: add $4, num_blk // post incremtn num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, prepare to return + + movups 16(ctx), %xmm14 // restore %xmm14 to its key + movups (ctx), %xmm15 // restore %xmm15 to its key + +#else + + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm4 // tmp = 3rd ibuf + movups 48(ibuf), %xmm5 // tmp = 4th ibuf + + // aes_decrypt + // for i386, sequentially load expanded keys into xmm6/xmm7 + movups 176(ctx), %xmm6 + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm4 + pxor %xmm3, %xmm5 + + movups 160(ctx), %xmm7 + 
aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 144(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 128(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 112(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 96(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 80(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 64(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 48(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 32(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 16(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 0(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + aesdeclast %xmm7, %xmm1 + aesdeclast %xmm7, %xmm2 + aesdeclast %xmm7, %xmm4 + aesdeclast %xmm7, %xmm5 + + pxor iv, %xmm1 // 1st obuf ^= iv; + movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm2 // 2nd obuf ^= iv; + movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm4 // 3rd obuf ^= iv; + movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm5 // 4th obuf ^= iv; + movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm4, 32(obuf) // write 3rd obuf + movups %xmm5, 48(obuf) // write 4th obuf + + add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; + add $64, obuf // obuf += AES_BLOCK_SIZE * 4; + + sub 
$4, num_blk // num_blk -= 4 + jge 0b // if num_blk >= 0, repeat the loop + + +9: add $4, num_blk // post increment num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, no need for further processing code + + movups 176(ctx), %xmm4 + movups 160(ctx), %xmm5 + movups 144(ctx), %xmm6 + movups 128(ctx), %xmm7 + +#endif + + // per-block aes_decrypt_cbc loop + +0: + movups (ibuf), %xmm2 // tmp = ibuf + + // aes_decrypt + pxor %xmm3, %xmm2 + aesdec %xmm4, %xmm2 + aesdec %xmm5, %xmm2 + aesdec %xmm6, %xmm2 + aesdec %xmm7, %xmm2 +#if defined __x86_64__ + aesdec %xmm8, %xmm2 + aesdec %xmm9, %xmm2 + aesdec %xmm10, %xmm2 + aesdec %xmm11, %xmm2 + aesdec %xmm12, %xmm2 + aesdec %xmm13, %xmm2 + aesdec %xmm14, %xmm2 + aesdeclast %xmm15, %xmm2 +#else + movups 112(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 96(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 48(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm2 +#endif + + pxor iv, %xmm2 // obuf ^= iv; + movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + + movups %xmm2, (obuf) // write obuf + + add $16, ibuf // ibuf += AES_BLOCK_SIZE; + add $16, obuf // obuf += AES_BLOCK_SIZE; + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + jmp L_HW_cbc_done + + // + // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_decrypt_256: + + cmp $1, num_blk + jl L_HW_cbc_done + + movups 224(ctx), %xmm3 + movups 208(ctx), %xmm4 + movups 192(ctx), %xmm5 + movups 176(ctx), %xmm6 + movups 160(ctx), %xmm7 +#if defined __x86_64__ + movups 144(ctx), %xmm8 + movups 128(ctx), %xmm9 + movups 112(ctx), %xmm10 + movups 96(ctx), %xmm11 + movups 80(ctx), %xmm12 + movups 64(ctx), %xmm13 + movups 48(ctx), %xmm14 + movups 32(ctx), %xmm15 +// movups 16(ctx), %xmm14 +// movups (ctx), %xmm15 +#endif + +#if
defined __x86_64__ + + sub $4, num_blk // pre decrement num_blk by 4 + jl 9f // if num_blk < 4, skip the per-4-blocks processing code +0: + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm14 // tmp = 3rd ibuf + movups 48(ibuf), %xmm15 // tmp = 4th ibuf + + // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13 + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm14 + pxor %xmm3, %xmm15 + + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm14 + aesdec %xmm4, %xmm15 + + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm14 + aesdec %xmm5, %xmm15 + + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm14 + aesdec %xmm6, %xmm15 + + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm14 + aesdec %xmm7, %xmm15 + + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm14 + aesdec %xmm8, %xmm15 + + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm14 + aesdec %xmm9, %xmm15 + + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm14 + aesdec %xmm10, %xmm15 + + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm14 + aesdec %xmm11, %xmm15 + + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + movups 48(ctx), %xmm12 + + aesdec %xmm13, %xmm1 + aesdec %xmm13, %xmm2 + aesdec %xmm13, %xmm14 + aesdec %xmm13, %xmm15 + movups 32(ctx), %xmm13 + + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + movups 16(ctx), %xmm12 + + aesdec %xmm13, %xmm1 + aesdec %xmm13, %xmm2 + aesdec %xmm13, %xmm14 + aesdec %xmm13, %xmm15 + movups (ctx), %xmm13 + + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + movups 80(ctx), %xmm12 + + aesdeclast %xmm13, %xmm1 + aesdeclast %xmm13, %xmm2 + aesdeclast %xmm13, %xmm14 + aesdeclast %xmm13, %xmm15 + movups 64(ctx), %xmm13 + + pxor iv, %xmm1 // obuf ^= iv; + 
movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm2 // obuf ^= iv; + movups 16(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm14 // obuf ^= iv; + movups 32(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm15 // obuf ^= iv; + movups 48(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm14, 32(obuf) // write 3rd obuf + movups %xmm15, 48(obuf) // write 4th obuf + + add $64, ibuf // ibuf += AES_BLOCK_SIZE*4; + add $64, obuf // obuf += AES_BLOCK_SIZE*4; + + sub $4, num_blk // num_blk -= 4 + jge 0b // if num_blk > 0, repeat the loop + +9: add $4, num_blk // post incremtn num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code + + movups 48(ctx), %xmm14 + movups 32(ctx), %xmm15 + +#else + + sub $4, num_blk // pre decrement num_blk by 4 + jl 9f // if num_blk < 4, skip the per-pair processing code +0: + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm4 // tmp = 3rd ibuf + movups 48(ibuf), %xmm5 // tmp = 4th ibuf + + // aes_decrypt + // for i386, sequentially load expanded keys into xmm6/xmm7 + movups 208(ctx), %xmm6 + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm4 + pxor %xmm3, %xmm5 + + movups 192(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 176(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 160(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 144(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 128(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 112(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 
+ aesdec %xmm7, %xmm5 + + movups 96(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 80(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 64(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 48(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 32(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 16(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 0(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + aesdeclast %xmm7, %xmm1 + aesdeclast %xmm7, %xmm2 + aesdeclast %xmm7, %xmm4 + aesdeclast %xmm7, %xmm5 + + pxor iv, %xmm1 // 1st obuf ^= iv; + movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm2 // 2nd obuf ^= iv; + movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm4 // 3rd obuf ^= iv; + movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm5 // 4th obuf ^= iv; + movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm4, 32(obuf) // write 3rd obuf + movups %xmm5, 48(obuf) // write 4th obuf + + add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; + add $64, obuf // obuf += AES_BLOCK_SIZE * 4; + + sub $4, num_blk // num_blk -= 4 + jge 0b // if num_blk > 0, repeat the loop + + +9: add $4, num_blk // post incremtn num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code + + movups 208(ctx), %xmm4 + movups 192(ctx), %xmm5 + movups 176(ctx), %xmm6 + movups 160(ctx), %xmm7 + +#endif + +0: + movups (ibuf), %xmm2 // tmp = ibuf + + // aes_decrypt + pxor %xmm3, %xmm2 + aesdec 
%xmm4, %xmm2 + aesdec %xmm5, %xmm2 + aesdec %xmm6, %xmm2 + aesdec %xmm7, %xmm2 +#if defined __x86_64__ + aesdec %xmm8, %xmm2 + aesdec %xmm9, %xmm2 + aesdec %xmm10, %xmm2 + aesdec %xmm11, %xmm2 + aesdec %xmm12, %xmm2 + aesdec %xmm13, %xmm2 + aesdec %xmm14, %xmm2 + aesdec %xmm15, %xmm2 +#else + movups 144(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 128(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 112(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 96(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 48(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm2 +#endif + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm2 + + pxor iv, %xmm2 // obuf ^= iv; + movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + + movups %xmm2, (obuf) // write obuf + + add $16, ibuf // ibuf += AES_BLOCK_SIZE; + add $16, obuf // obuf += AES_BLOCK_SIZE; + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + jmp L_HW_cbc_done + + // + // --------- END of aes_decrypt_cbc_hw ------------------- + // diff --git a/bsd/dev/random/randomdev.c b/bsd/dev/random/randomdev.c index c081bd209..c29e9f877 100644 --- a/bsd/dev/random/randomdev.c +++ b/bsd/dev/random/randomdev.c @@ -56,6 +56,7 @@ #include #include +#include #include #include @@ -101,13 +102,14 @@ static struct cdevsw random_cdevsw = /* Used to detect whether we've already been initialized */ -static int gRandomInstalled = 0; +static UInt8 gRandomInstalled = 0; static PrngRef gPrngRef; static int gRandomError = 1; static lck_grp_t *gYarrowGrp; static lck_attr_t *gYarrowAttr; static lck_grp_attr_t *gYarrowGrpAttr; static lck_mtx_t *gYarrowMutex = 0; +static UInt8 gYarrowInitializationLock = 0; #define RESEED_TICKS 50 /* how long a reseed operation can take */ @@ -307,6 +309,27 @@ PreliminarySetup(void) { prng_error_status perr; + /* Multiple threads can enter this as a result of 
an earlier + * check of gYarrowMutex. We make sure that only one of them + * can enter at a time. If one of them enters and discovers + * that gYarrowMutex is no longer NULL, we know that another + * thread has initialized the Yarrow state and we can exit. + */ + + /* The first thread that enters this function will find + * gYarrowInitializationLock set to 0. It will atomically + * set the value to 1 and, seeing that it was zero, drop + * out of the loop. Other threads will see that the value is + * 1 and continue to loop until we are initialized. + */ + + while (OSTestAndSet(0, &gYarrowInitializationLock)); /* serialize access to this function */ + + if (gYarrowMutex) { + /* we've already been initialized, clear and get out */ + goto function_exit; + } + /* create a Yarrow object */ perr = prngInitialize(&gPrngRef); if (perr != 0) { @@ -321,6 +344,8 @@ PreliminarySetup(void) char buffer [16]; /* get a little non-deterministic data as an initial seed. */ + /* On OSX, securityd will add much more entropy as soon as it */ + /* comes up. On iOS, entropy is added with each system interrupt. */ microtime(&tt); /* @@ -334,7 +359,7 @@ PreliminarySetup(void) if (perr != 0) { /* an error, complain */ printf ("Couldn't seed Yarrow.\n"); - return; + goto function_exit; } /* turn the data around */ @@ -350,6 +375,10 @@ PreliminarySetup(void) gYarrowMutex = lck_mtx_alloc_init(gYarrowGrp, gYarrowAttr); fips_initialize (); + +function_exit: + /* allow other threads to figure out whether or not we have been initialized. 
*/ + gYarrowInitializationLock = 0; } const Block kKnownAnswer = {0x92, 0xb4, 0x04, 0xe5, 0x56, 0x58, 0x8c, 0xed, 0x6c, 0x1a, 0xcd, 0x4e, 0xbf, 0x05, 0x3f, 0x68, 0x09, 0xf7, 0x3a, 0x93}; @@ -384,14 +413,11 @@ random_init(void) { int ret; - if (gRandomInstalled) + if (OSTestAndSet(0, &gRandomInstalled)) { + /* do this atomically so that it works correctly with + multiple threads */ return; - - /* install us in the file system */ - gRandomInstalled = 1; - - /* setup yarrow and the mutex */ - PreliminarySetup(); + } ret = cdevsw_add(RANDOM_MAJOR, &random_cdevsw); if (ret < 0) { @@ -409,6 +435,9 @@ random_init(void) */ devfs_make_node(makedev (ret, 1), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666, "urandom", 0); + + /* setup yarrow and the mutex if needed*/ + PreliminarySetup(); } int diff --git a/bsd/hfs/hfs_catalog.c b/bsd/hfs/hfs_catalog.c index 0ab6f4585..673adbb91 100644 --- a/bsd/hfs/hfs_catalog.c +++ b/bsd/hfs/hfs_catalog.c @@ -948,7 +948,22 @@ cat_create(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr buildthreadkey(nextCNID, std_hfs, (CatalogKey *) &bto->iterator.key); - result = BTInsertRecord(fcb, &bto->iterator, &btdata, datalen); + /* + * If the CNID wraparound bit is set, then we need to validate if there + * is a cnode in the hash already with this ID (even if it no longer exists + * on disk). If so, then just skip this ID and move on to the next one. 
+ */ + if (!std_hfs && (hfsmp->vcbAtrb & kHFSCatalogNodeIDsReusedMask)) { + if (hfs_chash_snoop (hfsmp, nextCNID, 1, NULL, NULL) == 0) { + /* It was found in the cnode hash!*/ + result = btExists; + } + } + + if (result == 0) { + result = BTInsertRecord(fcb, &bto->iterator, &btdata, datalen); + } + if ((result == btExists) && !std_hfs && (hfsmp->vcbAtrb & kHFSCatalogNodeIDsReusedMask)) { /* * Allow CNIDs on HFS Plus volumes to wrap around @@ -2089,6 +2104,9 @@ cat_createlink(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr * int thread_inserted = 0; int alias_allocated = 0; int result = 0; + int std_hfs; + + std_hfs = (hfsmp->hfs_flags & HFS_STANDARD); fcb = hfsmp->hfs_catalog_cp->c_datafork; @@ -2128,8 +2146,24 @@ cat_createlink(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr * for (;;) { buildthreadkey(nextCNID, 0, (CatalogKey *) &bto->iterator.key); + + /* + * If the CNID wraparound bit is set, then we need to validate if there + * is a cnode in the hash already with this ID (even if it no longer exists + * on disk). If so, then just skip this ID and move on to the next one. + */ + if (!std_hfs && (hfsmp->vcbAtrb & kHFSCatalogNodeIDsReusedMask)) { + /* Verify that the CNID does not already exist in the cnode hash... 
*/ + if (hfs_chash_snoop (hfsmp, nextCNID, 1, NULL, NULL) == 0) { + /* It was found in the cnode hash!*/ + result = btExists; + } + } + + if (result == 0) { + result = BTInsertRecord(fcb, &bto->iterator, &btdata, datalen); + } - result = BTInsertRecord(fcb, &bto->iterator, &btdata, datalen); if ((result == btExists) && (hfsmp->vcbAtrb & kHFSCatalogNodeIDsReusedMask)) { /* * Allow CNIDs on HFS Plus volumes to wrap around diff --git a/bsd/hfs/hfs_chash.c b/bsd/hfs/hfs_chash.c index 997d247ae..13c58bf5b 100644 --- a/bsd/hfs/hfs_chash.c +++ b/bsd/hfs/hfs_chash.c @@ -226,7 +226,7 @@ hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int wantrsrc, int skiploc * */ int -hfs_chash_snoop(struct hfsmount *hfsmp, ino_t inum, int (*callout)(const struct cat_desc *, +hfs_chash_snoop(struct hfsmount *hfsmp, ino_t inum, int existence_only, int (*callout)(const struct cat_desc *, const struct cat_attr *, void *), void * arg) { struct cnode *cp; @@ -242,7 +242,27 @@ hfs_chash_snoop(struct hfsmount *hfsmp, ino_t inum, int (*callout)(const struct for (cp = CNODEHASH(hfsmp, inum)->lh_first; cp; cp = cp->c_hash.le_next) { if (cp->c_fileid != inum) continue; - /* Skip cnodes that have been removed from the catalog */ + + /* + * Under normal circumstances, we would want to return ENOENT if a cnode is in + * the hash and it is marked C_NOEXISTS or C_DELETED. However, if the CNID + * namespace has wrapped around, then we have the possibility of collisions. + * In that case, we may use this function to validate whether or not we + * should trust the nextCNID value in the hfs mount point. + * + * If we didn't do this, then it would be possible for a cnode that is no longer backed + * by anything on-disk (C_NOEXISTS) to still exist in the hash along with its + * vnode. The cat_create routine could then create a new entry in the catalog + * re-using that CNID. Then subsequent hfs_getnewvnode calls will repeatedly fail + * trying to look it up/validate it because it is marked C_NOEXISTS. 
So we want + * to prevent that from happening as much as possible. + */ + if (existence_only) { + result = 0; + break; + } + + /* Skip cnodes that have been removed from the catalog */ if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { break; } diff --git a/bsd/hfs/hfs_cnode.h b/bsd/hfs/hfs_cnode.h index 73c2f664a..03878ef7f 100644 --- a/bsd/hfs/hfs_cnode.h +++ b/bsd/hfs/hfs_cnode.h @@ -322,7 +322,7 @@ extern struct vnode * hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int int skiplock, int allow_deleted); extern struct cnode * hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, int wantrsrc, int skiplock, int *out_flags, int *hflags); -extern int hfs_chash_snoop(struct hfsmount *, ino_t, int (*)(const struct cat_desc *, +extern int hfs_chash_snoop(struct hfsmount *, ino_t, int, int (*)(const struct cat_desc *, const struct cat_attr *, void *), void *); extern int hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, cnid_t cnid, struct cat_attr *cattr, int *error); @@ -345,6 +345,8 @@ extern int hfs_chash_set_childlinkbit(struct hfsmount *hfsmp, cnid_t cnid); * E. Overflow Extents B-tree file (always exclusive, supports recursion) * 5. hfs mount point (always last) * + * + * I. HFS cnode hash lock (must not acquire any new locks while holding this lock, always taken last) */ enum hfslocktype {HFS_SHARED_LOCK = 1, HFS_EXCLUSIVE_LOCK = 2, HFS_FORCE_LOCK = 3, HFS_RECURSE_TRUNCLOCK = 4}; #define HFS_SHARED_OWNER (void *)0xffffffff diff --git a/bsd/hfs/hfs_quota.c b/bsd/hfs/hfs_quota.c index 5c219f0d8..7224278c7 100644 --- a/bsd/hfs/hfs_quota.c +++ b/bsd/hfs/hfs_quota.c @@ -106,6 +106,7 @@ hfs_getinoquota(cp) struct hfsmount *hfsmp; struct vnode *vp; int error; + int drop_usrquota = false; vp = cp->c_vp ? cp->c_vp : cp->c_rsrc_vp; hfsmp = VTOHFS(vp); @@ -113,20 +114,30 @@ hfs_getinoquota(cp) * Set up the user quota based on file uid. * EINVAL means that quotas are not enabled. 
*/ - if (cp->c_dquot[USRQUOTA] == NODQUOT && - (error = - dqget(cp->c_uid, &hfsmp->hfs_qfiles[USRQUOTA], USRQUOTA, &cp->c_dquot[USRQUOTA])) && - error != EINVAL) - return (error); + if (cp->c_dquot[USRQUOTA] == NODQUOT) { + error = dqget(cp->c_uid, &hfsmp->hfs_qfiles[USRQUOTA], USRQUOTA, &cp->c_dquot[USRQUOTA]); + if ((error != 0) && (error != EINVAL)) { + return error; + } else if (error == 0) { + drop_usrquota = true; + } + } + /* * Set up the group quota based on file gid. * EINVAL means that quotas are not enabled. */ - if (cp->c_dquot[GRPQUOTA] == NODQUOT && - (error = - dqget(cp->c_gid, &hfsmp->hfs_qfiles[GRPQUOTA], GRPQUOTA, &cp->c_dquot[GRPQUOTA])) && - error != EINVAL) - return (error); + if (cp->c_dquot[GRPQUOTA] == NODQUOT) { + error = dqget(cp->c_gid, &hfsmp->hfs_qfiles[GRPQUOTA], GRPQUOTA, &cp->c_dquot[GRPQUOTA]); + if ((error != 0) && (error != EINVAL)) { + if (drop_usrquota == true) { + dqrele(cp->c_dquot[USRQUOTA]); + cp->c_dquot[USRQUOTA] = NODQUOT; + } + return error; + } + } + return (0); } diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c index 27901f5de..7bf65093c 100644 --- a/bsd/hfs/hfs_readwrite.c +++ b/bsd/hfs/hfs_readwrite.c @@ -942,7 +942,7 @@ do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid, struct cinfo c_info; /* otherwise, check the cnode hash incase the file/dir is incore */ - if (hfs_chash_snoop(hfsmp, cnid, snoop_callback, &c_info) == 0) { + if (hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info) == 0) { cnattrp->ca_uid = c_info.uid; cnattrp->ca_gid = c_info.gid; cnattrp->ca_mode = c_info.mode; diff --git a/bsd/hfs/hfs_vfsops.c b/bsd/hfs/hfs_vfsops.c index 1a1ca2aa4..0bac8a3eb 100644 --- a/bsd/hfs/hfs_vfsops.c +++ b/bsd/hfs/hfs_vfsops.c @@ -999,7 +999,7 @@ hfs_syncer(void *arg0, void *unused) // if (hfsmp->hfs_mp->mnt_pending_write_size > hfsmp->hfs_max_pending_io) { int counter=0; - uint64_t pending_io, start, rate; + uint64_t pending_io, start, rate = 0; no_max = 0; @@ -1027,7 
+1027,9 @@ hfs_syncer(void *arg0, void *unused) clock_get_calendar_microtime(&secs, &usecs); now = ((uint64_t)secs * 1000000ULL) + (uint64_t)usecs; hfsmp->hfs_last_sync_time = now; - rate = ((pending_io * 1000000ULL) / (now - start)); // yields bytes per second + if (now != start) { + rate = ((pending_io * 1000000ULL) / (now - start)); // yields bytes per second + } hfs_end_transaction(hfsmp); @@ -1037,7 +1039,7 @@ hfs_syncer(void *arg0, void *unused) // than 2 seconds, adjust hfs_max_pending_io so that we // will allow about 1.5 seconds of i/o to queue up. // - if ((now - start) >= 300000) { + if (((now - start) >= 300000) && (rate != 0)) { uint64_t scale = (pending_io * 100) / rate; if (scale < 100 || scale > 200) { diff --git a/bsd/hfs/hfs_vnops.c b/bsd/hfs/hfs_vnops.c index 03c113475..6d93ba3c4 100644 --- a/bsd/hfs/hfs_vnops.c +++ b/bsd/hfs/hfs_vnops.c @@ -482,16 +482,6 @@ hfs_vnop_open(struct vnop_open_args *ap) if (cp->c_fileid == hfsmp->hfs_jnlfileid) return (EPERM); - /* If we're going to write to the file, initialize quotas. */ -#if QUOTA - if ((ap->a_mode & FWRITE) && (hfsmp->hfs_flags & HFS_QUOTAS)) - (void)hfs_getinoquota(cp); -#endif /* QUOTA */ - - /* - * On the first (non-busy) open of a fragmented - * file attempt to de-frag it (if its less than 20MB). - */ if ((hfsmp->hfs_flags & HFS_READ_ONLY) || (hfsmp->jnl == NULL) || #if NAMEDSTREAMS @@ -504,6 +494,17 @@ hfs_vnop_open(struct vnop_open_args *ap) if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) return (error); + +#if QUOTA + /* If we're going to write to the file, initialize quotas. */ + if ((ap->a_mode & FWRITE) && (hfsmp->hfs_flags & HFS_QUOTAS)) + (void)hfs_getinoquota(cp); +#endif /* QUOTA */ + + /* + * On the first (non-busy) open of a fragmented + * file attempt to de-frag it (if its less than 20MB). 
+ */ fp = VTOF(vp); if (fp->ff_blocks && fp->ff_extents[7].blockCount != 0 && @@ -535,6 +536,7 @@ hfs_vnop_open(struct vnop_open_args *ap) vfs_context_proc(ap->a_context)); } } + hfs_unlock(cp); return (0); diff --git a/bsd/kern/kern_exec.c b/bsd/kern/kern_exec.c index 3de273b36..4c17cd232 100644 --- a/bsd/kern/kern_exec.c +++ b/bsd/kern/kern_exec.c @@ -3168,7 +3168,6 @@ exec_handle_sugid(struct image_params *imgp) int i; int leave_sugid_clear = 0; int error = 0; - struct vnode *dev_null = NULLVP; #if CONFIG_MACF int mac_transition; @@ -3296,25 +3295,6 @@ exec_handle_sugid(struct image_params *imgp) if (!leave_sugid_clear) OSBitOrAtomic(P_SUGID, &p->p_flag); - /* Cache the vnode for /dev/null the first time around */ - if (dev_null == NULLVP) { - struct nameidata nd1; - - NDINIT(&nd1, LOOKUP, OP_OPEN, FOLLOW, UIO_SYSSPACE, - CAST_USER_ADDR_T("/dev/null"), - imgp->ip_vfs_context); - - if ((error = vn_open(&nd1, FREAD, 0)) == 0) { - dev_null = nd1.ni_vp; - /* - * vn_open returns with both a use_count - * and an io_count on the found vnode - * drop the io_count, but keep the use_count - */ - vnode_put(nd1.ni_vp); - } - } - /* * Radar 2261856; setuid security hole fix * XXX For setuid processes, attempt to ensure that @@ -3323,40 +3303,48 @@ exec_handle_sugid(struct image_params *imgp) * descriptors in this range which has implied meaning * to libc. 
*/ - if (dev_null != NULLVP) { - for (i = 0; i < 3; i++) { - struct fileproc *fp; - int indx; + for (i = 0; i < 3; i++) { - if (p->p_fd->fd_ofiles[i] != NULL) - continue; + if (p->p_fd->fd_ofiles[i] != NULL) + continue; - if ((error = falloc(p, &fp, &indx, imgp->ip_vfs_context)) != 0) - continue; + /* + * Do the kernel equivalent of + * + * (void) open("/dev/null", O_RDONLY); + */ - if ((error = vnode_ref_ext(dev_null, FREAD, 0)) != 0) { - fp_free(p, indx, fp); - break; - } + struct fileproc *fp; + int indx; - fp->f_fglob->fg_flag = FREAD; - fp->f_fglob->fg_type = DTYPE_VNODE; - fp->f_fglob->fg_ops = &vnops; - fp->f_fglob->fg_data = (caddr_t)dev_null; - - proc_fdlock(p); - procfdtbl_releasefd(p, indx, NULL); - fp_drop(p, indx, fp, 1); - proc_fdunlock(p); + if ((error = falloc(p, + &fp, &indx, imgp->ip_vfs_context)) != 0) + continue; + + struct nameidata nd1; + + NDINIT(&nd1, LOOKUP, OP_OPEN, FOLLOW, UIO_SYSSPACE, + CAST_USER_ADDR_T("/dev/null"), + imgp->ip_vfs_context); + + if ((error = vn_open(&nd1, FREAD, 0)) != 0) { + fp_free(p, indx, fp); + break; } - /* - * for now we need to drop the reference immediately - * since we don't have any mechanism in place to - * release it before starting to unmount "/dev" - * during a reboot/shutdown - */ - vnode_rele(dev_null); - dev_null = NULLVP; + + struct fileglob *fg = fp->f_fglob; + + fg->fg_flag = FREAD; + fg->fg_type = DTYPE_VNODE; + fg->fg_ops = &vnops; + fg->fg_data = nd1.ni_vp; + + vnode_put(nd1.ni_vp); + + proc_fdlock(p); + procfdtbl_releasefd(p, indx, NULL); + fp_drop(p, indx, fp, 1); + proc_fdunlock(p); } } #if CONFIG_MACF diff --git a/bsd/kern/pthread_synch.c b/bsd/kern/pthread_synch.c index 3fbed7532..009ce5a7c 100644 --- a/bsd/kern/pthread_synch.c +++ b/bsd/kern/pthread_synch.c @@ -133,7 +133,7 @@ static void wq_unpark_continue(void); static void wq_unsuspend_continue(void); static int setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl); static boolean_t 
workqueue_addnewthread(struct workqueue *wq, boolean_t oc_thread); -static void workqueue_removethread(struct threadlist *tl); +static void workqueue_removethread(struct threadlist *tl, int fromexit); static void workqueue_lock_spin(proc_t); static void workqueue_unlock(proc_t); int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc); @@ -897,17 +897,23 @@ workqueue_callback(int type, thread_t thread) static void -workqueue_removethread(struct threadlist *tl) +workqueue_removethread(struct threadlist *tl, int fromexit) { struct workqueue *wq; struct uthread * uth; + /* + * If fromexit is set, the call is from workqueue_exit(, + * so some cleanups are to be avoided. + */ wq = tl->th_workq; TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry); - wq->wq_nthreads--; - wq->wq_thidlecount--; + if (fromexit == 0) { + wq->wq_nthreads--; + wq->wq_thidlecount--; + } /* * Clear the threadlist pointer in uthread so @@ -921,7 +927,10 @@ workqueue_removethread(struct threadlist *tl) if (uth != (struct uthread *)0) { uth->uu_threadlist = NULL; } - workqueue_unlock(wq->wq_proc); + if (fromexit == 0) { + /* during exit the lock is not held */ + workqueue_unlock(wq->wq_proc); + } if ( (tl->th_flags & TH_LIST_SUSPENDED) ) { /* @@ -930,7 +939,10 @@ workqueue_removethread(struct threadlist *tl) * since we're not going to spin up through the * normal exit path triggered from Libc */ - (void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, tl->th_allocsize); + if (fromexit == 0) { + /* vm map is already deallocated when this is called from exit */ + (void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, tl->th_allocsize); + } (void)mach_port_deallocate(get_task_ipcspace(wq->wq_task), tl->th_thport); KERNEL_DEBUG1(0xefffd014 | DBG_FUNC_END, wq, (uintptr_t)thread_tid(current_thread()), wq->wq_nthreads, 0xdead, thread_tid(tl->th_thread)); @@ -1048,10 +1060,10 @@ workqueue_addnewthread(struct workqueue *wq, boolean_t oc_thread) tl->th_policy = -1; uth = 
get_bsdthread_info(tl->th_thread); - uth->uu_threadlist = (void *)tl; workqueue_lock_spin(p); + uth->uu_threadlist = (void *)tl; TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry); wq->wq_thidlecount++; @@ -1373,21 +1385,7 @@ workqueue_exit(struct proc *p) kfree(tl, sizeof(struct threadlist)); } TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) { - - thread_sched_call(tl->th_thread, NULL); - - uth = get_bsdthread_info(tl->th_thread); - if (uth != (struct uthread *)0) { - uth->uu_threadlist = NULL; - } - TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry); - - /* - * drop our last ref on the thread - */ - thread_deallocate(tl->th_thread); - - kfree(tl, sizeof(struct threadlist)); + workqueue_removethread(tl, 1); } thread_call_free(wq->wq_atimer_call); @@ -1952,7 +1950,7 @@ wq_unsuspend_continue(void) * queue... remove it from our domain... * workqueue_removethread consumes the lock */ - workqueue_removethread(tl); + workqueue_removethread(tl, 0); thread_bootstrap_return(); } @@ -2024,7 +2022,7 @@ wq_unpark_continue(void) * * workqueue_removethread consumes the lock */ - workqueue_removethread(tl); + workqueue_removethread(tl, 0); thread_exception_return(); } diff --git a/bsd/kern/sys_pipe.c b/bsd/kern/sys_pipe.c index c374ea07e..27f2461b4 100644 --- a/bsd/kern/sys_pipe.c +++ b/bsd/kern/sys_pipe.c @@ -271,12 +271,31 @@ static lck_grp_attr_t *pipe_mtx_grp_attr; static zone_t pipe_zone; +#define PIPE_GARBAGE_AGE_LIMIT 5000 /* In milliseconds */ +#define PIPE_GARBAGE_QUEUE_LIMIT 32000 + +struct pipe_garbage { + struct pipe *pg_pipe; + struct pipe_garbage *pg_next; + uint64_t pg_timestamp; +}; + +static zone_t pipe_garbage_zone; +static struct pipe_garbage *pipe_garbage_head = NULL; +static struct pipe_garbage *pipe_garbage_tail = NULL; +static uint64_t pipe_garbage_age_limit = PIPE_GARBAGE_AGE_LIMIT; +static int pipe_garbage_count = 0; +static lck_mtx_t *pipe_garbage_lock; + SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); void pipeinit(void) { - pipe_zone 
= (zone_t)zinit(sizeof(struct pipe), 8192 * sizeof(struct pipe), 4096, "pipe zone"); + vm_size_t zone_size; + + zone_size = 8192 * sizeof(struct pipe); + pipe_zone = zinit(sizeof(struct pipe), zone_size, 4096, "pipe zone"); /* * allocate lock group attribute and group for pipe mutexes @@ -288,6 +307,15 @@ pipeinit(void) * allocate the lock attribute for pipe mutexes */ pipe_mtx_attr = lck_attr_alloc_init(); + + /* + * Set up garbage collection for dead pipes + */ + zone_size = (PIPE_GARBAGE_QUEUE_LIMIT + 20) * + sizeof(struct pipe_garbage); + pipe_garbage_zone = (zone_t)zinit(sizeof(struct pipe_garbage), + zone_size, 4096, "pipe garbage zone"); + pipe_garbage_lock = lck_mtx_alloc_init(pipe_mtx_grp, pipe_mtx_attr); } /* Bitmap for things to touch in pipe_touch() */ @@ -1492,6 +1520,8 @@ pipe_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx) break; case FWRITE: + if (wpipe) + wpipe->pipe_state |= PIPE_WSELECT; if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) || (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) { @@ -1559,6 +1589,78 @@ pipe_free_kmem(struct pipe *cpipe) #endif } +/* + * When a thread sets a write-select on a pipe, it creates an implicit, + * untracked dependency between that thread and the peer of the pipe + * on which the select is set. If the peer pipe is closed and freed + * before the select()ing thread wakes up, the system will panic as + * it attempts to unwind the dangling select(). To avoid that panic, + * we notice whenever a dangerous select() is set on a pipe, and + * defer the final deletion of the pipe until that select()s are all + * resolved. Since we can't currently detect exactly when that + * resolution happens, we use a simple garbage collection queue to + * reap the at-risk pipes 'later'. 
+ */ +static void +pipe_garbage_collect(struct pipe *cpipe) +{ + uint64_t old, now; + struct pipe_garbage *pgp; + + /* Convert msecs to nsecs and then to abstime */ + old = pipe_garbage_age_limit * 1000000; + nanoseconds_to_absolutetime(old, &old); + + lck_mtx_lock(pipe_garbage_lock); + + /* Free anything that's been on the queue longer than the age limit */ + now = mach_absolute_time(); + old = now - old; + while ((pgp = pipe_garbage_head) && pgp->pg_timestamp < old) { + pipe_garbage_head = pgp->pg_next; + if (pipe_garbage_head == NULL) + pipe_garbage_tail = NULL; + pipe_garbage_count--; + zfree(pipe_zone, pgp->pg_pipe); + zfree(pipe_garbage_zone, pgp); + } + + /* Add the new pipe (if any) to the tail of the garbage queue */ + if (cpipe) { + cpipe->pipe_state = PIPE_DEAD; + pgp = (struct pipe_garbage *)zalloc(pipe_garbage_zone); + if (pgp == NULL) { + /* + * We're too low on memory to garbage collect the + * pipe. Freeing it runs the risk of panicking the + * system. All we can do is leak it and leave + * a breadcrumb behind. The good news, such as it + * is, is that this will probably never happen. + * We will probably hit the panic below first.
+ */ + printf("Leaking pipe %p - no room left in the queue", + cpipe); + lck_mtx_unlock(pipe_garbage_lock); + return; + } + + pgp->pg_pipe = cpipe; + pgp->pg_timestamp = now; + pgp->pg_next = NULL; + + if (pipe_garbage_tail) + pipe_garbage_tail->pg_next = pgp; + pipe_garbage_tail = pgp; + if (pipe_garbage_head == NULL) + pipe_garbage_head = pipe_garbage_tail; + + if (pipe_garbage_count++ >= PIPE_GARBAGE_QUEUE_LIMIT) + panic("Length of pipe garbage queue exceeded %d", + PIPE_GARBAGE_QUEUE_LIMIT); + } + lck_mtx_unlock(pipe_garbage_lock); +} + /* * shutdown the pipe */ @@ -1637,9 +1739,12 @@ pipeclose(struct pipe *cpipe) } } pipe_free_kmem(cpipe); - - zfree(pipe_zone, cpipe); - + if (cpipe->pipe_state & PIPE_WSELECT) { + pipe_garbage_collect(cpipe); + } else { + zfree(pipe_zone, cpipe); + pipe_garbage_collect(NULL); + } } /*ARGSUSED*/ diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index 57f522919..c4530e994 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -283,7 +283,8 @@ ip_output_list( struct in_addr pkt_dst; struct ipf_pktopts *ippo = NULL, ipf_pktopts; #if IPSEC - struct route iproute; + struct ipsec_output_state ipsec_state; + struct route *ipsec_saved_route = NULL; struct socket *so = NULL; struct secpolicy *sp = NULL; #endif @@ -311,6 +312,10 @@ ip_output_list( boolean_t select_srcif; KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); +#if IPSEC + bzero(&ipsec_state, sizeof(ipsec_state)); +#endif /* IPSEC */ + packetlist = m0; #if IPFIREWALL args.next_hop = NULL; @@ -1112,15 +1117,12 @@ ip_output_list( printf("ip_output: Invalid policy found. 
%d\n", sp->policy); } { - struct ipsec_output_state state; - bzero(&state, sizeof(state)); - state.m = m; + ipsec_state.m = m; if (flags & IP_ROUTETOIF) { - state.ro = &iproute; - bzero(&iproute, sizeof(iproute)); + bzero(&ipsec_state.ro, sizeof(ipsec_state.ro)); } else - state.ro = ro; - state.dst = (struct sockaddr *)dst; + route_copyout(&ipsec_state.ro, ro, sizeof(ipsec_state.ro)); + ipsec_state.dst = (struct sockaddr *)dst; ip->ip_sum = 0; @@ -1143,23 +1145,25 @@ ip_output_list( struct ip *, ip, struct ifnet *, ifp, struct ip *, ip, struct ip6_hdr *, NULL); - error = ipsec4_output(&state, sp, flags); + error = ipsec4_output(&ipsec_state, sp, flags); - m0 = m = state.m; + m0 = m = ipsec_state.m; if (flags & IP_ROUTETOIF) { /* * if we have tunnel mode SA, we may need to ignore * IP_ROUTETOIF. */ - if (state.ro != &iproute || state.ro->ro_rt != NULL) { + if (ipsec_state.tunneled) { flags &= ~IP_ROUTETOIF; - ro = state.ro; + ipsec_saved_route = ro; + ro = &ipsec_state.ro; } - } else - ro = state.ro; - - dst = (struct sockaddr_in *)state.dst; + } else { + ipsec_saved_route = ro; + ro = &ipsec_state.ro; + } + dst = (struct sockaddr_in *)ipsec_state.dst; if (error) { /* mbuf is already reclaimed in ipsec4_output. 
*/ m0 = NULL; @@ -1780,10 +1784,8 @@ ip_output_list( } #if IPSEC if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { - if (ro == &iproute && ro->ro_rt) { - rtfree(ro->ro_rt); - ro->ro_rt = NULL; - } + if (ipsec_state.ro.ro_rt) + rtfree(ipsec_state.ro.ro_rt); if (sp != NULL) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ip_output call free SP:%x\n", sp)); diff --git a/bsd/netinet/tcp_timer.c b/bsd/netinet/tcp_timer.c index 706ec823c..a8369ca71 100644 --- a/bsd/netinet/tcp_timer.c +++ b/bsd/netinet/tcp_timer.c @@ -386,6 +386,14 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait) struct tcpcb *, tp, int32_t, TCPS_CLOSED); /* Become a regular mutex */ lck_mtx_convert_spin(&inp->inpcb_mtx); + + /* If this tp still happens to be on the timer list, + * take it out + */ + if (TIMER_IS_ON_LIST(tp)) { + tcp_remove_timer(tp); + } + if (inp->inp_state != INPCB_STATE_DEAD) { #if INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) @@ -887,10 +895,10 @@ tcp_remove_timer(struct tcpcb *tp) tp->t_flags &= ~(TF_TIMER_ONLIST); listp->entries--; - lck_mtx_unlock(listp->mtx); tp->tentry.le.le_next = NULL; tp->tentry.le.le_prev = NULL; + lck_mtx_unlock(listp->mtx); } /* Function to check if the timerlist needs to be rescheduled to run diff --git a/bsd/netinet6/ip6_forward.c b/bsd/netinet6/ip6_forward.c index f2d6e3bd6..f0d56fd7b 100644 --- a/bsd/netinet6/ip6_forward.c +++ b/bsd/netinet6/ip6_forward.c @@ -134,7 +134,6 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, struct secpolicy *sp = NULL; #endif struct timeval timenow; - int tunneledv4 = 0; unsigned int ifscope = IFSCOPE_NONE; #if PF struct pf_mtag *pf_mtag; @@ -296,15 +295,18 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, */ bzero(&state, sizeof(state)); state.m = m; - state.ro = NULL; /* update at ipsec6_output_tunnel() */ state.dst = NULL; /* update at ipsec6_output_tunnel() */ - error = ipsec6_output_tunnel(&state, sp, 0, &tunneledv4); + error = ipsec6_output_tunnel(&state, sp, 0); key_freesp(sp, 
KEY_SADB_UNLOCKED); - if (tunneledv4) + if (state.tunneled == 4) return; /* packet is gone - sent over IPv4 */ m = state.m; + if (state.ro.ro_rt) { + rtfree(state.ro.ro_rt); + state.ro.ro_rt = NULL; + } if (error) { /* mbuf is already reclaimed in ipsec6_output_tunnel. */ switch (error) { diff --git a/bsd/netinet6/ip6_output.c b/bsd/netinet6/ip6_output.c index 7abf54ca7..9b58e7ad7 100644 --- a/bsd/netinet6/ip6_output.c +++ b/bsd/netinet6/ip6_output.c @@ -268,7 +268,11 @@ ip6_output( int needipsectun = 0; struct socket *so = NULL; struct secpolicy *sp = NULL; + struct route_in6 *ipsec_saved_route = NULL; + struct ipsec_output_state ipsec_state; + bzero(&ipsec_state, sizeof(ipsec_state)); + /* for AH processing. stupid to have "socket" variable in IP layer... */ if (ipsec_bypass == 0) { @@ -572,7 +576,6 @@ ip6_output( { struct ip6_rthdr *rh = NULL; int segleft_org = 0; - struct ipsec_output_state state; if (exthdrs.ip6e_rthdr) { rh = mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *); @@ -580,11 +583,10 @@ ip6_output( rh->ip6r_segleft = 0; } - bzero(&state, sizeof(state)); - state.m = m; - error = ipsec6_output_trans(&state, nexthdrp, mprev, sp, flags, + ipsec_state.m = m; + error = ipsec6_output_trans(&ipsec_state, nexthdrp, mprev, sp, flags, &needipsectun); - m = state.m; + m = ipsec_state.m; if (error) { /* mbuf is already reclaimed in ipsec6_output_trans. */ m = NULL; @@ -741,10 +743,9 @@ ip6_output( dst->sin6_len = sizeof(struct sockaddr_in6); dst->sin6_addr = ip6->ip6_dst; } + #if IPSEC if (needipsec && needipsectun) { - struct ipsec_output_state state; - int tunneledv4 = 0; #if CONFIG_DTRACE struct ifnet *trace_ifp = (ifpp != NULL) ? 
(*ifpp) : NULL; #endif /* CONFIG_DTRACE */ @@ -759,22 +760,22 @@ ip6_output( bzero(&exthdrs, sizeof(exthdrs)); exthdrs.ip6e_ip6 = m; - bzero(&state, sizeof(state)); - state.m = m; - state.ro = (struct route *)ro; - state.dst = (struct sockaddr *)dst; + ipsec_state.m = m; + route_copyout(&ipsec_state.ro, (struct route *)ro, sizeof(ipsec_state.ro)); + ipsec_state.dst = (struct sockaddr *)dst; /* Added a trace here so that we can see packets inside a tunnel */ DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL, struct ip6_hdr *, ip6, struct ifnet *, trace_ifp, struct ip *, NULL, struct ip6_hdr *, ip6); - error = ipsec6_output_tunnel(&state, sp, flags, &tunneledv4); - if (tunneledv4) /* tunneled in IPv4 - packet is gone */ + error = ipsec6_output_tunnel(&ipsec_state, sp, flags); + if (ipsec_state.tunneled == 4) /* tunneled in IPv4 - packet is gone */ goto done; - m = state.m; - ro = (struct route_in6 *)state.ro; - dst = (struct sockaddr_in6 *)state.dst; + m = ipsec_state.m; + ipsec_saved_route = ro; + ro = (struct route_in6 *)&ipsec_state.ro; + dst = (struct sockaddr_in6 *)ipsec_state.dst; if (error) { /* mbuf is already reclaimed in ipsec6_output_tunnel. 
*/ m0 = m = NULL; @@ -1367,6 +1368,14 @@ ip6_output( ip6stat.ip6s_fragmented++; done: +#if IPSEC + if (ipsec_saved_route) { + ro = ipsec_saved_route; + if (ipsec_state.ro.ro_rt) { + rtfree(ipsec_state.ro.ro_rt); + } + } +#endif /* IPSEC */ if (ro == &ip6route && ro->ro_rt) { /* brace necessary for rtfree */ rtfree(ro->ro_rt); } else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) { diff --git a/bsd/netinet6/ipsec.c b/bsd/netinet6/ipsec.c index 91fd6db6d..6de4b97dc 100644 --- a/bsd/netinet6/ipsec.c +++ b/bsd/netinet6/ipsec.c @@ -2809,6 +2809,7 @@ ipsec4_output( int error = 0; struct sockaddr_in *dst4; struct sockaddr_in *sin; + struct route *ro4; lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED); @@ -2816,8 +2817,6 @@ ipsec4_output( panic("state == NULL in ipsec4_output"); if (!state->m) panic("state->m == NULL in ipsec4_output"); - if (!state->ro) - panic("state->ro == NULL in ipsec4_output"); if (!state->dst) panic("state->dst == NULL in ipsec4_output"); @@ -2962,33 +2961,32 @@ ipsec4_output( // grab sadb_mutex, before updating sah's route cache lck_mtx_lock(sadb_mutex); - state->ro = &sav->sah->sa_route; - state->dst = (struct sockaddr *)&state->ro->ro_dst; - dst4 = (struct sockaddr_in *)state->dst; - if (state->ro->ro_rt != NULL) { - RT_LOCK(state->ro->ro_rt); + ro4= &sav->sah->sa_route; + dst4 = (struct sockaddr_in *)&ro4->ro_dst; + if (ro4->ro_rt != NULL) { + RT_LOCK(ro4->ro_rt); } - if (state->ro->ro_rt != NULL && - (state->ro->ro_rt->generation_id != route_generation || - !(state->ro->ro_rt->rt_flags & RTF_UP) || + if (ro4->ro_rt != NULL && + (ro4->ro_rt->generation_id != route_generation || + !(ro4->ro_rt->rt_flags & RTF_UP) || dst4->sin_addr.s_addr != ip->ip_dst.s_addr)) { - RT_UNLOCK(state->ro->ro_rt); - rtfree(state->ro->ro_rt); - state->ro->ro_rt = NULL; + RT_UNLOCK(ro4->ro_rt); + rtfree(ro4->ro_rt); + ro4->ro_rt = NULL; } - if (state->ro->ro_rt == 0) { + if (ro4->ro_rt == 0) { dst4->sin_family = AF_INET; dst4->sin_len = sizeof(*dst4); dst4->sin_addr = 
ip->ip_dst; - rtalloc(state->ro); - if (state->ro->ro_rt == 0) { + rtalloc(ro4); + if (ro4->ro_rt == 0) { OSAddAtomic(1, &ipstat.ips_noroute); error = EHOSTUNREACH; // release sadb_mutex, after updating sah's route cache lck_mtx_unlock(sadb_mutex); goto bad; } - RT_LOCK(state->ro->ro_rt); + RT_LOCK(ro4->ro_rt); } /* @@ -3000,11 +2998,16 @@ ipsec4_output( * sockaddr via rt_setgate(). This is currently * addressed by SA_SIZE roundup in that routine. */ - if (state->ro->ro_rt->rt_flags & RTF_GATEWAY) { - state->dst = (struct sockaddr *)state->ro->ro_rt->rt_gateway; - dst4 = (struct sockaddr_in *)state->dst; + if (ro4->ro_rt->rt_flags & RTF_GATEWAY) + dst4 = (struct sockaddr_in *)ro4->ro_rt->rt_gateway; + RT_UNLOCK(ro4->ro_rt); + if (state->ro.ro_rt != NULL) { + rtfree(state->ro.ro_rt); + state->ro.ro_rt = NULL; } - RT_UNLOCK(state->ro->ro_rt); + route_copyout(&state->ro, ro4, sizeof(state->ro)); + state->dst = (struct sockaddr *)dst4; + state->tunneled = 4; // release sadb_mutex, after updating sah's route cache lck_mtx_unlock(sadb_mutex); } @@ -3260,8 +3263,7 @@ int ipsec6_output_tunnel( struct ipsec_output_state *state, struct secpolicy *sp, - __unused int flags, - int *tunneledv4) + __unused int flags) { struct ip6_hdr *ip6; struct ipsecrequest *isr = NULL; @@ -3270,11 +3272,10 @@ ipsec6_output_tunnel( int error = 0; int plen; struct sockaddr_in6* dst6; + struct route *ro6; lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED); - *tunneledv4 = 0; - if (!state) panic("state == NULL in ipsec6_output_tunnel"); if (!state->m) @@ -3409,7 +3410,7 @@ ipsec6_output_tunnel( error = EINVAL; goto bad; } - *tunneledv4 = 1; /* must not process any further in ip6_output */ + state->tunneled = 4; /* must not process any further in ip6_output */ error = ipsec64_encapsulate(state->m, sav); if (error) { state->m = 0; @@ -3532,31 +3533,30 @@ ipsec6_output_tunnel( // grab sadb_mutex, before updating sah's route cache lck_mtx_lock(sadb_mutex); - state->ro = &sav->sah->sa_route; - 
state->dst = (struct sockaddr *)&state->ro->ro_dst; - dst6 = (struct sockaddr_in6 *)state->dst; - if (state->ro->ro_rt) { - RT_LOCK(state->ro->ro_rt); + ro6 = &sav->sah->sa_route; + dst6 = (struct sockaddr_in6 *)&ro6->ro_dst; + if (ro6->ro_rt) { + RT_LOCK(ro6->ro_rt); } - if (state->ro->ro_rt != NULL && - (state->ro->ro_rt->generation_id != route_generation || - !(state->ro->ro_rt->rt_flags & RTF_UP) || + if (ro6->ro_rt != NULL && + (ro6->ro_rt->generation_id != route_generation || + !(ro6->ro_rt->rt_flags & RTF_UP) || !IN6_ARE_ADDR_EQUAL(&dst6->sin6_addr, &ip6->ip6_dst))) { - RT_UNLOCK(state->ro->ro_rt); - rtfree(state->ro->ro_rt); - state->ro->ro_rt = NULL; + RT_UNLOCK(ro6->ro_rt); + rtfree(ro6->ro_rt); + ro6->ro_rt = NULL; } - if (state->ro->ro_rt == 0) { + if (ro6->ro_rt == 0) { bzero(dst6, sizeof(*dst6)); dst6->sin6_family = AF_INET6; dst6->sin6_len = sizeof(*dst6); dst6->sin6_addr = ip6->ip6_dst; - rtalloc(state->ro); - if (state->ro->ro_rt) { - RT_LOCK(state->ro->ro_rt); + rtalloc(ro6); + if (ro6->ro_rt) { + RT_LOCK(ro6->ro_rt); } } - if (state->ro->ro_rt == 0) { + if (ro6->ro_rt == 0) { ip6stat.ip6s_noroute++; IPSEC_STAT_INCREMENT(ipsec6stat.out_noroute); error = EHOSTUNREACH; @@ -3574,11 +3574,16 @@ ipsec6_output_tunnel( * sockaddr via rt_setgate(). This is currently * addressed by SA_SIZE roundup in that routine. 
*/ - if (state->ro->ro_rt->rt_flags & RTF_GATEWAY) { - state->dst = (struct sockaddr *)state->ro->ro_rt->rt_gateway; - dst6 = (struct sockaddr_in6 *)state->dst; + if (ro6->ro_rt->rt_flags & RTF_GATEWAY) + dst6 = (struct sockaddr_in6 *)ro6->ro_rt->rt_gateway; + RT_UNLOCK(ro6->ro_rt); + if (state->ro.ro_rt != NULL) { + rtfree(state->ro.ro_rt); + state->ro.ro_rt = NULL; } - RT_UNLOCK(state->ro->ro_rt); + route_copyout(&state->ro, ro6, sizeof(state->ro)); + state->dst = (struct sockaddr *)dst6; + state->tunneled = 6; // release sadb_mutex, after updating sah's route cache lck_mtx_unlock(sadb_mutex); } diff --git a/bsd/netinet6/ipsec.h b/bsd/netinet6/ipsec.h index 8bb0feace..c31155576 100644 --- a/bsd/netinet6/ipsec.h +++ b/bsd/netinet6/ipsec.h @@ -279,8 +279,9 @@ struct ipsecstat { #ifdef KERNEL struct ipsec_output_state { + int tunneled; struct mbuf *m; - struct route *ro; + struct route ro; struct sockaddr *dst; }; diff --git a/bsd/netinet6/ipsec6.h b/bsd/netinet6/ipsec6.h index 7a4a59050..e24f11acc 100644 --- a/bsd/netinet6/ipsec6.h +++ b/bsd/netinet6/ipsec6.h @@ -76,7 +76,7 @@ extern const char *ipsec6_logpacketstr(struct ip6_hdr *, u_int32_t); extern int ipsec6_output_trans(struct ipsec_output_state *, u_char *, struct mbuf *, struct secpolicy *, int, int *); extern int ipsec6_output_tunnel(struct ipsec_output_state *, - struct secpolicy *, int, int*); + struct secpolicy *, int); extern int ipsec6_tunnel_validate(struct mbuf *, int, u_int, struct secasvar *); #endif /* KERNEL_PRIVATE */ diff --git a/bsd/nfs/nfs_gss.c b/bsd/nfs/nfs_gss.c index c848bfae6..f6d019ba2 100644 --- a/bsd/nfs/nfs_gss.c +++ b/bsd/nfs/nfs_gss.c @@ -500,12 +500,12 @@ nfs_gss_clnt_cred_put(struct nfsreq *req, struct nfsm_chain *nmc, mbuf_t args) while (win_getbit(cp->gss_clnt_seqbits, ((cp->gss_clnt_seqnum - cp->gss_clnt_seqwin) + 1) % cp->gss_clnt_seqwin)) { cp->gss_clnt_flags |= GSS_NEEDSEQ; - msleep(cp, cp->gss_clnt_mtx, slpflag, "seqwin", NULL); + msleep(cp, cp->gss_clnt_mtx, slpflag | 
PDROP, "seqwin", NULL); slpflag &= ~PCATCH; if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0))) { - lck_mtx_unlock(cp->gss_clnt_mtx); return (error); } + lck_mtx_lock(cp->gss_clnt_mtx); if (cp->gss_clnt_flags & GSS_CTX_INVAL) { /* Renewed while while we were waiting */ lck_mtx_unlock(cp->gss_clnt_mtx); diff --git a/bsd/nfs/nfs_vnops.c b/bsd/nfs/nfs_vnops.c index d1e130b88..a5917a297 100644 --- a/bsd/nfs/nfs_vnops.c +++ b/bsd/nfs/nfs_vnops.c @@ -966,6 +966,7 @@ nfs_vnop_close( // denyMode = NFS_OPEN_SHARE_DENY_WRITE; // else // denyMode = NFS_OPEN_SHARE_DENY_NONE; +#if 0 // Not yet if (fflag & FHASLOCK) { /* XXX assume FHASLOCK is for the deny mode and not flock */ /* FHASLOCK flock will be unlocked in the close path, but the flag is not cleared. */ @@ -978,8 +979,10 @@ nfs_vnop_close( } else { denyMode = NFS_OPEN_SHARE_DENY_NONE; } +#else // XXX don't do deny modes just yet (and never do it for !v4) denyMode = NFS_OPEN_SHARE_DENY_NONE; +#endif if (!accessMode) { /* diff --git a/bsd/sys/pipe.h b/bsd/sys/pipe.h index 71557f0b0..f45128bfb 100644 --- a/bsd/sys/pipe.h +++ b/bsd/sys/pipe.h @@ -132,6 +132,8 @@ struct pipemapping { #define PIPE_KNOTE 0x1000 /* Pipe has kernel events activated */ #define PIPE_DRAIN 0x2000 /* Waiting for I/O to drop for a close. Treated like EOF; only separate for easier debugging. 
*/ +#define PIPE_WSELECT 0x4000 /* Some thread has done an FWRITE select on the pipe */ +#define PIPE_DEAD 0x8000 /* Pipe is dead and needs garbage collection */ #ifdef KERNEL diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index 462fbef79..4280f3bfd 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -6006,13 +6006,8 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) return(EACCES); } - /* - * enforce sticky bit behaviour; the cached_delete_child property will - * be false and the dvap contents valis for sticky bit directories; - * this makes us check the directory each time, but it's unavoidable, - * as sticky bit is an exception to caching. - */ - if (!cached_delete_child && (dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) { + /* enforce sticky bit behaviour */ + if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) { KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)", vcp->vp, cred->cr_posix.cr_uid, vap->va_uid, dvap->va_uid); return(EACCES); @@ -6597,7 +6592,7 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i KAUTH_DEBUG("%p ERROR - failed to get vnode attributes - %d", vp, result); goto out; } - if (dvp && parent_authorized_for_delete_child == FALSE) { + if (dvp) { VATTR_WANTED(&dva, va_mode); VATTR_WANTED(&dva, va_uid); VATTR_WANTED(&dva, va_gid); diff --git a/config/MasterVersion b/config/MasterVersion index 20db06b5d..15cb149c8 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -11.1.0 +11.2.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. 
diff --git a/iokit/IOKit/IOKitKeysPrivate.h b/iokit/IOKit/IOKitKeysPrivate.h index 06794304e..b8f3cd2e2 100644 --- a/iokit/IOKit/IOKitKeysPrivate.h +++ b/iokit/IOKit/IOKitKeysPrivate.h @@ -43,6 +43,7 @@ #define kIOConsoleSessionUIDKey "kCGSSessionUserIDKey" /* value is OSNumber */ #define kIOConsoleSessionConsoleSetKey "kCGSSessionConsoleSetKey" /* value is OSNumber */ #define kIOConsoleSessionOnConsoleKey "kCGSSessionOnConsoleKey" /* value is OSBoolean */ +#define kIOConsoleSessionLoginDoneKey "kCGSessionLoginDoneKey" /* value is OSBoolean */ #define kIOConsoleSessionSecureInputPIDKey "kCGSSessionSecureInputPID" /* value is OSNumber */ #define kIOConsoleSessionScreenLockedTimeKey "CGSSessionScreenLockedTime" /* value is OSNumber, secs - 1970 */ diff --git a/iokit/IOKit/IOTypes.h b/iokit/IOKit/IOTypes.h index 3c41ab070..d56aea7be 100644 --- a/iokit/IOKit/IOTypes.h +++ b/iokit/IOKit/IOTypes.h @@ -240,6 +240,10 @@ enum { kTickScale = (kSecondScale / 100) }; +enum { + kIOConnectMethodVarOutputSize = -3 +}; + /* compatibility types */ #ifndef KERNEL diff --git a/iokit/IOKit/IOUserClient.h b/iokit/IOKit/IOUserClient.h index c3c40c57a..aaab07ebe 100644 --- a/iokit/IOKit/IOUserClient.h +++ b/iokit/IOKit/IOUserClient.h @@ -138,7 +138,11 @@ struct IOExternalMethodArguments IOMemoryDescriptor * structureOutputDescriptor; uint32_t structureOutputDescriptorSize; - uint32_t __reserved[32]; + uint32_t __reservedA; + + OSObject ** structureVariableOutputData; + + uint32_t __reserved[30]; }; typedef IOReturn (*IOExternalMethodAction)(OSObject * target, void * reference, @@ -153,7 +157,7 @@ struct IOExternalMethodDispatch }; enum { -#define IO_EXTERNAL_METHOD_ARGUMENTS_CURRENT_VERSION 1 +#define IO_EXTERNAL_METHOD_ARGUMENTS_CURRENT_VERSION 2 kIOExternalMethodArgumentsCurrentVersion = IO_EXTERNAL_METHOD_ARGUMENTS_CURRENT_VERSION }; diff --git a/iokit/Kernel/IOPMrootDomain.cpp b/iokit/Kernel/IOPMrootDomain.cpp index 81c6478e1..60f3ec07a 100644 --- a/iokit/Kernel/IOPMrootDomain.cpp 
+++ b/iokit/Kernel/IOPMrootDomain.cpp @@ -3717,6 +3717,7 @@ void IOPMrootDomain::handlePlatformHaltRestart( UInt32 pe_type ) case kPEPagingOff: ctx.PowerState = ON_STATE; ctx.MessageType = kIOMessageSystemPagingOff; + IOService::updateConsoleUsers(NULL, kIOMessageSystemPagingOff); break; default: diff --git a/iokit/Kernel/IOService.cpp b/iokit/Kernel/IOService.cpp index 6ef0b3413..f08348272 100644 --- a/iokit/Kernel/IOService.cpp +++ b/iokit/Kernel/IOService.cpp @@ -115,6 +115,7 @@ const OSSymbol * gIOConsoleSessionUIDKey; const OSSymbol * gIOConsoleSessionAuditIDKey; const OSSymbol * gIOConsoleUsersSeedKey; const OSSymbol * gIOConsoleSessionOnConsoleKey; +const OSSymbol * gIOConsoleSessionLoginDoneKey; const OSSymbol * gIOConsoleSessionSecureInputPIDKey; const OSSymbol * gIOConsoleSessionScreenLockedTimeKey; @@ -322,6 +323,7 @@ void IOService::initialize( void ) gIOConsoleUsersSeedKey = OSSymbol::withCStringNoCopy(kIOConsoleUsersSeedKey); gIOConsoleSessionOnConsoleKey = OSSymbol::withCStringNoCopy(kIOConsoleSessionOnConsoleKey); + gIOConsoleSessionLoginDoneKey = OSSymbol::withCStringNoCopy(kIOConsoleSessionLoginDoneKey); gIOConsoleSessionSecureInputPIDKey = OSSymbol::withCStringNoCopy(kIOConsoleSessionSecureInputPIDKey); gIOConsoleSessionScreenLockedTimeKey = OSSymbol::withCStringNoCopy(kIOConsoleSessionScreenLockedTimeKey); @@ -361,6 +363,8 @@ void IOService::initialize( void ) gIOConsoleLockCallout = thread_call_allocate(&IOService::consoleLockTimer, NULL); + IORegistryEntry::getRegistryRoot()->setProperty(gIOConsoleLockedKey, kOSBooleanTrue); + assert( gIOServiceBusyLock && gJobs && gJobsLock && gIOConsoleUsersLock && gIOConsoleLockCallout && (err == KERN_SUCCESS) ); @@ -4234,6 +4238,7 @@ void IOService::updateConsoleUsers(OSArray * consoleUsers, IOMessage systemMessa IORegistryEntry * regEntry; OSObject * locked = kOSBooleanFalse; uint32_t idx; + bool loggedIn; bool publish; OSDictionary * user; static IOMessage sSystemPower; @@ -4246,38 +4251,45 @@ void 
IOService::updateConsoleUsers(OSArray * consoleUsers, IOMessage systemMessa { sSystemPower = systemMessage; } + loggedIn = false; if (consoleUsers) { OSNumber * num = 0; for (idx = 0; - (!num) && (user = OSDynamicCast(OSDictionary, consoleUsers->getObject(idx))); + (user = OSDynamicCast(OSDictionary, consoleUsers->getObject(idx))); idx++) { - num = OSDynamicCast(OSNumber, user->getObject(gIOConsoleSessionScreenLockedTimeKey)); + loggedIn |= ((kOSBooleanTrue == user->getObject(gIOConsoleSessionOnConsoleKey)) + && (kOSBooleanTrue == user->getObject(gIOConsoleSessionLoginDoneKey))); + if (!num) + { + num = OSDynamicCast(OSNumber, user->getObject(gIOConsoleSessionScreenLockedTimeKey)); + } } gIOConsoleLockTime = num ? num->unsigned32BitValue() : 0; } - if (gIOConsoleLockTime) + if (!loggedIn + || (kIOMessageSystemWillSleep == sSystemPower) + || (kIOMessageSystemPagingOff == sSystemPower)) { - if (kIOMessageSystemWillSleep == sSystemPower) - locked = kOSBooleanTrue; + locked = kOSBooleanTrue; + } + else if (gIOConsoleLockTime) + { + clock_sec_t now; + clock_usec_t microsecs; + + clock_get_calendar_microtime(&now, &microsecs); + if (gIOConsoleLockTime > now) + { + AbsoluteTime deadline; + clock_interval_to_deadline(gIOConsoleLockTime - now, kSecondScale, &deadline); + thread_call_enter_delayed(gIOConsoleLockCallout, deadline); + } else { - clock_sec_t now; - clock_usec_t microsecs; - - clock_get_calendar_microtime(&now, &microsecs); - if (gIOConsoleLockTime > now) - { - AbsoluteTime deadline; - clock_interval_to_deadline(gIOConsoleLockTime - now, kSecondScale, &deadline); - thread_call_enter_delayed(gIOConsoleLockCallout, deadline); - } - else - { - locked = kOSBooleanTrue; - } + locked = kOSBooleanTrue; } } diff --git a/iokit/Kernel/IOUserClient.cpp b/iokit/Kernel/IOUserClient.cpp index f031afd66..32ce10c0f 100644 --- a/iokit/Kernel/IOUserClient.cpp +++ b/iokit/Kernel/IOUserClient.cpp @@ -2094,7 +2094,7 @@ kern_return_t is_io_registry_entry_get_registry_entry_id( // Create a
vm_map_copy_t or kalloc'ed data for memory // to be copied out. ipc will free after the copyout. -static kern_return_t copyoutkdata( void * data, vm_size_t len, +static kern_return_t copyoutkdata( const void * data, vm_size_t len, io_buf_ptr_t * buf ) { kern_return_t err; @@ -2774,6 +2774,97 @@ kern_return_t is_io_connect_set_properties( return( is_io_registry_entry_set_properties( connection, properties, propertiesCnt, result )); } +/* Routine io_user_client_method */ +kern_return_t is_io_connect_method_var_output +( + io_connect_t connection, + uint32_t selector, + io_scalar_inband64_t scalar_input, + mach_msg_type_number_t scalar_inputCnt, + io_struct_inband_t inband_input, + mach_msg_type_number_t inband_inputCnt, + mach_vm_address_t ool_input, + mach_vm_size_t ool_input_size, + io_struct_inband_t inband_output, + mach_msg_type_number_t *inband_outputCnt, + io_scalar_inband64_t scalar_output, + mach_msg_type_number_t *scalar_outputCnt, + io_buf_ptr_t *var_output, + mach_msg_type_number_t *var_outputCnt +) +{ + CHECK( IOUserClient, connection, client ); + + IOExternalMethodArguments args; + IOReturn ret; + IOMemoryDescriptor * inputMD = 0; + OSObject * structureVariableOutputData = 0; + + bzero(&args.__reserved[0], sizeof(args.__reserved)); + args.version = kIOExternalMethodArgumentsCurrentVersion; + + args.selector = selector; + + args.asyncWakePort = MACH_PORT_NULL; + args.asyncReference = 0; + args.asyncReferenceCount = 0; + args.structureVariableOutputData = &structureVariableOutputData; + + args.scalarInput = scalar_input; + args.scalarInputCount = scalar_inputCnt; + args.structureInput = inband_input; + args.structureInputSize = inband_inputCnt; + + if (ool_input) + inputMD = IOMemoryDescriptor::withAddressRange(ool_input, ool_input_size, + kIODirectionOut, current_task()); + + args.structureInputDescriptor = inputMD; + + args.scalarOutput = scalar_output; + args.scalarOutputCount = *scalar_outputCnt; + args.structureOutput = inband_output; + 
args.structureOutputSize = *inband_outputCnt; + args.structureOutputDescriptor = NULL; + args.structureOutputDescriptorSize = 0; + + IOStatisticsClientCall(); + ret = client->externalMethod( selector, &args ); + + *scalar_outputCnt = args.scalarOutputCount; + *inband_outputCnt = args.structureOutputSize; + + if (var_outputCnt && var_output && (kIOReturnSuccess == ret)) + { + OSSerialize * serialize; + OSData * data; + vm_size_t len; + + if ((serialize = OSDynamicCast(OSSerialize, structureVariableOutputData))) + { + len = serialize->getLength(); + *var_outputCnt = len; + ret = copyoutkdata(serialize->text(), len, var_output); + } + else if ((data = OSDynamicCast(OSData, structureVariableOutputData))) + { + len = data->getLength(); + *var_outputCnt = len; + ret = copyoutkdata(data->getBytesNoCopy(), len, var_output); + } + else + { + ret = kIOReturnUnderrun; + } + } + + if (inputMD) + inputMD->release(); + if (structureVariableOutputData) + structureVariableOutputData->release(); + + return (ret); +} /* Routine io_user_client_method */ kern_return_t is_io_connect_method @@ -2791,7 +2882,7 @@ kern_return_t is_io_connect_method io_scalar_inband64_t scalar_output, mach_msg_type_number_t *scalar_outputCnt, mach_vm_address_t ool_output, - mach_vm_size_t * ool_output_size + mach_vm_size_t *ool_output_size ) { CHECK( IOUserClient, connection, client ); @@ -2806,9 +2897,10 @@ kern_return_t is_io_connect_method args.selector = selector; - args.asyncWakePort = MACH_PORT_NULL; - args.asyncReference = 0; - args.asyncReferenceCount = 0; + args.asyncWakePort = MACH_PORT_NULL; + args.asyncReference = 0; + args.asyncReferenceCount = 0; + args.structureVariableOutputData = 0; args.scalarInput = scalar_input; args.scalarInputCount = scalar_inputCnt; @@ -2826,16 +2918,16 @@ kern_return_t is_io_connect_method args.structureOutput = inband_output; args.structureOutputSize = *inband_outputCnt; - if (ool_output) + if (ool_output && ool_output_size) { outputMD = 
IOMemoryDescriptor::withAddressRange(ool_output, *ool_output_size, kIODirectionIn, current_task()); } args.structureOutputDescriptor = outputMD; - args.structureOutputDescriptorSize = *ool_output_size; + args.structureOutputDescriptorSize = ool_output_size ? *ool_output_size : 0; - IOStatisticsClientCall(); + IOStatisticsClientCall(); ret = client->externalMethod( selector, &args ); *scalar_outputCnt = args.scalarOutputCount; diff --git a/osfmk/device/device.defs b/osfmk/device/device.defs index 2e39dc559..458d89540 100644 --- a/osfmk/device/device.defs +++ b/osfmk/device/device.defs @@ -681,6 +681,20 @@ routine io_registry_entry_get_registry_entry_id( out entry_id : uint64_t ); +routine io_connect_method_var_output( + connection : io_connect_t; + in selector : uint32_t; + + in scalar_input : io_scalar_inband64_t; + in inband_input : io_struct_inband_t; + in ool_input : mach_vm_address_t; + in ool_input_size : mach_vm_size_t; + + out inband_output : io_struct_inband_t, CountInOut; + out scalar_output : io_scalar_inband64_t, CountInOut; + out var_output : io_buf_ptr_t, physicalcopy + ); + #endif /* IOKIT */ /* vim: set ft=c : */ diff --git a/osfmk/i386/locks_i386.c b/osfmk/i386/locks_i386.c index 3f94ba02b..ef38300e4 100644 --- a/osfmk/i386/locks_i386.c +++ b/osfmk/i386/locks_i386.c @@ -489,10 +489,9 @@ usld_lock_common_checks( if (l == USIMPLE_LOCK_NULL) panic("%s: null lock pointer", caller); if (l->lock_type != USLOCK_TAG) - panic("%s: 0x%p is not a usimple lock", caller, l); + panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type); if (!(l->debug.state & USLOCK_INIT)) - panic("%s: %p is not an initialized lock", - caller, l); + panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state); return USLOCK_CHECKING(l); } diff --git a/osfmk/i386/rtclock_native.c b/osfmk/i386/rtclock_native.c index 5ffaf91d8..4d17039a2 100644 --- a/osfmk/i386/rtclock_native.c +++ b/osfmk/i386/rtclock_native.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 
Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -70,21 +70,6 @@ deadline_to_decrementer( } } -static inline uint64_t -_absolutetime_to_tsc(uint64_t ns) -{ - uint32_t generation; - uint64_t tsc; - - do { - generation = pal_rtc_nanotime_info.generation; - tsc = tmrCvt(ns - pal_rtc_nanotime_info.ns_base, tscFCvtn2t) - + pal_rtc_nanotime_info.tsc_base; - } while (generation == 0 || - generation != pal_rtc_nanotime_info.generation); - - return tsc; -} /* * Regular local APIC timer case: @@ -125,14 +110,19 @@ rtc_lapic_config_tsc_deadline_timer(void) static uint64_t rtc_lapic_set_tsc_deadline_timer(uint64_t deadline, uint64_t now) { - uint64_t set = 0; + uint64_t delta; + uint64_t delta_tsc; + uint64_t tsc = rdtsc64(); + uint64_t set = 0; if (deadline > 0) { /* * Convert to TSC */ - set = now + deadline_to_decrementer(deadline, now); - lapic_set_tsc_deadline_timer(_absolutetime_to_tsc(set)); + delta = deadline_to_decrementer(deadline, now); + set = now + delta; + delta_tsc = tmrCvt(delta, tscFCvtn2t); + lapic_set_tsc_deadline_timer(tsc + delta_tsc); } else { lapic_set_tsc_deadline_timer(0); } @@ -140,7 +130,7 @@ rtc_lapic_set_tsc_deadline_timer(uint64_t deadline, uint64_t now) KERNEL_DEBUG_CONSTANT( DECR_SET_TSC_DEADLINE | DBG_FUNC_NONE, now, deadline, - rdtsc64(), lapic_get_tsc_deadline_timer(), + tsc, lapic_get_tsc_deadline_timer(), 0); return set; diff --git a/osfmk/kern/host.c b/osfmk/kern/host.c index 15b742050..90542a946 100644 --- a/osfmk/kern/host.c +++ b/osfmk/kern/host.c @@ -602,6 +602,7 @@ get_sched_statistics( out->ps_timer_pop_count = stats->timer_pop_count; out->ps_runq_count_sum = SCHED(processor_runq_stats_count_sum)(processor); out->ps_idle_transitions = stats->idle_transitions; + out->ps_quantum_timer_expirations = stats->quantum_timer_expirations; out++; processor = processor->processor_list; diff --git a/osfmk/kern/priority.c b/osfmk/kern/priority.c index 
74b90dfa8..3273a4f6c 100644 --- a/osfmk/kern/priority.c +++ b/osfmk/kern/priority.c @@ -92,6 +92,8 @@ thread_quantum_expire( thread_t thread = p1; ast_t preempt; + SCHED_STATS_QUANTUM_TIMER_EXPIRATION(processor); + thread_lock(thread); /* @@ -157,8 +159,8 @@ thread_quantum_expire( thread->last_quantum_refill_time = processor->quantum_end; processor->quantum_end += thread->current_quantum; - timer_call_enter1(&processor->quantum_timer, - thread, processor->quantum_end, 0); + timer_call_enter1(&processor->quantum_timer, thread, + processor->quantum_end, TIMER_CALL_CRITICAL); /* * Context switch check. diff --git a/osfmk/kern/processor_data.h b/osfmk/kern/processor_data.h index eda5bcce5..7b01b11a4 100644 --- a/osfmk/kern/processor_data.h +++ b/osfmk/kern/processor_data.h @@ -51,6 +51,7 @@ struct processor_sched_statistics { uint32_t ipi_count; uint32_t timer_pop_count; uint32_t idle_transitions; + uint32_t quantum_timer_expirations; }; struct processor_data { @@ -124,6 +125,13 @@ MACRO_BEGIN \ } \ MACRO_END +#define SCHED_STATS_QUANTUM_TIMER_EXPIRATION(p) \ +MACRO_BEGIN \ + if (__builtin_expect(sched_stats_active, 0)) { \ + (p)->processor_data.sched_stats.quantum_timer_expirations++; \ + } \ +MACRO_END + #endif /* MACH_KERNEL_PRIVATE */ #endif /* _KERN_PROCESSOR_DATA_H_ */ diff --git a/osfmk/kern/sched_prim.c b/osfmk/kern/sched_prim.c index c73ef0f3d..d7b959249 100644 --- a/osfmk/kern/sched_prim.c +++ b/osfmk/kern/sched_prim.c @@ -1829,7 +1829,7 @@ thread_select_idle( thread->last_quantum_refill_time = processor->last_dispatch; processor->quantum_end = processor->last_dispatch + thread->current_quantum; - timer_call_enter1(&processor->quantum_timer, thread, processor->quantum_end, 0); + timer_call_enter1(&processor->quantum_timer, thread, processor->quantum_end, TIMER_CALL_CRITICAL); processor->timeslice = 1; thread->computation_epoch = processor->last_dispatch; @@ -2320,7 +2320,7 @@ thread_dispatch( * Set up quantum timer and timeslice. 
*/ processor->quantum_end = (processor->last_dispatch + self->current_quantum); - timer_call_enter1(&processor->quantum_timer, self, processor->quantum_end, 0); + timer_call_enter1(&processor->quantum_timer, self, processor->quantum_end, TIMER_CALL_CRITICAL); processor->timeslice = 1; diff --git a/osfmk/kern/syscall_subr.c b/osfmk/kern/syscall_subr.c index e45f99f17..3b1f0edaf 100644 --- a/osfmk/kern/syscall_subr.c +++ b/osfmk/kern/syscall_subr.c @@ -356,7 +356,7 @@ thread_depress_abstime( if (interval != 0) { clock_absolutetime_interval_to_deadline(interval, &deadline); - if (!timer_call_enter(&self->depress_timer, deadline, 0)) + if (!timer_call_enter(&self->depress_timer, deadline, TIMER_CALL_CRITICAL)) self->depress_timer_active++; } } @@ -453,7 +453,7 @@ thread_poll_yield( self->sched_flags |= TH_SFLAG_POLLDEPRESS; abstime += (total_computation >> sched_poll_yield_shift); - if (!timer_call_enter(&self->depress_timer, abstime, 0)) + if (!timer_call_enter(&self->depress_timer, abstime, TIMER_CALL_CRITICAL)) self->depress_timer_active++; thread_unlock(self); diff --git a/osfmk/mach/host_info.h b/osfmk/mach/host_info.h index 9ad10eaa7..d90f3be37 100644 --- a/osfmk/mach/host_info.h +++ b/osfmk/mach/host_info.h @@ -258,7 +258,7 @@ struct _processor_statistics_np { uint64_t ps_runq_count_sum __attribute((aligned(8))); uint32_t ps_idle_transitions; - + uint32_t ps_quantum_timer_expirations; }; #endif /* PRIVATE */ diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index 1fe35f53f..2e1fbe691 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -6624,6 +6624,52 @@ vm_map_copy_overwrite_aligned( continue; } + if (entry->alias >= VM_MEMORY_MALLOC && + entry->alias <= VM_MEMORY_MALLOC_LARGE_REUSED) { + vm_object_t new_object, new_shadow; + + /* + * We're about to map something over a mapping + * established by malloc()... 
+ */ + new_object = copy_entry->object.vm_object; + if (new_object != VM_OBJECT_NULL) { + vm_object_lock_shared(new_object); + } + while (new_object != VM_OBJECT_NULL && + new_object->internal) { + new_shadow = new_object->shadow; + if (new_shadow == VM_OBJECT_NULL) { + break; + } + vm_object_lock_shared(new_shadow); + vm_object_unlock(new_object); + new_object = new_shadow; + } + if (new_object != VM_OBJECT_NULL) { + if (!new_object->internal) { + /* + * The new mapping is backed + * by an external object. We + * don't want malloc'ed memory + * to be replaced with such a + * non-anonymous mapping, so + * let's go off the optimized + * path... + */ + vm_object_unlock(new_object); + goto slow_copy; + } + vm_object_unlock(new_object); + } + /* + * The new mapping is still backed by + * anonymous (internal) memory, so it's + * OK to substitute it for the original + * malloc() mapping. + */ + } + if (old_object != VM_OBJECT_NULL) { if(entry->is_sub_map) { if(entry->use_pmap) { @@ -6701,16 +6747,41 @@ vm_map_copy_overwrite_aligned( tmp_entry = tmp_entry->vme_next; } else { vm_map_version_t version; - vm_object_t dst_object = entry->object.vm_object; - vm_object_offset_t dst_offset = entry->offset; + vm_object_t dst_object; + vm_object_offset_t dst_offset; kern_return_t r; + slow_copy: + dst_object = entry->object.vm_object; + dst_offset = entry->offset; + /* * Take an object reference, and record * the map version information so that the * map can be safely unlocked. */ + if (dst_object == VM_OBJECT_NULL) { + /* + * We would usually have just taken the + * optimized path above if the destination + * object has not been allocated yet. But we + * now disable that optimization if the copy + * entry's object is not backed by anonymous + * memory to avoid replacing malloc'ed + * (i.e. re-usable) anonymous memory with a + * not-so-anonymous mapping. + * So we have to handle this case here and + * allocate a new VM object for this map entry. 
+ */ + dst_object = vm_object_allocate( + entry->vme_end - entry->vme_start); + dst_offset = 0; + entry->object.vm_object = dst_object; + entry->offset = dst_offset; + + } + vm_object_reference(dst_object); /* account for unlock bumping up timestamp */ diff --git a/osfmk/x86_64/pmap.c b/osfmk/x86_64/pmap.c index 69a5c542d..2dadb540c 100644 --- a/osfmk/x86_64/pmap.c +++ b/osfmk/x86_64/pmap.c @@ -1051,9 +1051,10 @@ pmap_destroy(pmap_t p) * physically on the right pmap: */ PMAP_UPDATE_TLBS(p, 0x0ULL, 0xFFFFFFFFFFFFF000ULL); + if (pmap_pcid_ncpus) + pmap_destroy_pcid_sync(p); } - if (pmap_pcid_ncpus) - pmap_destroy_pcid_sync(p); + PMAP_UNLOCK(p); if (c != 0) {