From f681061dea85f03cc89f3ce82e7b66bdbd41382b Mon Sep 17 00:00:00 2001 From: Darwin Date: Thu, 2 Feb 2012 16:16:40 +0000 Subject: [PATCH] xnu-1699.24.23 Imported from https://opensource.apple.com/tarballs/xnu/xnu-1699.24.23.tar.gz --- bsd/crypto/aes/gen/aesopt.h | 17 - bsd/crypto/aes/i386/aes_modes_hw.s | 3245 +++++++++++++------------- bsd/crypto/aes/test/ReadMe.txt | 97 - bsd/crypto/aes/test/makegenx86.sh | 8 - bsd/crypto/aes/test/makeoptx86.sh | 10 - bsd/crypto/aes/test/tstaes.c | 131 -- bsd/hfs/hfs_cnode.c | 4 +- bsd/hfs/hfs_vfsops.c | 6 + bsd/kern/kern_sysctl.c | 6 + bsd/kern/mach_process.c | 4 - bsd/kern/uipc_syscalls.c | 13 +- bsd/libkern/libkern.h | 9 - bsd/net/ntstat.c | 11 +- bsd/netinet/in_cksum.c | 33 - bsd/netinet6/esp_input.c | 4 +- bsd/netinet6/in6_cksum.c | 86 - bsd/nfs/nfs_vfsops.c | 8 +- bsd/vfs/vfs_cluster.c | 2 +- config/MasterVersion | 2 +- kgmacros | 17 +- libkern/libkern/c++/OSMetaClass.h | 2 - libsyscall/wrappers/remove-counter.c | 8 - osfmk/i386/i386_lock.s | 2 - osfmk/vm/vm_map.c | 156 +- osfmk/vm/vm_map.h | 13 + osfmk/vm/vm_map_store.c | 1 + osfmk/vm/vm_object.c | 4 + osfmk/vm/vm_pageout.c | 73 +- osfmk/vm/vm_user.c | 39 +- osfmk/x86_64/idt64.s | 5 +- security/mac_base.c | 23 - 31 files changed, 1931 insertions(+), 2108 deletions(-) delete mode 100644 bsd/crypto/aes/test/ReadMe.txt delete mode 100755 bsd/crypto/aes/test/makegenx86.sh delete mode 100755 bsd/crypto/aes/test/makeoptx86.sh delete mode 100644 bsd/crypto/aes/test/tstaes.c diff --git a/bsd/crypto/aes/gen/aesopt.h b/bsd/crypto/aes/gen/aesopt.h index fc28e4a48..a00794865 100644 --- a/bsd/crypto/aes/gen/aesopt.h +++ b/bsd/crypto/aes/gen/aesopt.h @@ -283,9 +283,6 @@ assembler code routines for encryption and decryption with the C code only providing key scheduling */ -#if 0 && !defined(AES_ASM) -#define AES_ASM -#endif /* 3. BYTE ORDER WITHIN 32 BIT WORDS @@ -316,15 +313,7 @@ NOTE: Assembler code versions rely on PLATFORM_BYTE_ORDER being set */ -#if 1 || defined(AES_ASM) #define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER -#elif 0 -#define ALGORITHM_BYTE_ORDER BRG_LITTLE_ENDIAN -#elif 0 -#define ALGORITHM_BYTE_ORDER BRG_BIG_ENDIAN -#else -#error The algorithm byte order is not defined -#endif /* 4. FAST INPUT/OUTPUT OPERATIONS. @@ -342,9 +331,6 @@ assumed that access to byte arrays as if they are arrays of 32-bit words will not cause problems when such accesses are misaligned. */ -#if 0 && !defined(_MSC_VER) -#define SAFE_IO -#endif /* 5. LOOP UNROLLING @@ -429,9 +415,6 @@ it seems to sometimes cause trouble for the VC++ version 6 compiler. */ -#if 0 && defined(_MSC_VER) && (_MSC_VER >= 1300) -#define TABLE_ALIGN 64 -#endif /* 10. INTERNAL TABLE CONFIGURATION diff --git a/bsd/crypto/aes/i386/aes_modes_hw.s b/bsd/crypto/aes/i386/aes_modes_hw.s index c9702eaec..b9e35085c 100644 --- a/bsd/crypto/aes/i386/aes_modes_hw.s +++ b/bsd/crypto/aes/i386/aes_modes_hw.s @@ -1,1622 +1,1623 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. 
the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 31/01/2006 - - These subroutines implement multiple block AES modes for ECB, CBC, CFB, - OFB and CTR encryption, The code provides support for the VIA Advanced - Cryptography Engine (ACE). - - NOTE: In the following subroutines, the AES contexts (ctx) must be - 16 byte aligned if VIA ACE is being used -*/ - -/* ---------------------------------------------------------------------------------------------------------------- - - aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) : - - For simplicity, I am assuming all variables are in 128-bit data type. - - aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx) - { - while(num_blk--) { - *iv ^= *ibuf++; - aes_encrypt(iv, iv, ctx); - *obuf++ = *iv; - } - return 0; - } - - The following is an implementation of this function using Intel AESNI. - This function _aes_encrypt_cbc_hw SHOULD NOT be called directly. - Developer should still call _aes_encrypt_cbc (in aes_modes_asm.s) which will poll cpu_capabilities and branch - to this aesni-based function should it detecs that aesni is available. - Blindly call this function SURELY will cause a CRASH on systems with no aesni support. - - Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks - are serially chained. This prevents us from arranging several blocks for encryption in parallel. 
- - ----------------------------------------------------------------------------------------------------------------*/ - - .text - .align 4,0x90 - .globl _aes_encrypt_cbc_hw -_aes_encrypt_cbc_hw: - - // push/save registers for local use -#if defined __i386__ - - push %ebp - movl %esp, %ebp - push %ebx - push %edi - - #define sp %esp - -#else // __x86_64__ - - push %rbp - mov %rsp, %rbp - push %rbx - push %r13 - push %r14 - push %r15 - - #define sp %rsp - -#endif - - // if this is kernel code, need to save used xmm registers -#ifdef KERNEL - -#if defined __i386__ - sub $(8*16), %esp // for possible xmm0-xmm7 save/restore -#else - sub $(16*16), %rsp // xmm0-xmm15 save/restore -#endif - - movaps %xmm0, (sp) - movaps %xmm1, 16(sp) - movaps %xmm2, 32(sp) - movaps %xmm3, 48(sp) - movaps %xmm4, 64(sp) - movaps %xmm5, 80(sp) - movaps %xmm6, 96(sp) - movaps %xmm7, 112(sp) -#if defined __x86_64__ - movaps %xmm8, 16*8(sp) - movaps %xmm9, 16*9(sp) - movaps %xmm10, 16*10(sp) - movaps %xmm11, 16*11(sp) - movaps %xmm12, 16*12(sp) - movaps %xmm13, 16*13(sp) - movaps %xmm14, 16*14(sp) - movaps %xmm15, 16*15(sp) -#endif // __x86_64__ - -#endif // KERNEL - - #define iv %xmm0 - -#ifdef __i386__ - - mov 12(%ebp), %eax // in_iv - mov 24(%ebp), %edx // ctx - movups (%eax), iv // iv = in_iv - mov 8(%ebp), %ebx // ibuf - mov 16(%ebp), %ecx // num_blk - mov 20(%ebp), %edi // obuf - - #define ibuf %ebx - #define obuf %edi - #define num_blk %ecx - #define ctx %edx - -#else - - mov %rdi, %rbx // ibuf - movups (%rsi), iv // iv = in_iv - mov %rdx, %r13 // num_blk - mov %rcx, %r14 // obuf - mov %r8, %r15 // ctx - - #define ibuf %rbx - #define num_blk %r13d - #define obuf %r14 - #define ctx %r15 - -#endif - - mov 240(ctx), %eax // aes length - cmp $160, %eax // aes-128 encrypt ? - je L_encrypt_128 - cmp $192, %eax // aes-192 encrypt ? - je L_encrypt_192 - cmp $224, %eax // aes-256 encrypt ? 
- je L_encrypt_256 - mov $-1, %eax // return error - jmp L_error - - // - // aes-128 encrypt_cbc operation, up to L_HW_cbc_done - // - -L_encrypt_128: - - cmp $1, num_blk // check number of block - jl L_HW_cbc_done // should it be less than 1, nothing to do - - movups (ctx), %xmm2 // key0 - movups 16(ctx), %xmm3 // key1 - movups 32(ctx), %xmm4 // key2 - movups 48(ctx), %xmm5 // key3 - movups 64(ctx), %xmm6 // key4 - movups 80(ctx), %xmm7 // key5 -#if defined __x86_64__ - movups 96(ctx), %xmm8 // key6 - movups 112(ctx), %xmm9 // key7 - movups 128(ctx), %xmm10 // key8 - movups 144(ctx), %xmm11 // key9 - movups 160(ctx), %xmm12 // keyA -#endif - - // while (num_blk--) { - // *iv ^= *ibuf++; - // aes_encrypt(iv, iv, ctx); - // *obuf++ = *iv; - // } -0: - movups (ibuf), %xmm1 // *ibuf - pxor %xmm2, iv // 1st instruction inside aes_encrypt - pxor %xmm1, iv // *iv ^= *ibuf - - // finishing up the rest of aes_encrypt - aesenc %xmm3, iv - aesenc %xmm4, iv - aesenc %xmm5, iv - aesenc %xmm6, iv - aesenc %xmm7, iv -#if defined __x86_64__ - aesenc %xmm8, iv - aesenc %xmm9, iv - aesenc %xmm10, iv - aesenc %xmm11, iv - aesenclast %xmm12, iv -#else - movups 96(ctx), %xmm1 // key6 - aesenc %xmm1, iv - movups 112(ctx), %xmm1 // key7 - aesenc %xmm1, iv - movups 128(ctx), %xmm1 // key8 - aesenc %xmm1, iv - movups 144(ctx), %xmm1 // key9 - aesenc %xmm1, iv - movups 160(ctx), %xmm1 // keyA - aesenclast %xmm1, iv -#endif - - movups iv, (obuf) // *obuf = *iv; - add $16, obuf // obuf++; - add $16, ibuf // ibuf++; - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop - - // the following will be branched to from all other cases (encrypt/decrypt 128/192/256) - -L_HW_cbc_done: - - xor %eax, %eax // to return CRYPT_OK - -L_error: - - // if kernel, restore xmm registers -#ifdef KERNEL - movaps 0(sp), %xmm0 - movaps 16(sp), %xmm1 - movaps 32(sp), %xmm2 - movaps 48(sp), %xmm3 - movaps 64(sp), %xmm4 - movaps 80(sp), %xmm5 - movaps 96(sp), %xmm6 - movaps 112(sp), %xmm7 -#if defined __x86_64__ - movaps 16*8(sp), %xmm8 - movaps 16*9(sp), %xmm9 - movaps 16*10(sp), %xmm10 - movaps 16*11(sp), %xmm11 - movaps 16*12(sp), %xmm12 - movaps 16*13(sp), %xmm13 - movaps 16*14(sp), %xmm14 - movaps 16*15(sp), %xmm15 -#endif // __x86_64__ -#endif // KERNEL - - // release used stack memory, restore used callee-saved registers, and return -#if defined __i386__ -#ifdef KERNEL - add $(8*16), %esp -#endif - pop %edi - pop %ebx -#else -#ifdef KERNEL - add $(16*16), %rsp -#endif - pop %r15 - pop %r14 - pop %r13 - pop %rbx -#endif - leave - ret - - // - // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_encrypt_192: - - cmp $1, num_blk // check number of block - jl L_HW_cbc_done // should it be less than 1, nothing to do - - movups (ctx), %xmm2 // key0 - movups 16(ctx), %xmm3 // key1 - movups 32(ctx), %xmm4 // key2 - movups 48(ctx), %xmm5 // key3 - movups 64(ctx), %xmm6 // key4 - movups 80(ctx), %xmm7 // key5 -#if defined __x86_64__ - movups 96(ctx), %xmm8 // key6 - movups 112(ctx), %xmm9 // key7 - movups 128(ctx), %xmm10 // key8 - movups 144(ctx), %xmm11 // key9 - movups 160(ctx), %xmm12 // keyA - movups 176(ctx), %xmm13 // keyB - movups 192(ctx), %xmm14 // keyC -#endif - - // while (num_blk--) { - // *iv ^= *ibuf++; - // aes_encrypt(iv, iv, ctx); - // *obuf++ = *iv; - // } -0: - movups (ibuf), %xmm1 // *ibuf - pxor %xmm1, iv // *iv ^= ibuf - - // aes_encrypt(iv, iv, ctx); - - pxor %xmm2, iv - aesenc %xmm3, iv - aesenc %xmm4, iv - aesenc %xmm5, iv - aesenc %xmm6, iv - aesenc %xmm7, iv -#if 
defined __x86_64__ - aesenc %xmm8, iv - aesenc %xmm9, iv - aesenc %xmm10, iv - aesenc %xmm11, iv - aesenc %xmm12, iv - aesenc %xmm13, iv - aesenclast %xmm14, iv -#else - movups 96(ctx), %xmm1 - aesenc %xmm1, iv - movups 112(ctx), %xmm1 - aesenc %xmm1, iv - movups 128(ctx), %xmm1 - aesenc %xmm1, iv - movups 144(ctx), %xmm1 - aesenc %xmm1, iv - movups 160(ctx), %xmm1 - aesenc %xmm1, iv - movups 176(ctx), %xmm1 - aesenc %xmm1, iv - movups 192(ctx), %xmm1 - aesenclast %xmm1, iv -#endif - - movups iv, (obuf) // *obuf = *iv; - add $16, ibuf // ibuf++ - add $16, obuf // obuf++ - - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop - - jmp L_HW_cbc_done // share with the common exit code - - // - // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_encrypt_256: - - cmp $1, num_blk // check number of block - jl L_HW_cbc_done // should it be less than 1, nothing to do - - movups (ctx), %xmm2 // key0 - movups 16(ctx), %xmm3 // key1 - movups 32(ctx), %xmm4 // key2 - movups 48(ctx), %xmm5 // key3 - movups 64(ctx), %xmm6 // key4 - movups 80(ctx), %xmm7 // key5 -#if defined __x86_64__ - movups 96(ctx), %xmm8 // key6 - movups 112(ctx), %xmm9 // key7 - movups 128(ctx), %xmm10 // key8 - movups 144(ctx), %xmm11 // key9 - movups 160(ctx), %xmm12 // keyA - movups 176(ctx), %xmm13 // keyB - movups 192(ctx), %xmm14 // keyC - movups 208(ctx), %xmm15 // keyD - // movups 224(ctx), %xmm1 // keyE -#endif - - // while (num_blk--) { - // *iv ^= *ibuf++; - // aes_encrypt(iv, iv, ctx); - // *obuf++ = *iv; - // } -0: - movups (ibuf), %xmm1 // *ibuf - pxor %xmm1, iv // *iv ^= ibuf - - // aes_encrypt(iv, iv, ctx); - pxor %xmm2, iv - aesenc %xmm3, iv - aesenc %xmm4, iv - aesenc %xmm5, iv - aesenc %xmm6, iv - aesenc %xmm7, iv -#if defined __x86_64__ - movups 224(ctx), %xmm1 // keyE - aesenc %xmm8, iv - aesenc %xmm9, iv - aesenc %xmm10, iv - aesenc %xmm11, iv - aesenc %xmm12, iv - aesenc %xmm13, iv - aesenc %xmm14, iv - aesenc %xmm15, iv - aesenclast %xmm1, iv -#else - movups 96(ctx), %xmm1 // key6 - aesenc %xmm1, iv - movups 112(ctx), %xmm1 // key7 - aesenc %xmm1, iv - movups 128(ctx), %xmm1 // key8 - aesenc %xmm1, iv - movups 144(ctx), %xmm1 // key9 - aesenc %xmm1, iv - movups 160(ctx), %xmm1 // keyA - aesenc %xmm1, iv - movups 176(ctx), %xmm1 // keyB - aesenc %xmm1, iv - movups 192(ctx), %xmm1 // keyC - aesenc %xmm1, iv - movups 208(ctx), %xmm1 // keyD - aesenc %xmm1, iv - movups 224(ctx), %xmm1 // keyE - aesenclast %xmm1, iv -#endif - - movups iv, (obuf) // *obuf = *iv; - add $16, ibuf // ibuf++ - add $16, obuf // obuf++ - - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop - - jmp L_HW_cbc_done // share with the common exit code - - - - // - // --------- END of aes_encrypt_cbc_hw ------------------- - // - - -/* ---------------------------------------------------------------------------------------------------------------- - - aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) : - - For simplicity, I am assuming all variables are in 128-bit data type. - - aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx) - { - while(num_blk--) { - aes_decrypt(ibuf, obuf, ctx); - *obuf++ ^= *iv; - *iv = *ibuf++; - } - return 0; - } - - The following is an implementation of this function using Intel AESNI. - This function _aes_decrypt_cbc_hw SHOULD NOT be called directly. 
- Developer should still call _aes_decrypt_cbc (in aes_modes_asm.s) which will poll cpu_capabilities and branch - to this aesni-based function should it detecs that aesni is available. - Blindly call this function SURELY will cause a CRASH on systems with no aesni support. - - Note that the decryption operation is not related over blocks. - This gives opportunity of arranging aes_decrypt operations in parallel to speed up code. - This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55) - The following assembly code exploits this idea to achieve ~ 1.4 speed up in aes_decrypt_cbc. - - Example C code for packing 4 blocks in an iteration is shown as follows: - - while ((num_blk-=4)>=0) { - - // the following 4 functions can be interleaved to exploit parallelism - aes_decrypt(ibuf, obuf, ctx); - aes_decrypt(ibuf+1, obuf+1, ctx); - aes_decrypt(ibuf+2, obuf+2, ctx); - aes_decrypt(ibuf+3, obuf+3, ctx); - - obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; - *iv = ibuf[3]; ibuf += 4; obuf += 4; - } - num_blk+=4; - - ----------------------------------------------------------------------------------------------------------------*/ - - .text - .align 4,0x90 - .globl _aes_decrypt_cbc_hw -_aes_decrypt_cbc_hw: - - // push/save registers for local use -#if defined __i386__ - - push %ebp - movl %esp, %ebp - push %ebx // ibuf - push %edi // obuf - - #define sp %esp - -#else // __x86_64__ - - push %rbp - mov %rsp, %rbp - push %rbx - push %r13 - push %r14 - push %r15 - - #define sp %rsp - -#endif - - - // if kernel, allocate stack space to save xmm registers -#ifdef KERNEL -#if defined __i386__ - sub $(8*16), %esp -#else - sub $(16*16), %rsp -#endif - movaps %xmm0, (sp) - movaps %xmm1, 16(sp) - movaps %xmm2, 32(sp) - movaps %xmm3, 48(sp) - movaps %xmm4, 64(sp) - movaps %xmm5, 80(sp) - movaps %xmm6, 96(sp) - movaps %xmm7, 112(sp) -#if defined __x86_64__ - movaps %xmm8, 16*8(sp) - movaps %xmm9, 16*9(sp) - movaps %xmm10, 16*10(sp) - movaps %xmm11, 16*11(sp) - movaps %xmm12, 16*12(sp) - movaps %xmm13, 16*13(sp) - movaps %xmm14, 16*14(sp) - movaps %xmm15, 16*15(sp) -#endif // __x86_64__ -#endif - - #undef iv - #define iv %xmm0 - -#if defined __i386__ - mov 12(%ebp), %eax // in_iv - mov 24(%ebp), %edx // ctx - movups (%eax), iv // iv = in_iv - mov 8(%ebp), %ebx // ibuf - mov 16(%ebp), %ecx // num_blk - mov 20(%ebp), %edi // obuf - - #define ibuf %ebx - #define obuf %edi - #define num_blk %ecx - #define ctx %edx - -#else // __x86_64__, rdi/rsi/rdx/rcx/r8 - - mov %rdi, %rbx // ibuf - movups (%rsi), iv // iv = in_iv - mov %rdx, %r13 // num_blk - mov %rcx, %r14 // obuf - mov %r8, %r15 // ctx - - #define ibuf %rbx - #define num_blk %r13d - #define obuf %r14 - #define ctx %r15 - -#endif - - mov 240(ctx), %eax // aes length - cmp $160, %eax // aes-128 decrypt - je L_decrypt_128 - cmp $192, %eax // aes-192 decrypt - je L_decrypt_192 - cmp $224, %eax // aes-256 decrypt - je L_decrypt_256 - - mov $-1, %eax // wrong aes length, to return -1 - jmp L_error // early exit due to wrong aes length - - - // - // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_decrypt_128: - - cmp $1, num_blk - jl L_HW_cbc_done // if num_blk < 1, early return - - // aes-128 decrypt expanded keys - movups 160(ctx), %xmm3 - movups 144(ctx), %xmm4 - movups 128(ctx), %xmm5 - movups 112(ctx), %xmm6 - movups 96(ctx), %xmm7 -#if defined __x86_64__ - movups 80(ctx), %xmm8 - movups 64(ctx), %xmm9 - movups 48(ctx), %xmm10 - movups 32(ctx), 
%xmm11 - movups 16(ctx), %xmm12 - movups 0(ctx), %xmm13 -#endif - - // performs 4 block decryption in an iteration to exploit decrypt in parallel - - // while ((num_blk-=4)>=0) { - // aes_decrypt(ibuf, obuf, ctx); - // aes_decrypt(ibuf+1, obuf+1, ctx); - // aes_decrypt(ibuf+2, obuf+2, ctx); - // aes_decrypt(ibuf+3, obuf+3, ctx); - // obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; - // *iv = ibuf[3]; ibuf += 4; obuf += 4; - // } - - sub $4, num_blk // pre decrement num_blk by 4 - jl 9f // if num_blk < 4, skip the per-4-blocks processing code - -0: - - -#if defined __x86_64__ - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm14 // tmp = 3rd ibuf - movups 48(ibuf), %xmm15 // tmp = 4th ibuf - - // for x86_64, the expanded keys are already stored in xmm3-xmm13 - - // aes-128 decrypt round 0 per 4 blocks - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm14 - pxor %xmm3, %xmm15 - - // aes-128 decrypt round 1 per 4 blocks - aesdec %xmm4, %xmm1 - aesdec %xmm4, %xmm2 - aesdec %xmm4, %xmm14 - aesdec %xmm4, %xmm15 - - // aes-128 decrypt round 2 per 4 blocks - aesdec %xmm5, %xmm1 - aesdec %xmm5, %xmm2 - aesdec %xmm5, %xmm14 - aesdec %xmm5, %xmm15 - - // aes-128 decrypt round 3 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm14 - aesdec %xmm6, %xmm15 - - // aes-128 decrypt round 4 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm14 - aesdec %xmm7, %xmm15 - - // aes-128 decrypt round 5 per 4 blocks - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm14 - aesdec %xmm8, %xmm15 - - // aes-128 decrypt round 6 per 4 blocks - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm14 - aesdec %xmm9, %xmm15 - - // aes-128 decrypt round 7 per 4 blocks - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm14 - aesdec %xmm10, %xmm15 - - // aes-128 decrypt round 8 per 4 blocks - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm14 - aesdec %xmm11, %xmm15 - - // aes-128 decrypt round 9 per 4 blocks - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - - // aes-128 decrypt round 10 (last) per 4 blocks - aesdeclast %xmm13, %xmm1 - aesdeclast %xmm13, %xmm2 - aesdeclast %xmm13, %xmm14 - aesdeclast %xmm13, %xmm15 - - pxor iv, %xmm1 // obuf[0] ^= *iv; - movups (ibuf), iv // ibuf[0] - pxor iv, %xmm2 // obuf[1] ^= ibuf[0]; - movups 16(ibuf), iv // ibuf[1] - pxor iv, %xmm14 // obuf[2] ^= ibuf[1]; - movups 32(ibuf), iv // ibuf[2] - pxor iv, %xmm15 // obuf[3] ^= obuf[2]; - movups 48(ibuf), iv // *iv = ibuf[3] - - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm14, 32(obuf) // write 3rd obuf - movups %xmm15, 48(obuf) // write 4th obuf - - -#else - - // aes_decrypt_cbc per 4 blocks using aes-128 for i386 - // xmm1/xmm2/xmm4/xmm5 used for obuf per block - // xmm3 = key0 - // xmm0 = iv - // xmm6/xmm7 dynamically load with other expanded keys - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm4 // tmp = 3rd ibuf - movups 48(ibuf), %xmm5 // tmp = 4th ibuf - - // aes_decrypt - // for i386, sequentially load expanded keys into xmm6/xmm7 - - movups 144(ctx), %xmm6 // key1 - - // aes-128 decrypt round 0 per 4 blocks - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm4 - pxor %xmm3, %xmm5 - - movups 128(ctx), %xmm7 // key2 - - // aes-128 decrypt round 1 per 4 blocks - aesdec %xmm6, %xmm1 - 
aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 112(ctx), %xmm6 // key3 - - // aes-128 decrypt round 2 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 96(ctx), %xmm7 // key4 - - // aes-128 decrypt round 3 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 80(ctx), %xmm6 // key5 - - // aes-128 decrypt round 4 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 64(ctx), %xmm7 // key6 - - // aes-128 decrypt round 5 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 48(ctx), %xmm6 // key7 - - // aes-128 decrypt round 6 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 32(ctx), %xmm7 // key8 - - // aes-128 decrypt round 7 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 16(ctx), %xmm6 // key9 - - // aes-128 decrypt round 8 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 0(ctx), %xmm7 // keyA - - // aes-128 decrypt round 9 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - // aes-128 decrypt round 10 (last) per 4 blocks - aesdeclast %xmm7, %xmm1 - aesdeclast %xmm7, %xmm2 - aesdeclast %xmm7, %xmm4 - aesdeclast %xmm7, %xmm5 - - pxor iv, %xmm1 // 1st obuf ^= iv; - movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm2 // 2nd obuf ^= iv; - movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm4 // 3rd obuf ^= iv; - movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm5 // 4th obuf ^= iv; - movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); - - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm4, 32(obuf) // write 3rd obuf - movups %xmm5, 48(obuf) // write 4th obuf -#endif - - add $64, ibuf // ibuf += 4; - add $64, obuf // obuf += 4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk > 0, repeat the loop - -9: add $4, num_blk // post incremtn num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code - -#if defined __i386__ - // updated as they might be needed as expanded keys in the remaining - movups 144(ctx), %xmm4 - movups 128(ctx), %xmm5 - movups 112(ctx), %xmm6 - movups 96(ctx), %xmm7 -#endif - - test $2, num_blk // check whether num_blk has 2 blocks - je 9f // if num_blk & 2 == 0, skip the per-pair processing code - - // do the remaining 2 blocks together - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - - // aes_decrypt - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - aesdec %xmm4, %xmm1 - aesdec %xmm4, %xmm2 - aesdec %xmm5, %xmm1 - aesdec %xmm5, %xmm2 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 -#if defined __x86_64__ - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdeclast %xmm13, %xmm1 - aesdeclast %xmm13, %xmm2 -#else - movups 80(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - movups 64(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - movups 48(ctx), %xmm6 - 
aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - movups 32(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - movups 16(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - movups 0(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdeclast %xmm7, %xmm1 - aesdeclast %xmm7, %xmm2 - movups 112(ctx), %xmm6 - movups 96(ctx), %xmm7 -#endif - - pxor iv, %xmm1 // obuf[0] ^= *iv; - movups (ibuf), iv // ibuf[0] - pxor iv, %xmm2 // obuf[1] ^= ibuf[0] - movups 16(ibuf), iv // *iv = ibuf[1] - - movups %xmm1, (obuf) // write obuf[0] - movups %xmm2, 16(obuf) // write obuf[1] - - add $32, ibuf // ibuf += 2 - add $32, obuf // obuf += 2 - -9: - test $1, num_blk // check whether num_blk has residual 1 block - je L_HW_cbc_done // if num_blk == 0, no need for residual processing code - - movups (ibuf), %xmm2 // tmp = ibuf - // aes_decrypt - pxor %xmm3, %xmm2 - aesdec %xmm4, %xmm2 - aesdec %xmm5, %xmm2 - aesdec %xmm6, %xmm2 - aesdec %xmm7, %xmm2 -#if defined __x86_64__ - aesdec %xmm8, %xmm2 - aesdec %xmm9, %xmm2 - aesdec %xmm10, %xmm2 - aesdec %xmm11, %xmm2 - aesdec %xmm12, %xmm2 - aesdeclast %xmm13, %xmm2 -#else - movups 80(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 64(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 48(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 32(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 16(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups (ctx), %xmm1 - aesdeclast %xmm1, %xmm2 -#endif - - pxor iv, %xmm2 // *obuf ^= *iv; - movups (ibuf), iv // *iv = *ibuf; - movups %xmm2, (obuf) // write *obuf - - jmp L_HW_cbc_done - - // - // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_decrypt_192: - - cmp $1, num_blk - jl L_HW_cbc_done // if num_blk < 1, early return - - // aes-192 decryp expanded keys - movups 192(ctx), %xmm3 - movups 176(ctx), %xmm4 - movups 160(ctx), %xmm5 - movups 144(ctx), %xmm6 - movups 128(ctx), %xmm7 -#if defined __x86_64__ - movups 112(ctx), %xmm8 - movups 96(ctx), %xmm9 - movups 80(ctx), %xmm10 - movups 64(ctx), %xmm11 - movups 48(ctx), %xmm12 - movups 32(ctx), %xmm13 - movups 16(ctx), %xmm14 - movups (ctx), %xmm15 -#endif - - // performs 4 block decryption in an iteration to exploit decrypt in parallel - - // while ((num_blk-=4)>=0) { - // aes_decrypt(ibuf, obuf, ctx); - // aes_decrypt(ibuf+1, obuf+1, ctx); - // aes_decrypt(ibuf+2, obuf+2, ctx); - // aes_decrypt(ibuf+3, obuf+3, ctx); - // obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; - // *iv = ibuf[3]; ibuf += 4; obuf += 4; - // } - - sub $4, num_blk // pre decrement num_blk by 4 - jl 9f // if num_blk < 4, skip the per-4-blocks processing code -0: - -#if defined __x86_64__ - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm14 // tmp = 3rd ibuf - movups 48(ibuf), %xmm15 // tmp = 4th ibuf - - // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13 - // use %xmm12/%xmm13 ts dynamic keys in the middle, restored afterwards - - // round 0 for 4 blocks - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm14 - pxor %xmm3, %xmm15 - - // round 1 for 4 blocks - aesdec %xmm4, %xmm1 - aesdec %xmm4, %xmm2 - aesdec %xmm4, %xmm14 - aesdec %xmm4, %xmm15 - - // round 2 for 4 blocks - aesdec %xmm5, %xmm1 - aesdec %xmm5, %xmm2 - aesdec %xmm5, %xmm14 - aesdec %xmm5, %xmm15 - - // round 3 for 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm14 - aesdec %xmm6, %xmm15 - - // round 4 for 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm14 - aesdec 
%xmm7, %xmm15 - - // round 5 for 4 blocks - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm14 - aesdec %xmm8, %xmm15 - - // round 6 for 4 blocks - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm14 - aesdec %xmm9, %xmm15 - - // round 7 for 4 blocks - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm14 - aesdec %xmm10, %xmm15 - - // round 8 for 4 blocks - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm14 - aesdec %xmm11, %xmm15 - - // round 9 for 4 blocks - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - - movups 16(ctx), %xmm12 - - // round A for 4 blocks - aesdec %xmm13, %xmm1 - aesdec %xmm13, %xmm2 - aesdec %xmm13, %xmm14 - aesdec %xmm13, %xmm15 - - movups (ctx), %xmm13 - - // round B for 4 blocks - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - - movups 48(ctx), %xmm12 // restore %xmm12 to its original key - - // round C (last) for 4 blocks - aesdeclast %xmm13, %xmm1 - aesdeclast %xmm13, %xmm2 - aesdeclast %xmm13, %xmm14 - aesdeclast %xmm13, %xmm15 - - movups 32(ctx), %xmm13 // restore %xmm13 to its original key - - pxor iv, %xmm1 // obuf[0] ^= *iv; - movups (ibuf), iv // ibuf[0] - pxor iv, %xmm2 // obuf[1] ^= ibuf[0] - movups 16(ibuf), iv // ibuf[1] - pxor iv, %xmm14 // obuf[2] ^= ibuf[1] - movups 32(ibuf), iv // ibuf[2] - pxor iv, %xmm15 // obuf[3] ^= ibuf[2] - movups 48(ibuf), iv // *iv = ibuf[3] - - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm14, 32(obuf) // write 3rd obuf - movups %xmm15, 48(obuf) // write 4th obuf - - add $64, ibuf // ibuf += 4; - add $64, obuf // obuf += 4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk > 0, repeat the loop - -9: add $4, num_blk // post incremtn num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, prepare to return - - movups 16(ctx), %xmm14 // restore %xmm14 to its key - movups (ctx), %xmm15 // restore %xmm15 to its key - -#else - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm4 // tmp = 3rd ibuf - movups 48(ibuf), %xmm5 // tmp = 4th ibuf - - // aes_decrypt - // for i386, sequentially load expanded keys into xmm6/xmm7 - movups 176(ctx), %xmm6 - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm4 - pxor %xmm3, %xmm5 - - movups 160(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 144(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 128(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 112(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 96(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 80(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 64(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 48(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 32(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 16(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 0(ctx), %xmm7 - 
aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - aesdeclast %xmm7, %xmm1 - aesdeclast %xmm7, %xmm2 - aesdeclast %xmm7, %xmm4 - aesdeclast %xmm7, %xmm5 - - pxor iv, %xmm1 // 1st obuf ^= iv; - movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm2 // 2nd obuf ^= iv; - movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm4 // 3rd obuf ^= iv; - movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm5 // 4th obuf ^= iv; - movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm4, 32(obuf) // write 3rd obuf - movups %xmm5, 48(obuf) // write 4th obuf - - add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; - add $64, obuf // obuf += AES_BLOCK_SIZE * 4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk > 0, repeat the loop - - -9: add $4, num_blk // post incremtn num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code - - movups 176(ctx), %xmm4 - movups 160(ctx), %xmm5 - movups 144(ctx), %xmm6 - movups 128(ctx), %xmm7 - -#endif - - // per-block aes_decrypt_cbc loop - -0: - movups (ibuf), %xmm2 // tmp = ibuf - - // aes_decrypt - pxor %xmm3, %xmm2 - aesdec %xmm4, %xmm2 - aesdec %xmm5, %xmm2 - aesdec %xmm6, %xmm2 - aesdec %xmm7, %xmm2 -#if defined __x86_64__ - aesdec %xmm8, %xmm2 - aesdec %xmm9, %xmm2 - aesdec %xmm10, %xmm2 - aesdec %xmm11, %xmm2 - aesdec %xmm12, %xmm2 - aesdec %xmm13, %xmm2 - aesdec %xmm14, %xmm2 - aesdeclast %xmm15, %xmm2 -#else - movups 112(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 96(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 80(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 64(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 48(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 32(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 16(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups (ctx), %xmm1 - aesdeclast %xmm1, %xmm2 -#endif - - pxor iv, %xmm2 // obuf ^= iv; - movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - - movups %xmm2, (obuf) // write obuf - - add $16, ibuf // ibuf += AES_BLOCK_SIZE; - add $16, obuf // obuf += AES_BLOCK_SIZE; - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop - - jmp L_HW_cbc_done - - // - // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_decrypt_256: - - cmp $1, num_blk - jl L_HW_cbc_done - - movups 224(ctx), %xmm3 - movups 208(ctx), %xmm4 - movups 192(ctx), %xmm5 - movups 176(ctx), %xmm6 - movups 160(ctx), %xmm7 -#if defined __x86_64__ - movups 144(ctx), %xmm8 - movups 128(ctx), %xmm9 - movups 112(ctx), %xmm10 - movups 96(ctx), %xmm11 - movups 80(ctx), %xmm12 - movups 64(ctx), %xmm13 - movups 48(ctx), %xmm14 - movups 32(ctx), %xmm15 -// movups 16(ctx), %xmm14 -// movups (ctx), %xmm15 -#endif - -#if defined __x86_64__ - - sub $4, num_blk // pre decrement num_blk by 4 - jl 9f // if num_blk < 4, skip the per-4-blocks processing code -0: - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm14 // tmp = 3rd ibuf - movups 48(ibuf), %xmm15 // tmp = 4th ibuf - - // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13 - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm14 - pxor %xmm3, %xmm15 - - aesdec %xmm4, %xmm1 - aesdec %xmm4, %xmm2 - aesdec %xmm4, %xmm14 - aesdec %xmm4, %xmm15 - - aesdec %xmm5, %xmm1 - aesdec %xmm5, %xmm2 - aesdec %xmm5, %xmm14 - aesdec %xmm5, %xmm15 - - aesdec %xmm6, %xmm1 - aesdec %xmm6, 
%xmm2 - aesdec %xmm6, %xmm14 - aesdec %xmm6, %xmm15 - - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm14 - aesdec %xmm7, %xmm15 - - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm14 - aesdec %xmm8, %xmm15 - - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm14 - aesdec %xmm9, %xmm15 - - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm14 - aesdec %xmm10, %xmm15 - - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm14 - aesdec %xmm11, %xmm15 - - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - movups 48(ctx), %xmm12 - - aesdec %xmm13, %xmm1 - aesdec %xmm13, %xmm2 - aesdec %xmm13, %xmm14 - aesdec %xmm13, %xmm15 - movups 32(ctx), %xmm13 - - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - movups 16(ctx), %xmm12 - - aesdec %xmm13, %xmm1 - aesdec %xmm13, %xmm2 - aesdec %xmm13, %xmm14 - aesdec %xmm13, %xmm15 - movups (ctx), %xmm13 - - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - movups 80(ctx), %xmm12 - - aesdeclast %xmm13, %xmm1 - aesdeclast %xmm13, %xmm2 - aesdeclast %xmm13, %xmm14 - aesdeclast %xmm13, %xmm15 - movups 64(ctx), %xmm13 - - pxor iv, %xmm1 // obuf ^= iv; - movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm2 // obuf ^= iv; - movups 16(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm14 // obuf ^= iv; - movups 32(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm15 // obuf ^= iv; - movups 48(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm14, 32(obuf) // write 3rd obuf - movups %xmm15, 48(obuf) // write 4th obuf - - add $64, ibuf // ibuf += AES_BLOCK_SIZE*4; - add $64, obuf // obuf += AES_BLOCK_SIZE*4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk > 0, repeat the loop - -9: add $4, num_blk // post incremtn num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code - - movups 48(ctx), %xmm14 - movups 32(ctx), %xmm15 - -#else - - sub $4, num_blk // pre decrement num_blk by 4 - jl 9f // if num_blk < 4, skip the per-pair processing code -0: - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm4 // tmp = 3rd ibuf - movups 48(ibuf), %xmm5 // tmp = 4th ibuf - - // aes_decrypt - // for i386, sequentially load expanded keys into xmm6/xmm7 - movups 208(ctx), %xmm6 - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm4 - pxor %xmm3, %xmm5 - - movups 192(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 176(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 160(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 144(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 128(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 112(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 96(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 80(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - 
movups 64(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 48(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 32(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 16(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 0(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - aesdeclast %xmm7, %xmm1 - aesdeclast %xmm7, %xmm2 - aesdeclast %xmm7, %xmm4 - aesdeclast %xmm7, %xmm5 - - pxor iv, %xmm1 // 1st obuf ^= iv; - movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm2 // 2nd obuf ^= iv; - movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm4 // 3rd obuf ^= iv; - movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm5 // 4th obuf ^= iv; - movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm4, 32(obuf) // write 3rd obuf - movups %xmm5, 48(obuf) // write 4th obuf - - add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; - add $64, obuf // obuf += AES_BLOCK_SIZE * 4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk > 0, repeat the loop - - -9: add $4, num_blk // post incremtn num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code - - movups 208(ctx), %xmm4 - movups 192(ctx), %xmm5 - movups 176(ctx), %xmm6 - movups 160(ctx), %xmm7 - -#endif - -0: - movups (ibuf), %xmm2 // tmp = ibuf - - // aes_decrypt - pxor %xmm3, %xmm2 - aesdec %xmm4, %xmm2 - aesdec %xmm5, %xmm2 - aesdec %xmm6, %xmm2 - aesdec %xmm7, %xmm2 -#if defined __x86_64__ - aesdec %xmm8, %xmm2 - aesdec %xmm9, %xmm2 - aesdec %xmm10, %xmm2 - aesdec %xmm11, %xmm2 - aesdec %xmm12, %xmm2 - aesdec %xmm13, %xmm2 - aesdec %xmm14, %xmm2 - aesdec %xmm15, %xmm2 -#else - movups 144(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 128(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 112(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 96(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 80(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 64(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 48(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 32(ctx), %xmm1 - aesdec %xmm1, %xmm2 -#endif - movups 16(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups (ctx), %xmm1 - aesdeclast %xmm1, %xmm2 - - pxor iv, %xmm2 // obuf ^= iv; - movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - - movups %xmm2, (obuf) // write obuf - - add $16, ibuf // ibuf += AES_BLOCK_SIZE; - add $16, obuf // obuf += AES_BLOCK_SIZE; - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop - - jmp L_HW_cbc_done - - // - // --------- END of aes_decrypt_cbc_hw ------------------- - // +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. 
the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 31/01/2006 + + These subroutines implement multiple block AES modes for ECB, CBC, CFB, + OFB and CTR encryption. The code provides support for the VIA Advanced + Cryptography Engine (ACE). + + NOTE: In the following subroutines, the AES contexts (ctx) must be + 16 byte aligned if VIA ACE is being used +*/ + + +/* ---------------------------------------------------------------------------------------------------------------- + + aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) : + + For simplicity, I am assuming all variables are in 128-bit data type. + + aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx) + { + while(num_blk--) { + *iv ^= *ibuf++; + aes_encrypt(iv, iv, ctx); + *obuf++ = *iv; + } + return 0; + } + + The following is an implementation of this function using Intel AESNI. + This function _aes_encrypt_cbc_hw SHOULD NOT be called directly. + Developers should still call _aes_encrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch + to this aesni-based function should it detect that aesni is available. + Blindly calling this function will surely cause a CRASH on systems with no aesni support. + + Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks + are serially chained. This prevents us from arranging several blocks for encryption in parallel.
+ + ----------------------------------------------------------------------------------------------------------------*/ + + .text + .align 4,0x90 + .globl _aes_encrypt_cbc_hw +_aes_encrypt_cbc_hw: + + // push/save registers for local use +#if defined __i386__ + + push %ebp + movl %esp, %ebp + push %ebx + push %edi + + #define sp %esp + +#else // __x86_64__ + + push %rbp + mov %rsp, %rbp + push %rbx + push %r13 + push %r14 + push %r15 + + #define sp %rsp + +#endif + + // if this is kernel code, need to save used xmm registers +#ifdef KERNEL + +#if defined __i386__ + sub $(8*16), %esp // for possible xmm0-xmm7 save/restore +#else + sub $(16*16), %rsp // xmm0-xmm15 save/restore +#endif + + movaps %xmm0, (sp) + movaps %xmm1, 16(sp) + movaps %xmm2, 32(sp) + movaps %xmm3, 48(sp) + movaps %xmm4, 64(sp) + movaps %xmm5, 80(sp) + movaps %xmm6, 96(sp) + movaps %xmm7, 112(sp) +#if defined __x86_64__ + movaps %xmm8, 16*8(sp) + movaps %xmm9, 16*9(sp) + movaps %xmm10, 16*10(sp) + movaps %xmm11, 16*11(sp) + movaps %xmm12, 16*12(sp) + movaps %xmm13, 16*13(sp) + movaps %xmm14, 16*14(sp) + movaps %xmm15, 16*15(sp) +#endif // __x86_64__ + +#endif // KERNEL + + #define iv %xmm0 + +#ifdef __i386__ + + mov 12(%ebp), %eax // in_iv + mov 24(%ebp), %edx // ctx + movups (%eax), iv // iv = in_iv + mov 8(%ebp), %ebx // ibuf + mov 16(%ebp), %ecx // num_blk + mov 20(%ebp), %edi // obuf + + #define ibuf %ebx + #define obuf %edi + #define num_blk %ecx + #define ctx %edx + +#else + + mov %rdi, %rbx // ibuf + movups (%rsi), iv // iv = in_iv + mov %rdx, %r13 // num_blk + mov %rcx, %r14 // obuf + mov %r8, %r15 // ctx + + #define ibuf %rbx + #define num_blk %r13d + #define obuf %r14 + #define ctx %r15 + +#endif + + mov 240(ctx), %eax // aes length + cmp $160, %eax // aes-128 encrypt ? + je L_encrypt_128 + cmp $192, %eax // aes-192 encrypt ? + je L_encrypt_192 + cmp $224, %eax // aes-256 encrypt ? 
+ je L_encrypt_256 + mov $-1, %eax // return error + jmp L_error + + // + // aes-128 encrypt_cbc operation, up to L_HW_cbc_done + // + +L_encrypt_128: + + cmp $1, num_blk // check number of block + jl L_HW_cbc_done // should it be less than 1, nothing to do + + movups (ctx), %xmm2 // key0 + movups 16(ctx), %xmm3 // key1 + movups 32(ctx), %xmm4 // key2 + movups 48(ctx), %xmm5 // key3 + movups 64(ctx), %xmm6 // key4 + movups 80(ctx), %xmm7 // key5 +#if defined __x86_64__ + movups 96(ctx), %xmm8 // key6 + movups 112(ctx), %xmm9 // key7 + movups 128(ctx), %xmm10 // key8 + movups 144(ctx), %xmm11 // key9 + movups 160(ctx), %xmm12 // keyA +#endif + + // while (num_blk--) { + // *iv ^= *ibuf++; + // aes_encrypt(iv, iv, ctx); + // *obuf++ = *iv; + // } +0: + movups (ibuf), %xmm1 // *ibuf + pxor %xmm2, iv // 1st instruction inside aes_encrypt + pxor %xmm1, iv // *iv ^= *ibuf + + // finishing up the rest of aes_encrypt + aesenc %xmm3, iv + aesenc %xmm4, iv + aesenc %xmm5, iv + aesenc %xmm6, iv + aesenc %xmm7, iv +#if defined __x86_64__ + aesenc %xmm8, iv + aesenc %xmm9, iv + aesenc %xmm10, iv + aesenc %xmm11, iv + aesenclast %xmm12, iv +#else + movups 96(ctx), %xmm1 // key6 + aesenc %xmm1, iv + movups 112(ctx), %xmm1 // key7 + aesenc %xmm1, iv + movups 128(ctx), %xmm1 // key8 + aesenc %xmm1, iv + movups 144(ctx), %xmm1 // key9 + aesenc %xmm1, iv + movups 160(ctx), %xmm1 // keyA + aesenclast %xmm1, iv +#endif + + movups iv, (obuf) // *obuf = *iv; + add $16, obuf // obuf++; + add $16, ibuf // ibuf++; + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + // the following will be branched to from all other cases (encrypt/decrypt 128/192/256) + +L_HW_cbc_done: + + xor %eax, %eax // to return CRYPT_OK + +L_error: + + // if kernel, restore xmm registers +#ifdef KERNEL + movaps 0(sp), %xmm0 + movaps 16(sp), %xmm1 + movaps 32(sp), %xmm2 + movaps 48(sp), %xmm3 + movaps 64(sp), %xmm4 + movaps 80(sp), %xmm5 + movaps 96(sp), %xmm6 + movaps 112(sp), %xmm7 +#if defined __x86_64__ + movaps 16*8(sp), %xmm8 + movaps 16*9(sp), %xmm9 + movaps 16*10(sp), %xmm10 + movaps 16*11(sp), %xmm11 + movaps 16*12(sp), %xmm12 + movaps 16*13(sp), %xmm13 + movaps 16*14(sp), %xmm14 + movaps 16*15(sp), %xmm15 +#endif // __x86_64__ +#endif // KERNEL + + // release used stack memory, restore used callee-saved registers, and return +#if defined __i386__ +#ifdef KERNEL + add $(8*16), %esp +#endif + pop %edi + pop %ebx +#else +#ifdef KERNEL + add $(16*16), %rsp +#endif + pop %r15 + pop %r14 + pop %r13 + pop %rbx +#endif + leave + ret + + // + // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_encrypt_192: + + cmp $1, num_blk // check number of block + jl L_HW_cbc_done // should it be less than 1, nothing to do + + movups (ctx), %xmm2 // key0 + movups 16(ctx), %xmm3 // key1 + movups 32(ctx), %xmm4 // key2 + movups 48(ctx), %xmm5 // key3 + movups 64(ctx), %xmm6 // key4 + movups 80(ctx), %xmm7 // key5 +#if defined __x86_64__ + movups 96(ctx), %xmm8 // key6 + movups 112(ctx), %xmm9 // key7 + movups 128(ctx), %xmm10 // key8 + movups 144(ctx), %xmm11 // key9 + movups 160(ctx), %xmm12 // keyA + movups 176(ctx), %xmm13 // keyB + movups 192(ctx), %xmm14 // keyC +#endif + + // while (num_blk--) { + // *iv ^= *ibuf++; + // aes_encrypt(iv, iv, ctx); + // *obuf++ = *iv; + // } +0: + movups (ibuf), %xmm1 // *ibuf + pxor %xmm1, iv // *iv ^= ibuf + + // aes_encrypt(iv, iv, ctx); + + pxor %xmm2, iv + aesenc %xmm3, iv + aesenc %xmm4, iv + aesenc %xmm5, iv + aesenc %xmm6, iv + aesenc %xmm7, iv +#if 
defined __x86_64__ + aesenc %xmm8, iv + aesenc %xmm9, iv + aesenc %xmm10, iv + aesenc %xmm11, iv + aesenc %xmm12, iv + aesenc %xmm13, iv + aesenclast %xmm14, iv +#else + movups 96(ctx), %xmm1 + aesenc %xmm1, iv + movups 112(ctx), %xmm1 + aesenc %xmm1, iv + movups 128(ctx), %xmm1 + aesenc %xmm1, iv + movups 144(ctx), %xmm1 + aesenc %xmm1, iv + movups 160(ctx), %xmm1 + aesenc %xmm1, iv + movups 176(ctx), %xmm1 + aesenc %xmm1, iv + movups 192(ctx), %xmm1 + aesenclast %xmm1, iv +#endif + + movups iv, (obuf) // *obuf = *iv; + add $16, ibuf // ibuf++ + add $16, obuf // obuf++ + + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + jmp L_HW_cbc_done // share with the common exit code + + // + // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_encrypt_256: + + cmp $1, num_blk // check number of block + jl L_HW_cbc_done // should it be less than 1, nothing to do + + movups (ctx), %xmm2 // key0 + movups 16(ctx), %xmm3 // key1 + movups 32(ctx), %xmm4 // key2 + movups 48(ctx), %xmm5 // key3 + movups 64(ctx), %xmm6 // key4 + movups 80(ctx), %xmm7 // key5 +#if defined __x86_64__ + movups 96(ctx), %xmm8 // key6 + movups 112(ctx), %xmm9 // key7 + movups 128(ctx), %xmm10 // key8 + movups 144(ctx), %xmm11 // key9 + movups 160(ctx), %xmm12 // keyA + movups 176(ctx), %xmm13 // keyB + movups 192(ctx), %xmm14 // keyC + movups 208(ctx), %xmm15 // keyD + // movups 224(ctx), %xmm1 // keyE +#endif + + // while (num_blk--) { + // *iv ^= *ibuf++; + // aes_encrypt(iv, iv, ctx); + // *obuf++ = *iv; + // } +0: + movups (ibuf), %xmm1 // *ibuf + pxor %xmm1, iv // *iv ^= ibuf + + // aes_encrypt(iv, iv, ctx); + pxor %xmm2, iv + aesenc %xmm3, iv + aesenc %xmm4, iv + aesenc %xmm5, iv + aesenc %xmm6, iv + aesenc %xmm7, iv +#if defined __x86_64__ + movups 224(ctx), %xmm1 // keyE + aesenc %xmm8, iv + aesenc %xmm9, iv + aesenc %xmm10, iv + aesenc %xmm11, iv + aesenc %xmm12, iv + aesenc %xmm13, iv + aesenc %xmm14, iv + aesenc %xmm15, iv + aesenclast %xmm1, iv +#else + movups 96(ctx), %xmm1 // key6 + aesenc %xmm1, iv + movups 112(ctx), %xmm1 // key7 + aesenc %xmm1, iv + movups 128(ctx), %xmm1 // key8 + aesenc %xmm1, iv + movups 144(ctx), %xmm1 // key9 + aesenc %xmm1, iv + movups 160(ctx), %xmm1 // keyA + aesenc %xmm1, iv + movups 176(ctx), %xmm1 // keyB + aesenc %xmm1, iv + movups 192(ctx), %xmm1 // keyC + aesenc %xmm1, iv + movups 208(ctx), %xmm1 // keyD + aesenc %xmm1, iv + movups 224(ctx), %xmm1 // keyE + aesenclast %xmm1, iv +#endif + + movups iv, (obuf) // *obuf = *iv; + add $16, ibuf // ibuf++ + add $16, obuf // obuf++ + + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + jmp L_HW_cbc_done // share with the common exit code + + + + // + // --------- END of aes_encrypt_cbc_hw ------------------- + // + + +/* ---------------------------------------------------------------------------------------------------------------- + + aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) : + + For simplicity, I am assuming all variables are in 128-bit data type. + + aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx) + { + while(num_blk--) { + aes_decrypt(ibuf, obuf, ctx); + *obuf++ ^= *iv; + *iv = *ibuf++; + } + return 0; + } + + The following is an implementation of this function using Intel AESNI. + This function _aes_decrypt_cbc_hw SHOULD NOT be called directly. 
+ Developer should still call _aes_decrypt_cbc (in aes_modes_asm.s) which will poll cpu_capabilities and branch + to this aesni-based function should it detecs that aesni is available. + Blindly call this function SURELY will cause a CRASH on systems with no aesni support. + + Note that the decryption operation is not related over blocks. + This gives opportunity of arranging aes_decrypt operations in parallel to speed up code. + This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55) + The following assembly code exploits this idea to achieve ~ 1.4 speed up in aes_decrypt_cbc. + + Example C code for packing 4 blocks in an iteration is shown as follows: + + while ((num_blk-=4)>=0) { + + // the following 4 functions can be interleaved to exploit parallelism + aes_decrypt(ibuf, obuf, ctx); + aes_decrypt(ibuf+1, obuf+1, ctx); + aes_decrypt(ibuf+2, obuf+2, ctx); + aes_decrypt(ibuf+3, obuf+3, ctx); + + obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; + *iv = ibuf[3]; ibuf += 4; obuf += 4; + } + num_blk+=4; + + ----------------------------------------------------------------------------------------------------------------*/ + + .text + .align 4,0x90 + .globl _aes_decrypt_cbc_hw +_aes_decrypt_cbc_hw: + + // push/save registers for local use +#if defined __i386__ + + push %ebp + movl %esp, %ebp + push %ebx // ibuf + push %edi // obuf + + #define sp %esp + +#else // __x86_64__ + + push %rbp + mov %rsp, %rbp + push %rbx + push %r13 + push %r14 + push %r15 + + #define sp %rsp + +#endif + + + // if kernel, allocate stack space to save xmm registers +#ifdef KERNEL +#if defined __i386__ + sub $(8*16), %esp +#else + sub $(16*16), %rsp +#endif + movaps %xmm0, (sp) + movaps %xmm1, 16(sp) + movaps %xmm2, 32(sp) + movaps %xmm3, 48(sp) + movaps %xmm4, 64(sp) + movaps %xmm5, 80(sp) + movaps %xmm6, 96(sp) + movaps %xmm7, 112(sp) +#if defined __x86_64__ + movaps %xmm8, 16*8(sp) + movaps %xmm9, 16*9(sp) + movaps %xmm10, 16*10(sp) + movaps %xmm11, 16*11(sp) + movaps %xmm12, 16*12(sp) + movaps %xmm13, 16*13(sp) + movaps %xmm14, 16*14(sp) + movaps %xmm15, 16*15(sp) +#endif // __x86_64__ +#endif + + #undef iv + #define iv %xmm0 + +#if defined __i386__ + mov 12(%ebp), %eax // in_iv + mov 24(%ebp), %edx // ctx + movups (%eax), iv // iv = in_iv + mov 8(%ebp), %ebx // ibuf + mov 16(%ebp), %ecx // num_blk + mov 20(%ebp), %edi // obuf + + #define ibuf %ebx + #define obuf %edi + #define num_blk %ecx + #define ctx %edx + +#else // __x86_64__, rdi/rsi/rdx/rcx/r8 + + mov %rdi, %rbx // ibuf + movups (%rsi), iv // iv = in_iv + mov %rdx, %r13 // num_blk + mov %rcx, %r14 // obuf + mov %r8, %r15 // ctx + + #define ibuf %rbx + #define num_blk %r13d + #define obuf %r14 + #define ctx %r15 + +#endif + + mov 240(ctx), %eax // aes length + cmp $160, %eax // aes-128 decrypt + je L_decrypt_128 + cmp $192, %eax // aes-192 decrypt + je L_decrypt_192 + cmp $224, %eax // aes-256 decrypt + je L_decrypt_256 + + mov $-1, %eax // wrong aes length, to return -1 + jmp L_error // early exit due to wrong aes length + + + // + // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_decrypt_128: + + cmp $1, num_blk + jl L_HW_cbc_done // if num_blk < 1, early return + + // aes-128 decrypt expanded keys + movups 160(ctx), %xmm3 + movups 144(ctx), %xmm4 + movups 128(ctx), %xmm5 + movups 112(ctx), %xmm6 + movups 96(ctx), %xmm7 +#if defined __x86_64__ + movups 80(ctx), %xmm8 + movups 64(ctx), %xmm9 + movups 48(ctx), %xmm10 + movups 32(ctx), 
%xmm11 + movups 16(ctx), %xmm12 + movups 0(ctx), %xmm13 +#endif + + // performs 4 block decryption in an iteration to exploit decrypt in parallel + + // while ((num_blk-=4)>=0) { + // aes_decrypt(ibuf, obuf, ctx); + // aes_decrypt(ibuf+1, obuf+1, ctx); + // aes_decrypt(ibuf+2, obuf+2, ctx); + // aes_decrypt(ibuf+3, obuf+3, ctx); + // obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; + // *iv = ibuf[3]; ibuf += 4; obuf += 4; + // } + + sub $4, num_blk // pre decrement num_blk by 4 + jl 9f // if num_blk < 4, skip the per-4-blocks processing code + +0: + + +#if defined __x86_64__ + + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm14 // tmp = 3rd ibuf + movups 48(ibuf), %xmm15 // tmp = 4th ibuf + + // for x86_64, the expanded keys are already stored in xmm3-xmm13 + + // aes-128 decrypt round 0 per 4 blocks + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm14 + pxor %xmm3, %xmm15 + + // aes-128 decrypt round 1 per 4 blocks + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm14 + aesdec %xmm4, %xmm15 + + // aes-128 decrypt round 2 per 4 blocks + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm14 + aesdec %xmm5, %xmm15 + + // aes-128 decrypt round 3 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm14 + aesdec %xmm6, %xmm15 + + // aes-128 decrypt round 4 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm14 + aesdec %xmm7, %xmm15 + + // aes-128 decrypt round 5 per 4 blocks + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm14 + aesdec %xmm8, %xmm15 + + // aes-128 decrypt round 6 per 4 blocks + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm14 + aesdec %xmm9, %xmm15 + + // aes-128 decrypt round 7 per 4 blocks + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm14 + aesdec %xmm10, %xmm15 + + // aes-128 decrypt round 8 per 4 blocks + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm14 + aesdec %xmm11, %xmm15 + + // aes-128 decrypt round 9 per 4 blocks + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + + // aes-128 decrypt round 10 (last) per 4 blocks + aesdeclast %xmm13, %xmm1 + aesdeclast %xmm13, %xmm2 + aesdeclast %xmm13, %xmm14 + aesdeclast %xmm13, %xmm15 + + pxor iv, %xmm1 // obuf[0] ^= *iv; + movups (ibuf), iv // ibuf[0] + pxor iv, %xmm2 // obuf[1] ^= ibuf[0]; + movups 16(ibuf), iv // ibuf[1] + pxor iv, %xmm14 // obuf[2] ^= ibuf[1]; + movups 32(ibuf), iv // ibuf[2] + pxor iv, %xmm15 // obuf[3] ^= obuf[2]; + movups 48(ibuf), iv // *iv = ibuf[3] + + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm14, 32(obuf) // write 3rd obuf + movups %xmm15, 48(obuf) // write 4th obuf + + +#else + + // aes_decrypt_cbc per 4 blocks using aes-128 for i386 + // xmm1/xmm2/xmm4/xmm5 used for obuf per block + // xmm3 = key0 + // xmm0 = iv + // xmm6/xmm7 dynamically load with other expanded keys + + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm4 // tmp = 3rd ibuf + movups 48(ibuf), %xmm5 // tmp = 4th ibuf + + // aes_decrypt + // for i386, sequentially load expanded keys into xmm6/xmm7 + + movups 144(ctx), %xmm6 // key1 + + // aes-128 decrypt round 0 per 4 blocks + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm4 + pxor %xmm3, %xmm5 + + movups 128(ctx), %xmm7 // key2 + + // aes-128 decrypt round 1 per 4 blocks + aesdec %xmm6, %xmm1 + 
aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 112(ctx), %xmm6 // key3 + + // aes-128 decrypt round 2 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 96(ctx), %xmm7 // key4 + + // aes-128 decrypt round 3 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 80(ctx), %xmm6 // key5 + + // aes-128 decrypt round 4 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 64(ctx), %xmm7 // key6 + + // aes-128 decrypt round 5 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 48(ctx), %xmm6 // key7 + + // aes-128 decrypt round 6 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 32(ctx), %xmm7 // key8 + + // aes-128 decrypt round 7 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 16(ctx), %xmm6 // key9 + + // aes-128 decrypt round 8 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 0(ctx), %xmm7 // keyA + + // aes-128 decrypt round 9 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + // aes-128 decrypt round 10 (last) per 4 blocks + aesdeclast %xmm7, %xmm1 + aesdeclast %xmm7, %xmm2 + aesdeclast %xmm7, %xmm4 + aesdeclast %xmm7, %xmm5 + + pxor iv, %xmm1 // 1st obuf ^= iv; + movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm2 // 2nd obuf ^= iv; + movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm4 // 3rd obuf ^= iv; + movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm5 // 4th obuf ^= iv; + movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); + + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm4, 32(obuf) // write 3rd obuf + movups %xmm5, 48(obuf) // write 4th obuf +#endif + + add $64, ibuf // ibuf += 4; + add $64, obuf // obuf += 4; + + sub $4, num_blk // num_blk -= 4 + jge 0b // if num_blk > 0, repeat the loop + +9: add $4, num_blk // post incremtn num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code + +#if defined __i386__ + // updated as they might be needed as expanded keys in the remaining + movups 144(ctx), %xmm4 + movups 128(ctx), %xmm5 + movups 112(ctx), %xmm6 + movups 96(ctx), %xmm7 +#endif + + test $2, num_blk // check whether num_blk has 2 blocks + je 9f // if num_blk & 2 == 0, skip the per-pair processing code + + // do the remaining 2 blocks together + + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + + // aes_decrypt + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 +#if defined __x86_64__ + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdeclast %xmm13, %xmm1 + aesdeclast %xmm13, %xmm2 +#else + movups 80(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + movups 64(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + movups 48(ctx), %xmm6 + 
aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + movups 32(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + movups 16(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + movups 0(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdeclast %xmm7, %xmm1 + aesdeclast %xmm7, %xmm2 + movups 112(ctx), %xmm6 + movups 96(ctx), %xmm7 +#endif + + pxor iv, %xmm1 // obuf[0] ^= *iv; + movups (ibuf), iv // ibuf[0] + pxor iv, %xmm2 // obuf[1] ^= ibuf[0] + movups 16(ibuf), iv // *iv = ibuf[1] + + movups %xmm1, (obuf) // write obuf[0] + movups %xmm2, 16(obuf) // write obuf[1] + + add $32, ibuf // ibuf += 2 + add $32, obuf // obuf += 2 + +9: + test $1, num_blk // check whether num_blk has residual 1 block + je L_HW_cbc_done // if num_blk == 0, no need for residual processing code + + movups (ibuf), %xmm2 // tmp = ibuf + // aes_decrypt + pxor %xmm3, %xmm2 + aesdec %xmm4, %xmm2 + aesdec %xmm5, %xmm2 + aesdec %xmm6, %xmm2 + aesdec %xmm7, %xmm2 +#if defined __x86_64__ + aesdec %xmm8, %xmm2 + aesdec %xmm9, %xmm2 + aesdec %xmm10, %xmm2 + aesdec %xmm11, %xmm2 + aesdec %xmm12, %xmm2 + aesdeclast %xmm13, %xmm2 +#else + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 48(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm2 +#endif + + pxor iv, %xmm2 // *obuf ^= *iv; + movups (ibuf), iv // *iv = *ibuf; + movups %xmm2, (obuf) // write *obuf + + jmp L_HW_cbc_done + + // + // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_decrypt_192: + + cmp $1, num_blk + jl L_HW_cbc_done // if num_blk < 1, early return + + // aes-192 decryp expanded keys + movups 192(ctx), %xmm3 + movups 176(ctx), %xmm4 + movups 160(ctx), %xmm5 + movups 144(ctx), %xmm6 + movups 128(ctx), %xmm7 +#if defined __x86_64__ + movups 112(ctx), %xmm8 + movups 96(ctx), %xmm9 + movups 80(ctx), %xmm10 + movups 64(ctx), %xmm11 + movups 48(ctx), %xmm12 + movups 32(ctx), %xmm13 + movups 16(ctx), %xmm14 + movups (ctx), %xmm15 +#endif + + // performs 4 block decryption in an iteration to exploit decrypt in parallel + + // while ((num_blk-=4)>=0) { + // aes_decrypt(ibuf, obuf, ctx); + // aes_decrypt(ibuf+1, obuf+1, ctx); + // aes_decrypt(ibuf+2, obuf+2, ctx); + // aes_decrypt(ibuf+3, obuf+3, ctx); + // obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; + // *iv = ibuf[3]; ibuf += 4; obuf += 4; + // } + + sub $4, num_blk // pre decrement num_blk by 4 + jl 9f // if num_blk < 4, skip the per-4-blocks processing code +0: + +#if defined __x86_64__ + + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm14 // tmp = 3rd ibuf + movups 48(ibuf), %xmm15 // tmp = 4th ibuf + + // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13 + // use %xmm12/%xmm13 ts dynamic keys in the middle, restored afterwards + + // round 0 for 4 blocks + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm14 + pxor %xmm3, %xmm15 + + // round 1 for 4 blocks + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm14 + aesdec %xmm4, %xmm15 + + // round 2 for 4 blocks + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm14 + aesdec %xmm5, %xmm15 + + // round 3 for 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm14 + aesdec %xmm6, %xmm15 + + // round 4 for 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm14 + aesdec 
%xmm7, %xmm15 + + // round 5 for 4 blocks + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm14 + aesdec %xmm8, %xmm15 + + // round 6 for 4 blocks + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm14 + aesdec %xmm9, %xmm15 + + // round 7 for 4 blocks + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm14 + aesdec %xmm10, %xmm15 + + // round 8 for 4 blocks + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm14 + aesdec %xmm11, %xmm15 + + // round 9 for 4 blocks + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + + movups 16(ctx), %xmm12 + + // round A for 4 blocks + aesdec %xmm13, %xmm1 + aesdec %xmm13, %xmm2 + aesdec %xmm13, %xmm14 + aesdec %xmm13, %xmm15 + + movups (ctx), %xmm13 + + // round B for 4 blocks + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + + movups 48(ctx), %xmm12 // restore %xmm12 to its original key + + // round C (last) for 4 blocks + aesdeclast %xmm13, %xmm1 + aesdeclast %xmm13, %xmm2 + aesdeclast %xmm13, %xmm14 + aesdeclast %xmm13, %xmm15 + + movups 32(ctx), %xmm13 // restore %xmm13 to its original key + + pxor iv, %xmm1 // obuf[0] ^= *iv; + movups (ibuf), iv // ibuf[0] + pxor iv, %xmm2 // obuf[1] ^= ibuf[0] + movups 16(ibuf), iv // ibuf[1] + pxor iv, %xmm14 // obuf[2] ^= ibuf[1] + movups 32(ibuf), iv // ibuf[2] + pxor iv, %xmm15 // obuf[3] ^= ibuf[2] + movups 48(ibuf), iv // *iv = ibuf[3] + + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm14, 32(obuf) // write 3rd obuf + movups %xmm15, 48(obuf) // write 4th obuf + + add $64, ibuf // ibuf += 4; + add $64, obuf // obuf += 4; + + sub $4, num_blk // num_blk -= 4 + jge 0b // if num_blk > 0, repeat the loop + +9: add $4, num_blk // post incremtn num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, prepare to return + + movups 16(ctx), %xmm14 // restore %xmm14 to its key + movups (ctx), %xmm15 // restore %xmm15 to its key + +#else + + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm4 // tmp = 3rd ibuf + movups 48(ibuf), %xmm5 // tmp = 4th ibuf + + // aes_decrypt + // for i386, sequentially load expanded keys into xmm6/xmm7 + movups 176(ctx), %xmm6 + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm4 + pxor %xmm3, %xmm5 + + movups 160(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 144(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 128(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 112(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 96(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 80(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 64(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 48(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 32(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 16(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 0(ctx), %xmm7 + 
aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + aesdeclast %xmm7, %xmm1 + aesdeclast %xmm7, %xmm2 + aesdeclast %xmm7, %xmm4 + aesdeclast %xmm7, %xmm5 + + pxor iv, %xmm1 // 1st obuf ^= iv; + movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm2 // 2nd obuf ^= iv; + movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm4 // 3rd obuf ^= iv; + movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm5 // 4th obuf ^= iv; + movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm4, 32(obuf) // write 3rd obuf + movups %xmm5, 48(obuf) // write 4th obuf + + add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; + add $64, obuf // obuf += AES_BLOCK_SIZE * 4; + + sub $4, num_blk // num_blk -= 4 + jge 0b // if num_blk > 0, repeat the loop + + +9: add $4, num_blk // post incremtn num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code + + movups 176(ctx), %xmm4 + movups 160(ctx), %xmm5 + movups 144(ctx), %xmm6 + movups 128(ctx), %xmm7 + +#endif + + // per-block aes_decrypt_cbc loop + +0: + movups (ibuf), %xmm2 // tmp = ibuf + + // aes_decrypt + pxor %xmm3, %xmm2 + aesdec %xmm4, %xmm2 + aesdec %xmm5, %xmm2 + aesdec %xmm6, %xmm2 + aesdec %xmm7, %xmm2 +#if defined __x86_64__ + aesdec %xmm8, %xmm2 + aesdec %xmm9, %xmm2 + aesdec %xmm10, %xmm2 + aesdec %xmm11, %xmm2 + aesdec %xmm12, %xmm2 + aesdec %xmm13, %xmm2 + aesdec %xmm14, %xmm2 + aesdeclast %xmm15, %xmm2 +#else + movups 112(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 96(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 48(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm2 +#endif + + pxor iv, %xmm2 // obuf ^= iv; + movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + + movups %xmm2, (obuf) // write obuf + + add $16, ibuf // ibuf += AES_BLOCK_SIZE; + add $16, obuf // obuf += AES_BLOCK_SIZE; + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + jmp L_HW_cbc_done + + // + // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_decrypt_256: + + cmp $1, num_blk + jl L_HW_cbc_done + + movups 224(ctx), %xmm3 + movups 208(ctx), %xmm4 + movups 192(ctx), %xmm5 + movups 176(ctx), %xmm6 + movups 160(ctx), %xmm7 +#if defined __x86_64__ + movups 144(ctx), %xmm8 + movups 128(ctx), %xmm9 + movups 112(ctx), %xmm10 + movups 96(ctx), %xmm11 + movups 80(ctx), %xmm12 + movups 64(ctx), %xmm13 + movups 48(ctx), %xmm14 + movups 32(ctx), %xmm15 +// movups 16(ctx), %xmm14 +// movups (ctx), %xmm15 +#endif + +#if defined __x86_64__ + + sub $4, num_blk // pre decrement num_blk by 4 + jl 9f // if num_blk < 4, skip the per-4-blocks processing code +0: + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm14 // tmp = 3rd ibuf + movups 48(ibuf), %xmm15 // tmp = 4th ibuf + + // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13 + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm14 + pxor %xmm3, %xmm15 + + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm14 + aesdec %xmm4, %xmm15 + + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm14 + aesdec %xmm5, %xmm15 + + aesdec %xmm6, %xmm1 + aesdec %xmm6, 
%xmm2 + aesdec %xmm6, %xmm14 + aesdec %xmm6, %xmm15 + + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm14 + aesdec %xmm7, %xmm15 + + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm14 + aesdec %xmm8, %xmm15 + + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm14 + aesdec %xmm9, %xmm15 + + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm14 + aesdec %xmm10, %xmm15 + + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm14 + aesdec %xmm11, %xmm15 + + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + movups 48(ctx), %xmm12 + + aesdec %xmm13, %xmm1 + aesdec %xmm13, %xmm2 + aesdec %xmm13, %xmm14 + aesdec %xmm13, %xmm15 + movups 32(ctx), %xmm13 + + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + movups 16(ctx), %xmm12 + + aesdec %xmm13, %xmm1 + aesdec %xmm13, %xmm2 + aesdec %xmm13, %xmm14 + aesdec %xmm13, %xmm15 + movups (ctx), %xmm13 + + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + movups 80(ctx), %xmm12 + + aesdeclast %xmm13, %xmm1 + aesdeclast %xmm13, %xmm2 + aesdeclast %xmm13, %xmm14 + aesdeclast %xmm13, %xmm15 + movups 64(ctx), %xmm13 + + pxor iv, %xmm1 // obuf ^= iv; + movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm2 // obuf ^= iv; + movups 16(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm14 // obuf ^= iv; + movups 32(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm15 // obuf ^= iv; + movups 48(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm14, 32(obuf) // write 3rd obuf + movups %xmm15, 48(obuf) // write 4th obuf + + add $64, ibuf // ibuf += AES_BLOCK_SIZE*4; + add $64, obuf // obuf += AES_BLOCK_SIZE*4; + + sub $4, num_blk // num_blk -= 4 + jge 0b // if num_blk > 0, repeat the loop + +9: add $4, num_blk // post incremtn num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code + + movups 48(ctx), %xmm14 + movups 32(ctx), %xmm15 + +#else + + sub $4, num_blk // pre decrement num_blk by 4 + jl 9f // if num_blk < 4, skip the per-pair processing code +0: + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm4 // tmp = 3rd ibuf + movups 48(ibuf), %xmm5 // tmp = 4th ibuf + + // aes_decrypt + // for i386, sequentially load expanded keys into xmm6/xmm7 + movups 208(ctx), %xmm6 + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm4 + pxor %xmm3, %xmm5 + + movups 192(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 176(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 160(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 144(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 128(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 112(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 96(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 80(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + 
movups 64(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 48(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 32(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 16(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 0(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + aesdeclast %xmm7, %xmm1 + aesdeclast %xmm7, %xmm2 + aesdeclast %xmm7, %xmm4 + aesdeclast %xmm7, %xmm5 + + pxor iv, %xmm1 // 1st obuf ^= iv; + movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm2 // 2nd obuf ^= iv; + movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm4 // 3rd obuf ^= iv; + movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm5 // 4th obuf ^= iv; + movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm4, 32(obuf) // write 3rd obuf + movups %xmm5, 48(obuf) // write 4th obuf + + add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; + add $64, obuf // obuf += AES_BLOCK_SIZE * 4; + + sub $4, num_blk // num_blk -= 4 + jge 0b // if num_blk > 0, repeat the loop + + +9: add $4, num_blk // post incremtn num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code + + movups 208(ctx), %xmm4 + movups 192(ctx), %xmm5 + movups 176(ctx), %xmm6 + movups 160(ctx), %xmm7 + +#endif + +0: + movups (ibuf), %xmm2 // tmp = ibuf + + // aes_decrypt + pxor %xmm3, %xmm2 + aesdec %xmm4, %xmm2 + aesdec %xmm5, %xmm2 + aesdec %xmm6, %xmm2 + aesdec %xmm7, %xmm2 +#if defined __x86_64__ + aesdec %xmm8, %xmm2 + aesdec %xmm9, %xmm2 + aesdec %xmm10, %xmm2 + aesdec %xmm11, %xmm2 + aesdec %xmm12, %xmm2 + aesdec %xmm13, %xmm2 + aesdec %xmm14, %xmm2 + aesdec %xmm15, %xmm2 +#else + movups 144(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 128(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 112(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 96(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 48(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm2 +#endif + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm2 + + pxor iv, %xmm2 // obuf ^= iv; + movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + + movups %xmm2, (obuf) // write obuf + + add $16, ibuf // ibuf += AES_BLOCK_SIZE; + add $16, obuf // obuf += AES_BLOCK_SIZE; + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + jmp L_HW_cbc_done + + // + // --------- END of aes_decrypt_cbc_hw ------------------- + // diff --git a/bsd/crypto/aes/test/ReadMe.txt b/bsd/crypto/aes/test/ReadMe.txt deleted file mode 100644 index 1329e84be..000000000 --- a/bsd/crypto/aes/test/ReadMe.txt +++ /dev/null @@ -1,97 +0,0 @@ -This directory contains file and shell scripts - - tstaes.c - makegenarm.sh - makegenx86.sh - makeoptx86.sh - -that can be used to build executables. These executable are used to validate the implementation -and to benchmark the performance of the aes functions in the kernel. This directory also serves -as a development environment for porting of the aes functions to any new architectures. 
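For reference, the following is a minimal sketch of the validate-and-benchmark loop that tstaes.c implements, assuming the Gladman-style API declared in ../aes.h (aes_encrypt_key128/aes_encrypt_cbc and their decrypt counterparts); the block count, the all-zero key/iv and the use of raw mach_absolute_time() ticks are illustrative only:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <mach/mach_time.h>
    #include "../aes.h"

    #define NUM_BLOCKS 256                      /* 16-byte blocks per cbc call */
    #define DATA_SIZE  (16 * NUM_BLOCKS)

    int main(void)
    {
        unsigned char key[16] = { 0 }, iv[16] = { 0 };
        unsigned char *plain   = malloc(DATA_SIZE);
        unsigned char *cipher  = malloc(DATA_SIZE);
        unsigned char *decrypt = malloc(DATA_SIZE);
        aes_encrypt_ctx ectx;
        aes_decrypt_ctx dctx;
        uint64_t t0, t1, t2;
        int i;

        if (!plain || !cipher || !decrypt)
            return 1;
        for (i = 0; i < DATA_SIZE; i++)
            plain[i] = (unsigned char)random();      /* random test data */

        aes_encrypt_key128(key, &ectx);              /* expand the keys once */
        aes_decrypt_key128(key, &dctx);

        t0 = mach_absolute_time();
        aes_encrypt_cbc(plain, iv, NUM_BLOCKS, cipher, &ectx);    /* plain -> cipher */
        t1 = mach_absolute_time();
        aes_decrypt_cbc(cipher, iv, NUM_BLOCKS, decrypt, &dctx);  /* cipher -> decrypt */
        t2 = mach_absolute_time();

        if (memcmp(plain, decrypt, DATA_SIZE) != 0) {              /* round-trip check */
            fprintf(stderr, "mismatch between plain and decrypted data\n");
            return 1;
        }
        printf("encrypt ticks = %llu, decrypt ticks = %llu\n",
               (unsigned long long)(t1 - t0), (unsigned long long)(t2 - t1));
        free(plain); free(cipher); free(decrypt);
        return 0;
    }

The actual tstaes.c additionally converts elapsed ticks to microseconds with mach_timebase_info() and reports MBytes/sec and cycles/byte against the detected CPU clock rate, as in the sample runs below.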
- -On xnu-1699.20.6 (from which we add this work), the generic aes source code sits at bsd/crypto/aes/gen. The x86_64 -and i386 architectural optimization is given in bsd/crypto/aes/i386. - -After making some code corrections (aes.h and most assembly code in i386), now you can build a test executable -that is functionally equivalent to aes in the kernel code. - -To generate a test executable for the aes in x86_64/i386 kernel, - - $ makeoptx86.sh - -This will build a test executable tstaesoptx86 (x86_64/i386). The executable will automatically detects the -CPU clock rates. You specify the number of iterations and the number of 16-byte blocks for simulation. -The executable generates (random number) the test data, and calls aes_encrypt_cbc to encrypt the plain data -into cipher data, and then calls aes_decrypt_cbc to decrypt cipher into decrypted data. Afterwards, it compares -the decrypted data against the plain data. Should there be a mismatch, the code breaks and exit. -Otherwise, it measures the times the system spends on the 2 functions under test. Afterwards, it prints out -the performance profiling data. - -On K5, - -$ tstaesoptx86 1000 2560 -device max CPU clock rate = 2659.00 MHz -40960 bytes per cbc call - aes_encrypt_cbc : time elapsed = 220.24 usecs, 177.37 MBytes/sec, 14.30 cycles/byte - best iteration : time elapsed = 218.30 usecs, 178.94 MBytes/sec, 14.17 cycles/byte - worst iteration : time elapsed = 286.14 usecs, 136.51 MBytes/sec, 18.58 cycles/byte - - aes_decrypt_cbc : time elapsed = 199.85 usecs, 195.46 MBytes/sec, 12.97 cycles/byte - best iteration : time elapsed = 198.17 usecs, 197.12 MBytes/sec, 12.86 cycles/byte - worst iteration : time elapsed = 228.12 usecs, 171.23 MBytes/sec, 14.81 cycles/byte - -On K5B (with aesni) - -$ tstaesoptx86 1000 256 -device max CPU clock rate = 2400.00 MHz -4096 bytes per cbc call - aes_encrypt_cbc : time elapsed = 6.69 usecs, 583.67 MBytes/sec, 3.92 cycles/byte - best iteration : time elapsed = 6.38 usecs, 612.46 MBytes/sec, 3.74 cycles/byte - worst iteration : time elapsed = 9.72 usecs, 401.96 MBytes/sec, 5.69 cycles/byte - - aes_decrypt_cbc : time elapsed = 2.05 usecs, 1902.65 MBytes/sec, 1.20 cycles/byte - best iteration : time elapsed = 1.96 usecs, 1997.06 MBytes/sec, 1.15 cycles/byte - worst iteration : time elapsed = 4.60 usecs, 849.00 MBytes/sec, 2.70 cycles/byte - -You can also build a test executable using the generic source code for the i386/x86_64 architecture. - - $ makegenx86.sh - -When run on K5, - -$ tstaesgenx86 1000 2560 -device max CPU clock rate = 2659.00 MHz -40960 bytes per cbc call - aes_encrypt_cbc : time elapsed = 278.05 usecs, 140.49 MBytes/sec, 18.05 cycles/byte - best iteration : time elapsed = 274.63 usecs, 142.24 MBytes/sec, 17.83 cycles/byte - worst iteration : time elapsed = 309.70 usecs, 126.13 MBytes/sec, 20.10 cycles/byte - - aes_decrypt_cbc : time elapsed = 265.43 usecs, 147.17 MBytes/sec, 17.23 cycles/byte - best iteration : time elapsed = 262.20 usecs, 148.98 MBytes/sec, 17.02 cycles/byte - worst iteration : time elapsed = 296.19 usecs, 131.88 MBytes/sec, 19.23 cycles/byte - -We can see the current AES implementation in the x86_64 kernel has been improved from 17.83/17.02 -down to 14.12/12.86 cycles/byte for aes_encrypt_cbc and aes_decrypt_cbc, respectively. - - - --------- iOS --------- - -Similarly, you can build a test executable for the aes in the armv7 kernel (which uses the generic source code) - - $ makegenarm.sh - -Note that you need the iOS SDK installed. 
We can then copy this executable to iOS devices for simulation. - -On N88, - -iPhone:~ root# ./tstaesgenarm 1000 2560 -device max CPU clock rate = 600.00 MHz -40960 bytes per cbc call - aes_encrypt_cbc : time elapsed = 2890.18 usecs, 13.52 MBytes/sec, 42.34 cycles/byte - best iteration : time elapsed = 2692.00 usecs, 14.51 MBytes/sec, 39.43 cycles/byte - worst iteration : time elapsed = 18248.33 usecs, 2.14 MBytes/sec, 267.31 cycles/byte - - aes_decrypt_cbc : time elapsed = 3078.20 usecs, 12.69 MBytes/sec, 45.09 cycles/byte - best iteration : time elapsed = 2873.33 usecs, 13.59 MBytes/sec, 42.09 cycles/byte - worst iteration : time elapsed = 9664.79 usecs, 4.04 MBytes/sec, 141.57 cycles/byte - diff --git a/bsd/crypto/aes/test/makegenx86.sh b/bsd/crypto/aes/test/makegenx86.sh deleted file mode 100755 index ea4de6f63..000000000 --- a/bsd/crypto/aes/test/makegenx86.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/ksh - -cc -Os -c -arch i386 -arch x86_64 -I ../../../ ../gen/aescrypt.c -o aescrypt.o -cc -Os -c -arch i386 -arch x86_64 -I ../../../ ../gen/aeskey.c -o aeskey.o -cc -Os -c -arch i386 -arch x86_64 -I ../../../ ../gen/aestab.c -o aestab.o - -cc -arch i386 -arch x86_64 -Os tstaes.c aescrypt.o aeskey.o aestab.o -o tstaesgenx86 -rm -fr aescrypt.o aeskey.o aestab.o diff --git a/bsd/crypto/aes/test/makeoptx86.sh b/bsd/crypto/aes/test/makeoptx86.sh deleted file mode 100755 index 3732e037f..000000000 --- a/bsd/crypto/aes/test/makeoptx86.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/ksh - -cc -c -Os -arch i386 -arch x86_64 ../i386/AES.s -o AES.o -cc -c -Os -arch i386 -arch x86_64 ../i386/aes_crypt_hw.s -o aes_crypt_hw.o -cc -c -Os -arch i386 -arch x86_64 ../i386/aes_key_hw.s -o aes_key_hw.o -cc -c -Os -arch i386 -arch x86_64 ../i386/aes_modes_asm.s -o aes_modes_asm.o -cc -c -Os -arch i386 -arch x86_64 ../i386/aes_modes_hw.s -o aes_modes_hw.o - -cc -Os -arch i386 -arch x86_64 tstaes.c AES.o aes_crypt_hw.o aes_key_hw.o aes_modes_asm.o aes_modes_hw.o -o tstaesoptx86 -rm -fr AES.o aes_crypt_hw.o aes_key_hw.o aes_modes_asm.o aes_modes_hw.o diff --git a/bsd/crypto/aes/test/tstaes.c b/bsd/crypto/aes/test/tstaes.c deleted file mode 100644 index 9d186ee77..000000000 --- a/bsd/crypto/aes/test/tstaes.c +++ /dev/null @@ -1,131 +0,0 @@ - -#include -#include -#include "../aes.h" -#include -#include - - -aes_encrypt_ctx encrypt_ctx; -aes_decrypt_ctx decrypt_ctx; - -size_t getFreq() -{ - int mib[2]; - size_t cpufreq, len; - mib[0] = CTL_HW; - mib[1] = HW_CPU_FREQ; - len = sizeof(cpufreq); - - sysctl(mib, 2, &cpufreq, &len, NULL, 0); - - return cpufreq; -} - - -uint32_t cpu_freq; - -main(int argc, char **argv) -{ - - char *plain; - char *cipher; - char *decrypt; - -uint32_t ITERATIONS; -uint32_t NUM_BLOCKS; -uint32_t data_size; - - char key[32]; - char iv[16]; - int checksum=0; - int i, j, iterations; - uint64_t t0, t1, t2, sum=0, max_time=0, min_time=-1, sum1=0, max_time1=0, min_time1=-1; - float time, time_max, time_min, time1, time_max1, time_min1; - - cpu_freq = getFreq(); - - if (cpu_freq == 0) { - fprintf(stderr, "this appears to be an iPhone device, where cpu_freq can not be detected. 
set to 800MHz.\n"); - cpu_freq = 800000000; - } else { - fprintf(stderr, "device max CPU clock rate = %.2f MHz\n", cpu_freq/1.e6); - } - - mach_timebase_info_data_t info; - kern_return_t err = mach_timebase_info( &info ); - - if (argc!=3) { - fprintf(stderr, "usage : %s iterations num_16bytes_block\n", argv[0]); - exit(1); - } - ITERATIONS = atoi(argv[1]); - NUM_BLOCKS = atoi(argv[2]); - data_size = 16*NUM_BLOCKS; - - plain = malloc(data_size); - cipher = malloc(data_size); - decrypt = malloc(data_size); - - if ((plain==NULL) || (cipher==NULL) || (decrypt==NULL)) { - fprintf(stderr,"malloc error.\n"); - exit(1); - } - - for (i=0;imax_time) max_time = t1; - if (t1max_time1) max_time1 = t2; - if (t2hfs_flags |= HFS_RESIZE_IN_PROGRESS; HFS_MOUNT_UNLOCK(hfsmp, TRUE); + /* Start with a clean journal. */ + hfs_journal_flush(hfsmp, TRUE); + /* * Enclose changes inside a transaction. */ @@ -4244,6 +4247,9 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) } if (transaction_begun) { hfs_end_transaction(hfsmp); + hfs_journal_flush(hfsmp, FALSE); + /* Just to be sure, sync all data to the disk */ + (void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context); } return MacToVFSError(error); diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index f2c9c8711..e1f693be2 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -3321,6 +3321,12 @@ SYSCTL_QUAD(_vm, OID_AUTO, global_no_user_wire_amount, CTLFLAG_RW | CTLFLAG_LOCK SYSCTL_QUAD(_vm, OID_AUTO, global_user_wire_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_global_user_wire_limit, ""); SYSCTL_QUAD(_vm, OID_AUTO, user_wire_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_user_wire_limit, ""); +extern int vm_map_copy_overwrite_aligned_src_not_internal; +extern int vm_map_copy_overwrite_aligned_src_not_symmetric; +extern int vm_map_copy_overwrite_aligned_src_large; +SYSCTL_INT(_vm, OID_AUTO, vm_copy_src_not_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_copy_overwrite_aligned_src_not_internal, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, vm_copy_src_not_symmetric, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_copy_overwrite_aligned_src_not_symmetric, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, vm_copy_src_large, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_copy_overwrite_aligned_src_large, 0, ""); /* diff --git a/bsd/kern/mach_process.c b/bsd/kern/mach_process.c index 9aba89b96..5294122ff 100644 --- a/bsd/kern/mach_process.c +++ b/bsd/kern/mach_process.c @@ -129,10 +129,6 @@ ptrace(struct proc *p, struct ptrace_args *uap, int32_t *retval) AUDIT_ARG(value32, uap->data); if (uap->req == PT_DENY_ATTACH) { -#if (DEVELOPMENT || DEBUG) && defined(__arm__) - if (PE_i_can_has_debugger(NULL)) - return(0); -#endif proc_lock(p); if (ISSET(p->p_lflag, P_LTRACED)) { proc_unlock(p); diff --git a/bsd/kern/uipc_syscalls.c b/bsd/kern/uipc_syscalls.c index 521de769e..8a6356d5a 100644 --- a/bsd/kern/uipc_syscalls.c +++ b/bsd/kern/uipc_syscalls.c @@ -1847,22 +1847,25 @@ sockargs(struct mbuf **mp, user_addr_t data, int buflen, int type) struct mbuf *m; int error; - int alloc_buflen = buflen; + size_t alloc_buflen = (size_t)buflen; + + if(alloc_buflen > INT_MAX/2) + return (EINVAL); #ifdef __LP64__ /* The fd's in the buffer must expand to be pointers, thus we need twice as much space */ if(type == MT_CONTROL) alloc_buflen = ((buflen - sizeof(struct cmsghdr))*2) + sizeof(struct cmsghdr); #endif - if ((u_int)alloc_buflen > MLEN) { - if (type == MT_SONAME && (u_int)alloc_buflen <= 112) + if (alloc_buflen > MLEN) { + if (type == MT_SONAME && alloc_buflen 
<= 112) alloc_buflen = MLEN; /* unix domain compat. hack */ - else if ((u_int)alloc_buflen > MCLBYTES) + else if (alloc_buflen > MCLBYTES) return (EINVAL); } m = m_get(M_WAIT, type); if (m == NULL) return (ENOBUFS); - if ((u_int)alloc_buflen > MLEN) { + if (alloc_buflen > MLEN) { MCLGET(m, M_WAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); diff --git a/bsd/libkern/libkern.h b/bsd/libkern/libkern.h index 0d9cff919..8259186d0 100644 --- a/bsd/libkern/libkern.h +++ b/bsd/libkern/libkern.h @@ -213,15 +213,6 @@ clz(unsigned int num) ); return 31 ^ result; -#elif __arm__ && !__thumb__ && defined(_ARM_ARCH_5) - unsigned int result; - __asm__ volatile( - "clz %0, %1" - : "=r" (result) - : "r" (num) - ); - - return result; #else return num?__builtin_clz(num):__builtin_clz(0); #endif diff --git a/bsd/net/ntstat.c b/bsd/net/ntstat.c index 4bb6e1c28..833b8ca34 100644 --- a/bsd/net/ntstat.c +++ b/bsd/net/ntstat.c @@ -1248,8 +1248,7 @@ nstat_idle_check( removed.hdr.type = NSTAT_MSG_TYPE_SRC_REMOVED; removed.hdr.context = 0; removed.srcref = dead->srcref; - errno_t result = ctl_enqueuedata(control->kctl, control->unit, &removed, sizeof(removed), CTL_DATA_EOR); - if (result != 0) printf("%s:%d ctl_enqueuedata failed: %d\n", __FUNCTION__, __LINE__, result); + (void)ctl_enqueuedata(control->kctl, control->unit, &removed, sizeof(removed), CTL_DATA_EOR); // Put this on the list to release later dead->next = dead_list; @@ -1318,8 +1317,7 @@ nstat_control_cleanup_source( removed.hdr.type = NSTAT_MSG_TYPE_SRC_REMOVED; removed.hdr.context = 0; removed.srcref = src->srcref; - errno_t result = ctl_enqueuedata(state->kctl, state->unit, &removed, sizeof(removed), CTL_DATA_EOR); - if (result != 0) printf("%s:%d ctl_enqueuedata failed: %d\n", __FUNCTION__, __LINE__, result); + (void)ctl_enqueuedata(state->kctl, state->unit, &removed, sizeof(removed), CTL_DATA_EOR); } // Cleanup the source if we found it. @@ -1551,7 +1549,6 @@ nstat_control_handle_add_request( if (result != 0) { - printf("nstat_lookup_entry failed: %d\n", result); return result; } @@ -1785,10 +1782,6 @@ nstat_control_handle_query_request( if (result == 0) { result = ctl_enqueuedata(state->kctl, state->unit, &counts, sizeof(counts), CTL_DATA_EOR); - if (result != 0) - { - printf("%s:%d ctl_enqueuedata failed: %d\n", __FUNCTION__, __LINE__, result); - } } else { diff --git a/bsd/netinet/in_cksum.c b/bsd/netinet/in_cksum.c index 1fcafd583..f32cef303 100644 --- a/bsd/netinet/in_cksum.c +++ b/bsd/netinet/in_cksum.c @@ -141,38 +141,6 @@ in_pseudo(u_int a, u_int b, u_int c) } -#if defined(__arm__) && __ARM_ARCH__ >= 6 - -extern int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum); - -u_int16_t -inet_cksum(struct mbuf *m, unsigned int nxt, unsigned int skip, - unsigned int len) -{ - u_int32_t sum = 0; - - /* sanity check */ - if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.len < skip + len) { - panic("inet_cksum: mbuf len (%d) < off+len (%d+%d)\n", - m->m_pkthdr.len, skip, len); - } - - /* include pseudo header checksum? 
*/ - if (nxt != 0) { - struct ip *iph; - - if (m->m_len < sizeof (struct ip)) - panic("inet_cksum: bad mbuf chain"); - - iph = mtod(m, struct ip *); - sum = in_pseudo(iph->ip_src.s_addr, iph->ip_dst.s_addr, - htonl(len + nxt)); - } - - return (cpu_in_cksum(m, len, skip, sum)); -} - -#else u_int16_t inet_cksum(struct mbuf *m, unsigned int nxt, unsigned int skip, @@ -304,4 +272,3 @@ inet_cksum(struct mbuf *m, unsigned int nxt, unsigned int skip, return (~sum & 0xffff); } -#endif diff --git a/bsd/netinet6/esp_input.c b/bsd/netinet6/esp_input.c index c64150319..2052493ab 100644 --- a/bsd/netinet6/esp_input.c +++ b/bsd/netinet6/esp_input.c @@ -440,8 +440,8 @@ esp4_input(m, off) seq >= sav->replay->lastseq) { struct udphdr *encap_uh = (__typeof__(encap_uh))((caddr_t)ip + off); if (encap_uh->uh_sport && - encap_uh->uh_sport != sav->remote_ike_port) { - sav->remote_ike_port = encap_uh->uh_sport; + ntohs(encap_uh->uh_sport) != sav->remote_ike_port) { + sav->remote_ike_port = ntohs(encap_uh->uh_sport); } } ip = esp4_input_strip_UDP_encap(m, off); diff --git a/bsd/netinet6/in6_cksum.c b/bsd/netinet6/in6_cksum.c index f0352eb72..77dd7e1af 100644 --- a/bsd/netinet6/in6_cksum.c +++ b/bsd/netinet6/in6_cksum.c @@ -131,91 +131,6 @@ #include -#if defined(__arm__) && __ARM_ARCH__ >= 6 -extern int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum); - -u_int16_t -inet6_cksum(struct mbuf *m, unsigned int nxt, unsigned int off, - unsigned int len) -{ - union { - uint16_t words[16]; - struct { - struct in6_addr ip6_src; - struct in6_addr ip6_dst; - } addrs; - } u; - const struct in6_addr *in6_src; - const struct in6_addr *in6_dst; - const struct ip6_hdr *ip6; - uint32_t sum; - const uint16_t *w; - const char *cp; - - if (off < sizeof (struct ip6_hdr)) - panic("inet6_cksum: offset too short for IPv6 header"); - if (m->m_len < sizeof (struct ip6_hdr)) - panic("inet6_cksum: mbuf too short for IPv6 header"); - - if (nxt == 0) - return (cpu_in_cksum(m, len, off, 0)); - - /* - * Compute the equivalent of: - * struct ip6_hdr_pseudo ip6; - * - * bzero(sizeof (*ip6)); - * ip6.ip6ph_nxt = nxt; - * ip6.ip6ph_len = htonl(len); - * ipv6.ip6ph_src = mtod(m, struct ip6_hdr *)->ip6_src; - * in6_clearscope(&ip6->ip6ph_src); - * ipv6.ip6ph_dst = mtod(m, struct ip6_hdr *)->ip6_dst; - * in6_clearscope(&ip6->ip6ph_dst); - * sum = one_add(&ip6); - */ - -#if BYTE_ORDER == LITTLE_ENDIAN - sum = ((len & 0xffff) + ((len >> 16) & 0xffff) + nxt) << 8; -#else - sum = (len & 0xffff) + ((len >> 16) & 0xffff) + nxt; -#endif - cp = mtod(m, const char *); - w = (const uint16_t *)(cp + offsetof(struct ip6_hdr, ip6_src)); - ip6 = (const void *)cp; - if ((uintptr_t)w % 2 == 0) { - in6_src = &ip6->ip6_src; - in6_dst = &ip6->ip6_dst; - } else { - memcpy(&u, &ip6->ip6_src, 32); - w = u.words; - in6_src = &u.addrs.ip6_src; - in6_dst = &u.addrs.ip6_dst; - } - - sum += w[0]; - if (!IN6_IS_SCOPE_EMBED(in6_src)) - sum += w[1]; - sum += w[2]; - sum += w[3]; - sum += w[4]; - sum += w[5]; - sum += w[6]; - sum += w[7]; - w += 8; - sum += w[0]; - if (!IN6_IS_SCOPE_EMBED(in6_dst)) - sum += w[1]; - sum += w[2]; - sum += w[3]; - sum += w[4]; - sum += w[5]; - sum += w[6]; - sum += w[7]; - - return (cpu_in_cksum(m, len, off, sum)); -} - -#else /* * Checksum routine for Internet Protocol family headers (Portable Version). 
@@ -445,4 +360,3 @@ inet6_cksum(struct mbuf *m, unsigned int nxt, unsigned int off, return (~sum & 0xffff); } -#endif diff --git a/bsd/nfs/nfs_vfsops.c b/bsd/nfs/nfs_vfsops.c index 7a0323fde..484e47c2b 100644 --- a/bsd/nfs/nfs_vfsops.c +++ b/bsd/nfs/nfs_vfsops.c @@ -1575,8 +1575,12 @@ nfs_convert_old_nfs_args(mount_t mp, user_addr_t data, vfs_context_t ctx, int ar /* copy socket address */ if (inkernel) bcopy(CAST_DOWN(void *, args.addr), &ss, args.addrlen); - else - error = copyin(args.addr, &ss, args.addrlen); + else { + if ((size_t)args.addrlen > sizeof (struct sockaddr_storage)) + error = EINVAL; + else + error = copyin(args.addr, &ss, args.addrlen); + } nfsmout_if(error); ss.ss_len = args.addrlen; diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index 0e8bd67dd..2bccd5bb3 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -231,7 +231,7 @@ uint32_t speculative_prefetch_max = (MAX_UPL_SIZE * 3); * before we issue a synchronous write */ #define HARD_THROTTLE_MAXCNT 0 -#define HARD_THROTTLE_MAXSIZE (32 * 1024) +#define HARD_THROTTLE_MAXSIZE (256 * 1024) int hard_throttle_on_root = 0; struct timeval priority_IO_timestamp_for_root; diff --git a/config/MasterVersion b/config/MasterVersion index 15cb149c8..b5a6d2aac 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -11.2.0 +11.3.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. diff --git a/kgmacros b/kgmacros index edb1e35db..a2c6879f8 100644 --- a/kgmacros +++ b/kgmacros @@ -2445,13 +2445,13 @@ define zprint_one set $kgm_zone = (struct zone *)$arg0 showptr $kgm_zone - printf " %6d ",$kgm_zone->count + printf " %8d ",$kgm_zone->count printf "%8x ",$kgm_zone->cur_size printf "%8x ",$kgm_zone->max_size - printf "%6d ",$kgm_zone->elem_size + printf "%8d ",$kgm_zone->elem_size printf "%8x ",$kgm_zone->alloc_size - printf " %8d ",$kgm_zone->num_allocs - printf "%8d ",$kgm_zone->num_frees + printf " %16ld ",$kgm_zone->num_allocs + printf "%16ld ",$kgm_zone->num_frees printf "%s ",$kgm_zone->zone_name if ($kgm_zone->exhaustible) @@ -2473,7 +2473,7 @@ end define zprint printf "ZONE " showptrhdrpad - printf " COUNT TOT_SZ MAX_SZ ELT_SZ ALLOC_SZ TOT_ALLOC TOT_FREE NAME\n" + printf " COUNT TOT_SZ MAX_SZ ELT_SZ ALLOC_SZ TOT_ALLOC TOT_FREE NAME\n" set $kgm_zone_ptr = (struct zone *)first_zone while ($kgm_zone_ptr != 0) zprint_one $kgm_zone_ptr @@ -9714,12 +9714,13 @@ define zstack printf "\n--------------- " if (zrecords[$index].z_opcode == 1) - printf "ALLOC " + printf "ALLOC " else - printf "FREE " + printf "FREE " end - printf " 0x%x : index %d : ztime %d -------------\n", zrecords[$index].z_element, $index, zrecords[$index].z_time + showptr zrecords[$index].z_element + printf " : index %d : ztime %d -------------\n", $index, zrecords[$index].z_time set $frame = 0 diff --git a/libkern/libkern/c++/OSMetaClass.h b/libkern/libkern/c++/OSMetaClass.h index 662021550..cb2f9896a 100644 --- a/libkern/libkern/c++/OSMetaClass.h +++ b/libkern/libkern/c++/OSMetaClass.h @@ -60,8 +60,6 @@ class OSSerialize; #if defined(__LP64__) /*! 
@parseOnly */ #define APPLE_KEXT_LEGACY_ABI 0 -#elif defined(__arm__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) -#define APPLE_KEXT_LEGACY_ABI 0 #else #define APPLE_KEXT_LEGACY_ABI 1 #endif diff --git a/libsyscall/wrappers/remove-counter.c b/libsyscall/wrappers/remove-counter.c index d6a2846d8..fe41f2757 100644 --- a/libsyscall/wrappers/remove-counter.c +++ b/libsyscall/wrappers/remove-counter.c @@ -31,19 +31,11 @@ static int32_t __remove_counter = 0; __uint64_t __get_remove_counter(void) { -#if defined(__arm__) && !defined(_ARM_ARCH_6) - return __remove_counter; -#else return __sync_add_and_fetch(&__remove_counter, 0); -#endif } void __inc_remove_counter(void) { -#if defined(__arm__) && !defined(_ARM_ARCH_6) - __remove_counter++; -#else __sync_add_and_fetch(&__remove_counter, 1); -#endif } diff --git a/osfmk/i386/i386_lock.s b/osfmk/i386/i386_lock.s index 9ea9f982b..0f7bdba3a 100644 --- a/osfmk/i386/i386_lock.s +++ b/osfmk/i386/i386_lock.s @@ -214,9 +214,7 @@ #define PREEMPTION_DISABLE \ incl %gs:CPU_PREEMPTION_LEVEL -#if MACH_LDEBUG || 1 #define PREEMPTION_LEVEL_DEBUG 1 -#endif #if PREEMPTION_LEVEL_DEBUG #define PREEMPTION_ENABLE \ decl %gs:CPU_PREEMPTION_LEVEL ; \ diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index 2e1fbe691..604bc202f 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -1253,6 +1253,7 @@ vm_map_find_space( } *address = start; + assert(start < end); new_entry->vme_start = start; new_entry->vme_end = end; assert(page_aligned(new_entry->vme_start)); @@ -1868,6 +1869,7 @@ StartAgain: ; * new range. */ map->size += (end - entry->vme_end); + assert(entry->vme_start < end); entry->vme_end = end; vm_map_store_update_first_free(map, map->first_free); RETURN(KERN_SUCCESS); @@ -2971,7 +2973,7 @@ vm_map_clip_unnest( * the specified address; if necessary, * it splits the entry into two. */ -static void +void vm_map_clip_start( vm_map_t map, vm_map_entry_t entry, @@ -3038,7 +3040,9 @@ _vm_map_clip_start( vm_map_entry_copy_full(new_entry, entry); new_entry->vme_end = start; + assert(new_entry->vme_start < new_entry->vme_end); entry->offset += (start - entry->vme_start); + assert(start < entry->vme_end); entry->vme_start = start; _vm_map_store_entry_link(map_header, entry->vme_prev, new_entry); @@ -3057,7 +3061,7 @@ _vm_map_clip_start( * the specified address; if necessary, * it splits the entry into two. */ -static void +void vm_map_clip_end( vm_map_t map, vm_map_entry_t entry, @@ -3128,8 +3132,10 @@ _vm_map_clip_end( new_entry = _vm_map_entry_create(map_header); vm_map_entry_copy_full(new_entry, entry); + assert(entry->vme_start < end); new_entry->vme_start = entry->vme_end = end; new_entry->offset += (end - entry->vme_start); + assert(new_entry->vme_start < new_entry->vme_end); _vm_map_store_entry_link(map_header, entry, new_entry); @@ -5876,6 +5882,12 @@ vm_map_copy_overwrite_nested( copy->type = VM_MAP_COPY_ENTRY_LIST; copy->offset = new_offset; + /* + * XXX FBDP + * this does not seem to deal with + * the VM map store (R&B tree) + */ + total_size -= copy_size; copy_size = 0; /* put back remainder of copy in container */ @@ -6520,6 +6532,10 @@ vm_map_copy_overwrite_unaligned( * to the above pass and make sure that no wiring is involved. 
*/ +int vm_map_copy_overwrite_aligned_src_not_internal = 0; +int vm_map_copy_overwrite_aligned_src_not_symmetric = 0; +int vm_map_copy_overwrite_aligned_src_large = 0; + static kern_return_t vm_map_copy_overwrite_aligned( vm_map_t dst_map, @@ -6624,6 +6640,26 @@ vm_map_copy_overwrite_aligned( continue; } +#if !CONFIG_EMBEDDED +#define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */ +#define __TRADEOFF1_COPY_SIZE (128 * 1024) /* 128 KB */ + if (copy_entry->object.vm_object != VM_OBJECT_NULL && + copy_entry->object.vm_object->vo_size >= __TRADEOFF1_OBJ_SIZE && + copy_size <= __TRADEOFF1_COPY_SIZE) { + /* + * Virtual vs. Physical copy tradeoff #1. + * + * Copying only a few pages out of a large + * object: do a physical copy instead of + * a virtual copy, to avoid possibly keeping + * the entire large object alive because of + * those few copy-on-write pages. + */ + vm_map_copy_overwrite_aligned_src_large++; + goto slow_copy; + } +#endif /* !CONFIG_EMBEDDED */ + if (entry->alias >= VM_MEMORY_MALLOC && entry->alias <= VM_MEMORY_MALLOC_LARGE_REUSED) { vm_object_t new_object, new_shadow; @@ -6637,6 +6673,10 @@ vm_map_copy_overwrite_aligned( vm_object_lock_shared(new_object); } while (new_object != VM_OBJECT_NULL && +#if !CONFIG_EMBEDDED + !new_object->true_share && + new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC && +#endif /* !CONFIG_EMBEDDED */ new_object->internal) { new_shadow = new_object->shadow; if (new_shadow == VM_OBJECT_NULL) { @@ -6657,9 +6697,24 @@ vm_map_copy_overwrite_aligned( * let's go off the optimized * path... */ + vm_map_copy_overwrite_aligned_src_not_internal++; vm_object_unlock(new_object); goto slow_copy; } +#if !CONFIG_EMBEDDED + if (new_object->true_share || + new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) { + /* + * Same if there's a "true_share" + * object in the shadow chain, or + * an object with a non-default + * (SYMMETRIC) copy strategy. 
+ */ + vm_map_copy_overwrite_aligned_src_not_symmetric++; + vm_object_unlock(new_object); + goto slow_copy; + } +#endif /* !CONFIG_EMBEDDED */ vm_object_unlock(new_object); } /* @@ -6752,6 +6807,14 @@ vm_map_copy_overwrite_aligned( kern_return_t r; slow_copy: + if (entry->needs_copy) { + vm_object_shadow(&entry->object.vm_object, + &entry->offset, + (entry->vme_end - + entry->vme_start)); + entry->needs_copy = FALSE; + } + dst_object = entry->object.vm_object; dst_offset = entry->offset; @@ -6838,7 +6901,8 @@ vm_map_copy_overwrite_aligned( start += copy_size; vm_map_lock(dst_map); - if (version.main_timestamp == dst_map->timestamp) { + if (version.main_timestamp == dst_map->timestamp && + copy_size != 0) { /* We can safely use saved tmp_entry value */ vm_map_clip_end(dst_map, tmp_entry, start); @@ -7910,6 +7974,7 @@ vm_map_copyin_common( tmp_entry->vme_end = copy_addr + (tmp_entry->vme_end - tmp_entry->vme_start); tmp_entry->vme_start = copy_addr; + assert(tmp_entry->vme_start < tmp_entry->vme_end); copy_addr += tmp_entry->vme_end - tmp_entry->vme_start; tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next; } @@ -10000,6 +10065,7 @@ vm_map_simplify_entry( (this_entry->is_shared == FALSE) ) { _vm_map_store_entry_unlink(&map->hdr, prev_entry); + assert(prev_entry->vme_start < this_entry->vme_end); this_entry->vme_start = prev_entry->vme_start; this_entry->offset = prev_entry->offset; if (prev_entry->is_sub_map) { @@ -11086,6 +11152,7 @@ vm_map_entry_insert( new_entry->vme_end = end; assert(page_aligned(new_entry->vme_start)); assert(page_aligned(new_entry->vme_end)); + assert(new_entry->vme_start < new_entry->vme_end); new_entry->object.vm_object = object; new_entry->offset = offset; @@ -11288,6 +11355,7 @@ vm_map_remap_extract( new_entry->vme_start = map_address; new_entry->vme_end = map_address + tmp_size; + assert(new_entry->vme_start < new_entry->vme_end); new_entry->inheritance = inheritance; new_entry->offset = offset; @@ -13203,3 +13271,85 @@ vm_map_thaw( vm_map_unlock(map); } #endif + +#if !CONFIG_EMBEDDED +/* + * vm_map_entry_should_cow_for_true_share: + * + * Determines if the map entry should be clipped and setup for copy-on-write + * to avoid applying "true_share" to a large VM object when only a subset is + * targeted. + * + * For now, we target only the map entries created for the Objective C + * Garbage Collector, which initially have the following properties: + * - alias == VM_MEMORY_MALLOC + * - wired_count == 0 + * - !needs_copy + * and a VM object with: + * - internal + * - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC + * - !true_share + * - vo_size == ANON_CHUNK_SIZE + */ +boolean_t +vm_map_entry_should_cow_for_true_share( + vm_map_entry_t entry) +{ + vm_object_t object; + + if (entry->is_sub_map) { + /* entry does not point at a VM object */ + return FALSE; + } + + if (entry->needs_copy) { + /* already set for copy_on_write: done! */ + return FALSE; + } + + if (entry->alias != VM_MEMORY_MALLOC) { + /* not tagged as an ObjectiveC's Garbage Collector entry */ + return FALSE; + } + + if (entry->wired_count) { + /* wired: can't change the map entry... */ + return FALSE; + } + + object = entry->object.vm_object; + + if (object == VM_OBJECT_NULL) { + /* no object yet... 
*/ + return FALSE; + } + + if (!object->internal) { + /* not an internal object */ + return FALSE; + } + + if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) { + /* not the default copy strategy */ + return FALSE; + } + + if (object->true_share) { + /* already true_share: too late to avoid it */ + return FALSE; + } + + if (object->vo_size != ANON_CHUNK_SIZE) { + /* not an object created for the ObjC Garbage Collector */ + return FALSE; + } + + /* + * All the criteria match: we have a large object being targeted for "true_share". + * To limit the adverse side-effects linked with "true_share", tell the caller to + * try and avoid setting up the entire object for "true_share" by clipping the + * targeted range and setting it up for copy-on-write. + */ + return TRUE; +} +#endif /* !CONFIG_EMBEDDED */ diff --git a/osfmk/vm/vm_map.h b/osfmk/vm/vm_map.h index d27859858..d8ab731e9 100644 --- a/osfmk/vm/vm_map.h +++ b/osfmk/vm/vm_map.h @@ -468,6 +468,19 @@ extern kern_return_t vm_map_find_space( int flags, vm_map_entry_t *o_entry); /* OUT */ +extern void vm_map_clip_start( + vm_map_t map, + vm_map_entry_t entry, + vm_map_offset_t endaddr); +extern void vm_map_clip_end( + vm_map_t map, + vm_map_entry_t entry, + vm_map_offset_t endaddr); +#if !CONFIG_EMBEDDED +extern boolean_t vm_map_entry_should_cow_for_true_share( + vm_map_entry_t entry); +#endif /* !CONFIG_EMBEDDED */ + /* Lookup map entry containing or the specified address in the given map */ extern boolean_t vm_map_lookup_entry( vm_map_t map, diff --git a/osfmk/vm/vm_map_store.c b/osfmk/vm/vm_map_store.c index 58148a964..ccfcd062f 100644 --- a/osfmk/vm/vm_map_store.c +++ b/osfmk/vm/vm_map_store.c @@ -101,6 +101,7 @@ void vm_map_store_copy_insert( vm_map_t map, vm_map_entry_t after_where, vm_map_ void _vm_map_store_entry_link( struct vm_map_header * mapHdr, vm_map_entry_t after_where, vm_map_entry_t entry) { + assert(entry->vme_start < entry->vme_end); vm_map_store_entry_link_ll(mapHdr, after_where, entry); #ifdef VM_MAP_STORE_USE_RB vm_map_store_entry_link_rb(mapHdr, after_where, entry); diff --git a/osfmk/vm/vm_object.c b/osfmk/vm/vm_object.c index 1c0138d82..2f7d54e3c 100644 --- a/osfmk/vm/vm_object.c +++ b/osfmk/vm/vm_object.c @@ -3894,6 +3894,10 @@ vm_object_shadow( register vm_object_t result; source = *object; + assert(source != VM_OBJECT_NULL); + if (source == VM_OBJECT_NULL) + return FALSE; + #if 0 /* * XXX FBDP diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c index acf4d64bd..0761db5ef 100644 --- a/osfmk/vm/vm_pageout.c +++ b/osfmk/vm/vm_pageout.c @@ -3956,10 +3956,30 @@ vm_map_create_upl( return KERN_SUCCESS; } + + if (entry->is_sub_map) { + vm_map_t submap; + + submap = entry->object.sub_map; + local_start = entry->vme_start; + local_offset = entry->offset; + + vm_map_reference(submap); + vm_map_unlock_read(map); + + ret = vm_map_create_upl(submap, + local_offset + (offset - local_start), + upl_size, upl, page_list, count, flags); + vm_map_deallocate(submap); + + return ret; + } + if (entry->object.vm_object == VM_OBJECT_NULL || !entry->object.vm_object->phys_contiguous) { if ((*upl_size/PAGE_SIZE) > MAX_UPL_SIZE) *upl_size = MAX_UPL_SIZE * PAGE_SIZE; } + /* * Create an object if necessary. 
*/ @@ -3978,6 +3998,42 @@ vm_map_create_upl( vm_map_unlock_read(map); return KERN_PROTECTION_FAILURE; } + +#if !CONFIG_EMBEDDED + local_object = entry->object.vm_object; + if (vm_map_entry_should_cow_for_true_share(entry) && + local_object->vo_size > *upl_size && + *upl_size != 0) { + vm_prot_t prot; + + /* + * Set up the targeted range for copy-on-write to avoid + * applying true_share/copy_delay to the entire object. + */ + + if (vm_map_lock_read_to_write(map)) { + goto REDISCOVER_ENTRY; + } + + vm_map_clip_start(map, entry, vm_map_trunc_page(offset)); + vm_map_clip_end(map, entry, vm_map_round_page(offset + *upl_size)); + prot = entry->protection & ~VM_PROT_WRITE; + if (override_nx(map, entry->alias) && prot) + prot |= VM_PROT_EXECUTE; + vm_object_pmap_protect(local_object, + entry->offset, + entry->vme_end - entry->vme_start, + ((entry->is_shared || map->mapped) + ? PMAP_NULL + : map->pmap), + entry->vme_start, + prot); + entry->needs_copy = TRUE; + + vm_map_lock_write_to_read(map); + } +#endif /* !CONFIG_EMBEDDED */ + if (entry->needs_copy) { /* * Honor copy-on-write for COPY_SYMMETRIC @@ -4012,23 +4068,6 @@ vm_map_create_upl( goto REDISCOVER_ENTRY; } } - if (entry->is_sub_map) { - vm_map_t submap; - - submap = entry->object.sub_map; - local_start = entry->vme_start; - local_offset = entry->offset; - - vm_map_reference(submap); - vm_map_unlock_read(map); - - ret = vm_map_create_upl(submap, - local_offset + (offset - local_start), - upl_size, upl, page_list, count, flags); - vm_map_deallocate(submap); - - return ret; - } if (sync_cow_data) { if (entry->object.vm_object->shadow || entry->object.vm_object->copy) { local_object = entry->object.vm_object; diff --git a/osfmk/vm/vm_user.c b/osfmk/vm/vm_user.c index de18c16a1..8271d71b2 100644 --- a/osfmk/vm/vm_user.c +++ b/osfmk/vm/vm_user.c @@ -1833,6 +1833,8 @@ mach_make_memory_entry_64( vm_prot_t original_protections, mask_protections; unsigned int wimg_mode; + boolean_t force_shadow = FALSE; + if (((permission & 0x00FF0000) & ~(MAP_MEM_ONLY | MAP_MEM_NAMED_CREATE | @@ -2173,6 +2175,35 @@ mach_make_memory_entry_64( } } +#if !CONFIG_EMBEDDED + if (vm_map_entry_should_cow_for_true_share(map_entry) && + object->vo_size > map_size && + map_size != 0) { + /* + * Set up the targeted range for copy-on-write to + * limit the impact of "true_share"/"copy_delay" to + * that range instead of the entire VM object... + */ + + vm_object_unlock(object); + if (vm_map_lock_read_to_write(target_map)) { + vm_object_deallocate(object); + target_map = original_map; + goto redo_lookup; + } + + vm_map_clip_start(target_map, map_entry, vm_map_trunc_page(offset)); + vm_map_clip_end(target_map, map_entry, vm_map_round_page(offset) + map_size); + force_shadow = TRUE; + + map_size = map_entry->vme_end - map_entry->vme_start; + total_size = map_size; + + vm_map_lock_write_to_read(target_map); + vm_object_lock(object); + } +#endif /* !CONFIG_EMBEDDED */ + if(object->internal) { /* vm_map_lookup_locked will create a shadow if */ /* needs_copy is set but does not check for the */ @@ -2180,9 +2211,11 @@ mach_make_memory_entry_64( /* set up an object which will not be pulled from */ /* under us. 
*/ - if ((map_entry->needs_copy || object->shadowed || - (object->vo_size > total_size)) - && !object->true_share) { + if (force_shadow || + ((map_entry->needs_copy || + object->shadowed || + (object->vo_size > total_size)) && + !object->true_share)) { /* * We have to unlock the VM object before * trying to upgrade the VM map lock, to diff --git a/osfmk/x86_64/idt64.s b/osfmk/x86_64/idt64.s index fe6cb1295..50bc8b991 100644 --- a/osfmk/x86_64/idt64.s +++ b/osfmk/x86_64/idt64.s @@ -268,14 +268,13 @@ L_32bit_dispatch: /* 32-bit user task */ mov %eax, R32_EIP(%rsp) mov ISC32_RFLAGS(%rsp), %eax mov %eax, R32_EFLAGS(%rsp) - mov ISC32_CS(%rsp), %esi /* %esi := %cs for later */ - - mov %esi, R32_CS(%rsp) mov ISC32_RSP(%rsp), %eax mov %eax, R32_UESP(%rsp) mov ISC32_SS(%rsp), %eax mov %eax, R32_SS(%rsp) L_32bit_dispatch_after_fault: + mov ISC32_CS(%rsp), %esi /* %esi := %cs for later */ + mov %esi, R32_CS(%rsp) mov ISC32_TRAPNO(%rsp), %ebx /* %ebx := trapno for later */ mov %ebx, R32_TRAPNO(%rsp) mov ISC32_ERR(%rsp), %eax diff --git a/security/mac_base.c b/security/mac_base.c index 1b67d3c0e..33dd04457 100644 --- a/security/mac_base.c +++ b/security/mac_base.c @@ -167,9 +167,6 @@ SYSCTL_UINT(_security_mac, OID_AUTO, label_mbufs, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_label_mbufs, 0, "Label all MBUFs"); #endif -#if !defined(CONFIG_MACF_ALWAYS_LABEL_MBUF) && 0 -static int mac_labelmbufs = 0; -#endif /* * Flag to indicate whether or not we should allocate label storage for @@ -744,26 +741,6 @@ mac_policy_removefrom_labellist(mac_policy_handle_t handle) static void mac_policy_updateflags(void) { -#if !defined(CONFIG_MACF_ALWAYS_LABEL_MBUF) && 0 /* port to new list style */ - - struct mac_policy_conf *tmpc; - int labelmbufs; - - mac_policy_assert_exclusive(); - - labelmbufs = 0; - - /* XXX - convert to new list structure */ - LIST_FOREACH(tmpc, &mac_static_policy_list, mpc_list) { - if (tmpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_LABELMBUFS) - labelmbufs++; - } - LIST_FOREACH(tmpc, &mac_policy_list, mpc_list) { - if (tmpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_LABELMBUFS) - labelmbufs++; - } - mac_labelmbufs = (labelmbufs != 0); -#endif } static __inline void