gnutls-commit

[SCM] GNU gnutls branch, master, updated. gnutls_3_0_17-14-g9567d93


From: Nikos Mavrogiannopoulos
Subject: [SCM] GNU gnutls branch, master, updated. gnutls_3_0_17-14-g9567d93
Date: Mon, 19 Mar 2012 21:58:37 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU gnutls".

http://git.savannah.gnu.org/cgit/gnutls.git/commit/?id=9567d93c07f87ecb5c8560b7a45125de28710bc1

The branch, master has been updated
       via  9567d93c07f87ecb5c8560b7a45125de28710bc1 (commit)
      from  abbfc182f738c654ebeaf75cf6893acc0947699b (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 9567d93c07f87ecb5c8560b7a45125de28710bc1
Author: Nikos Mavrogiannopoulos <address@hidden>
Date:   Mon Mar 19 22:55:14 2012 +0100

    updated openssl code

-----------------------------------------------------------------------

Summary of changes:
 NEWS                                               |    2 +
 devel/perlasm/e_padlock-x86.pl                     |  104 +++++++--
 devel/perlasm/e_padlock-x86_64.pl                  |  178 ++++++++++-----
 devel/perlasm/ghash-x86.pl                         |   28 ++--
 lib/accelerated/x86/README                         |    4 +-
 lib/accelerated/x86/coff/padlock-x86-64-coff.s     |  162 +++++++++++----
 lib/accelerated/x86/coff/padlock-x86-coff.s        |  232 ++++++++++++++------
 lib/accelerated/x86/elf/padlock-x86-64.s           |  162 +++++++++++----
 lib/accelerated/x86/license.txt                    |    2 +-
 lib/accelerated/x86/macosx/padlock-x86-64-macosx.s |  162 +++++++++++----
 lib/accelerated/x86/macosx/padlock-x86-macosx.s    |  234 ++++++++++++++------
 11 files changed, 924 insertions(+), 346 deletions(-)

diff --git a/NEWS b/NEWS
index 27a258c..93fa1ab 100644
--- a/NEWS
+++ b/NEWS
@@ -7,6 +7,8 @@ See the end for copying conditions.
 ** certtool: Avoid a Y2K38 bug when generating certificates.
 Patch by Robert Millan.
 
+** libgnutls: Updated assembler files.
+
 ** libgnutls: Time in generated certificates is stored
 as GeneralizedTime instead of UTCTime (which only stores
 2 digits of a year).
diff --git a/devel/perlasm/e_padlock-x86.pl b/devel/perlasm/e_padlock-x86.pl
index 7a52528..71ecad3 100644
--- a/devel/perlasm/e_padlock-x86.pl
+++ b/devel/perlasm/e_padlock-x86.pl
@@ -37,7 +37,7 @@ require "x86asm.pl";
 
 &asm_init($ARGV[0],$0);
 
-%PADLOCK_MARGIN=(ecb=>128, cbc=>64); # prefetch errata
+%PADLOCK_PREFETCH=(ecb=>128, cbc=>64); # prefetch errata
 $PADLOCK_CHUNK=512;    # Must be a power of 2 larger than 16
 
 $ctx="edx";
@@ -188,10 +188,6 @@ my ($mode,$opcode) = @_;
        &movq   ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter
                                        } else {
        &xor    ("ebx","ebx");
-    if ($PADLOCK_MARGIN{$mode}) {
-       &cmp    ($len,$PADLOCK_MARGIN{$mode});
-       &jbe    (&label("${mode}_short"));
-    }
        &test   (&DWP(0,$ctx),1<<5);    # align bit in control word
        &jnz    (&label("${mode}_aligned"));
        &test   ($out,0x0f);
@@ -212,7 +208,27 @@ my ($mode,$opcode) = @_;
        &neg    ("eax");
        &and    ($chunk,$PADLOCK_CHUNK-1);      # chunk=len%PADLOCK_CHUNK
        &lea    ("esp",&DWP(0,"eax","ebp"));    # alloca
+       &mov    ("eax",$PADLOCK_CHUNK);
+       &cmovz  ($chunk,"eax");                 # chunk=chunk?:PADLOCK_CHUNK
+       &mov    ("eax","ebp");
+       &and    ("ebp",-16);
        &and    ("esp",-16);
+       &mov    (&DWP(16,"ebp"),"eax");
+    if ($PADLOCK_PREFETCH{$mode}) {
+       &cmp    ($len,$chunk);
+       &ja     (&label("${mode}_loop"));
+       &mov    ("eax",$inp);           # check if prefetch crosses page
+       &cmp    ("ebp","esp");
+       &cmove  ("eax",$out);
+       &add    ("eax",$len);
+       &neg    ("eax");
+       &and    ("eax",0xfff);          # distance to page boundary
+       &cmp    ("eax",$PADLOCK_PREFETCH{$mode});
+       &mov    ("eax",-$PADLOCK_PREFETCH{$mode});
+       &cmovae ("eax",$chunk);         # mask=distance<prefetch?-prefetch:-1
+       &and    ($chunk,"eax");
+       &jz     (&label("${mode}_unaligned_tail"));
+    }
        &jmp    (&label("${mode}_loop"));
 
 &set_label("${mode}_loop",16);
@@ -276,8 +292,8 @@ my ($mode,$opcode) = @_;
        &test   ($out,0x0f);
        &jz     (&label("${mode}_out_aligned"));
        &mov    ($len,$chunk);
-       &shr    ($len,2);
        &lea    ($inp,&DWP(0,"esp"));
+       &shr    ($len,2);
        &data_byte(0xf3,0xa5);                  # rep movsl
        &sub    ($out,$chunk);
 &set_label("${mode}_out_aligned");
@@ -288,7 +304,30 @@ my ($mode,$opcode) = @_;
        &add    ($inp,$chunk);
        &sub    ($len,$chunk);
        &mov    ($chunk,$PADLOCK_CHUNK);
+    if (!$PADLOCK_PREFETCH{$mode}) {
        &jnz    (&label("${mode}_loop"));
+    } else {
+       &jz     (&label("${mode}_break"));
+       &cmp    ($len,$chunk);
+       &jae    (&label("${mode}_loop"));
+
+&set_label("${mode}_unaligned_tail");
+       &xor    ("eax","eax");
+       &cmp    ("esp","ebp");
+       &cmove  ("eax",$len);
+       &sub    ("esp","eax");                  # alloca
+       &mov    ("eax", $out);                  # save parameters
+       &mov    ($chunk,$len);
+       &shr    ($len,2);
+       &lea    ($out,&DWP(0,"esp"));
+       &data_byte(0xf3,0xa5);                  # rep movsl
+       &mov    ($inp,"esp");
+       &mov    ($out,"eax");                   # restore parameters
+       &mov    ($len,$chunk);
+       &jmp    (&label("${mode}_loop"));
+
+&set_label("${mode}_break",16);
+    }
                                                if ($mode ne "ctr32") {
        &cmp    ("esp","ebp");
        &je     (&label("${mode}_done"));
@@ -302,28 +341,24 @@ my ($mode,$opcode) = @_;
        &ja     (&label("${mode}_bzero"));
 
 &set_label("${mode}_done");
+       &mov    ("ebp",&DWP(16,"ebp"));
        &lea    ("esp",&DWP(24,"ebp"));
                                                if ($mode ne "ctr32") {
        &jmp    (&label("${mode}_exit"));
 
-&set_label("${mode}_short",16);
-       &xor    ("eax","eax");
-       &lea    ("ebp",&DWP(-24,"esp"));
-       &sub    ("eax",$len);
-       &lea    ("esp",&DWP(0,"eax","ebp"));
-       &and    ("esp",-16);
-       &xor    ($chunk,$chunk);
-&set_label("${mode}_short_copy");
-       &movups ("xmm0",&QWP(0,$inp,$chunk));
-       &lea    ($chunk,&DWP(16,$chunk));
-       &cmp    ($len,$chunk);
-       &movaps (&QWP(-16,"esp",$chunk),"xmm0");
-       &ja     (&label("${mode}_short_copy"));
-       &mov    ($inp,"esp");
-       &mov    ($chunk,$len);
-       &jmp    (&label("${mode}_loop"));
-
 &set_label("${mode}_aligned",16);
+    if ($PADLOCK_PREFETCH{$mode}) {
+       &lea    ("ebp",&DWP(0,$inp,$len));
+       &neg    ("ebp");
+       &and    ("ebp",0xfff);                  # distance to page boundary
+       &xor    ("eax","eax");
+       &cmp    ("ebp",$PADLOCK_PREFETCH{$mode});
+       &mov    ("ebp",$PADLOCK_PREFETCH{$mode}-1);
+       &cmovae ("ebp","eax");
+       &and    ("ebp",$len);                   # remainder
+       &sub    ($len,"ebp");
+       &jz     (&label("${mode}_aligned_tail"));
+    }
        &lea    ("eax",&DWP(-16,$ctx));         # ivp
        &lea    ("ebx",&DWP(16,$ctx));          # key
        &shr    ($len,4);                       # len/=AES_BLOCK_SIZE
@@ -332,6 +367,29 @@ my ($mode,$opcode) = @_;
        &movaps ("xmm0",&QWP(0,"eax"));
        &movaps (&QWP(-16,$ctx),"xmm0");        # copy [or refresh] iv
                                                }
+    if ($PADLOCK_PREFETCH{$mode}) {
+       &test   ("ebp","ebp");
+       &jz     (&label("${mode}_exit"));
+
+&set_label("${mode}_aligned_tail");
+       &mov    ($len,"ebp");
+       &lea    ("ebp",&DWP(-24,"esp"));
+       &mov    ("esp","ebp");
+       &mov    ("eax","ebp");
+       &sub    ("esp",$len);
+       &and    ("ebp",-16);
+       &and    ("esp",-16);
+       &mov    (&DWP(16,"ebp"),"eax");
+       &mov    ("eax", $out);                  # save parameters
+       &mov    ($chunk,$len);
+       &shr    ($len,2);
+       &lea    ($out,&DWP(0,"esp"));
+       &data_byte(0xf3,0xa5);                  # rep movsl
+       &mov    ($inp,"esp");
+       &mov    ($out,"eax");                   # restore parameters
+       &mov    ($len,$chunk);
+       &jmp    (&label("${mode}_loop"));
+    }
 &set_label("${mode}_exit");                    }
        &mov    ("eax",1);
        &lea    ("esp",&DWP(4,"esp"));          # popf
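
In place of the old PADLOCK_MARGIN short-input copy, the updated padlock code takes a slow path only when the engine's prefetch could actually fault: it measures the distance from the end of the data to the next 4 KiB page boundary ("neg; and $0xfff") and compares it against the per-mode prefetch size. Below is a stand-alone C sketch of that test; the names and the main() harness are illustrative, not part of the patch.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Illustrative prefetch window: the patch uses 64 bytes for CBC, 128 for ECB. */
#define PREFETCH_BYTES 64

/* Nonzero if processing `len` bytes at `p` could let the engine's fixed-size
 * prefetch read past the next 4 KiB page boundary (possibly unmapped). */
static int prefetch_may_fault(const void *p, size_t len)
{
    uintptr_t end = (uintptr_t)p + len;
    /* Same value the assembly computes with "neg %rax; and $0xfff,%rax":
     * distance from the end of the data to the next page boundary. */
    size_t to_boundary = (size_t)(((uintptr_t)0 - end) & 0xfff);
    return to_boundary < PREFETCH_BYTES;
}

int main(void)
{
    static unsigned char arena[8192];
    uintptr_t base = ((uintptr_t)arena + 0xfff) & ~(uintptr_t)0xfff; /* page-aligned spot */
    printf("ends 256 bytes before a boundary: %d\n",
           prefetch_may_fault((void *)base, 4096 - 256));
    printf("ends 16 bytes before a boundary:  %d\n",
           prefetch_may_fault((void *)base, 4096 - 16));
    return 0;
}
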
diff --git a/devel/perlasm/e_padlock-x86_64.pl b/devel/perlasm/e_padlock-x86_64.pl
index cbffb9d..4d71d06 100644
--- a/devel/perlasm/e_padlock-x86_64.pl
+++ b/devel/perlasm/e_padlock-x86_64.pl
@@ -27,7 +27,7 @@ open STDOUT,"| $^X $xlate $flavour $output";
 
 $code=".text\n";
 
-%PADLOCK_MARGIN=(ecb=>128, cbc=>64, ctr32=>64);        # prefetch errata
+%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);      # prefetch errata
 $PADLOCK_CHUNK=512;    # Must be a power of 2 between 32 and 2^20
 
 $ctx="%rdx";
@@ -285,17 +285,6 @@ padlock_${mode}_encrypt:
        lea     16($ctx),$ctx           # control word
        xor     %eax,%eax
        xor     %ebx,%ebx
-___
-# Formally speaking correct condtion is $len<=$margin and $inp+$margin
-# crosses page boundary [and next page is unreadable]. But $inp can
-# be unaligned in which case data can be copied to $out if latter is
-# aligned, in which case $out+$margin has to be checked. Covering all
-# cases appears more complicated than just copying short input...
-$code.=<<___   if ($PADLOCK_MARGIN{$mode});
-       cmp     \$$PADLOCK_MARGIN{$mode},$len
-       jbe     .L${mode}_short
-___
-$code.=<<___;
        testl   \$`1<<5`,($ctx)         # align bit in control word
        jnz     .L${mode}_aligned
        test    \$0x0f,$out
@@ -315,6 +304,8 @@ $code.=<<___;
        neg     %rax
        and     \$$PADLOCK_CHUNK-1,$chunk       # chunk%=PADLOCK_CHUNK
        lea     (%rax,%rbp),%rsp
+       mov     \$$PADLOCK_CHUNK,%rax
+       cmovz   %rax,$chunk                     # chunk=chunk?:PADLOCK_CHUNK
 ___
 $code.=<<___                           if ($mode eq "ctr32");
 .L${mode}_reenter:
@@ -322,10 +313,27 @@ $code.=<<___                           if ($mode eq "ctr32");
        bswap   %eax
        neg     %eax
        and     \$`$PADLOCK_CHUNK/16-1`,%eax
-       jz      .L${mode}_loop
+       mov     \$$PADLOCK_CHUNK,$chunk
        shl     \$4,%eax
+       cmovz   $chunk,%rax
        cmp     %rax,$len
        cmova   %rax,$chunk             # don't let counter cross PADLOCK_CHUNK
+       cmovbe  $len,$chunk
+___
+$code.=<<___                           if ($PADLOCK_PREFETCH{$mode});
+       cmp     $chunk,$len
+       ja      .L${mode}_loop
+       mov     $inp,%rax               # check if prefetch crosses page
+       cmp     %rsp,%rbp
+       cmove   $out,%rax
+       add     $len,%rax
+       neg     %rax
+       and     \$0xfff,%rax            # distance to page boundary
+       cmp     \$$PADLOCK_PREFETCH{$mode},%rax
+       mov     \$-$PADLOCK_PREFETCH{$mode},%rax
+       cmovae  $chunk,%rax             # mask=distance<prefetch?-prefetch:-1
+       and     %rax,$chunk
+       jz      .L${mode}_unaligned_tail
 ___
 $code.=<<___;
        jmp     .L${mode}_loop
@@ -360,12 +368,12 @@ ___
 $code.=<<___                           if ($mode eq "ctr32");
        mov     -4($ctx),%eax           # pull 32-bit counter
        test    \$0xffff0000,%eax
-       jnz     .L${mode}_no_corr
+       jnz     .L${mode}_no_carry
        bswap   %eax
        add     \$0x10000,%eax
        bswap   %eax
        mov     %eax,-4($ctx)
-.L${mode}_no_corr:
+.L${mode}_no_carry:
 ___
 $code.=<<___;
        mov     %r8,$out                # restore paramters
@@ -373,8 +381,8 @@ $code.=<<___;
        test    \$0x0f,$out
        jz      .L${mode}_out_aligned
        mov     $chunk,$len
-       shr     \$3,$len
        lea     (%rsp),$inp
+       shr     \$3,$len
        .byte   0xf3,0x48,0xa5          # rep movsq
        sub     $chunk,$out
 .L${mode}_out_aligned:
@@ -384,9 +392,52 @@ $code.=<<___;
        add     $chunk,$inp
        sub     $chunk,$len
        mov     \$$PADLOCK_CHUNK,$chunk
+___
+                                       if (!$PADLOCK_PREFETCH{$mode}) {
+$code.=<<___;
        jnz     .L${mode}_loop
-
+___
+                                       } else {
+$code.=<<___;
+       jz      .L${mode}_break
+       cmp     $chunk,$len
+       jae     .L${mode}_loop
+___
+$code.=<<___                           if ($mode eq "ctr32");
+       mov     $len,$chunk
+       mov     $inp,%rax               # check if prefetch crosses page
        cmp     %rsp,%rbp
+       cmove   $out,%rax
+       add     $len,%rax
+       neg     %rax
+       and     \$0xfff,%rax            # distance to page boundary
+       cmp     \$$PADLOCK_PREFETCH{$mode},%rax
+       mov     \$-$PADLOCK_PREFETCH{$mode},%rax
+       cmovae  $chunk,%rax
+       and     %rax,$chunk
+       jnz     .L${mode}_loop
+___
+$code.=<<___;
+.L${mode}_unaligned_tail:
+       xor     %eax,%eax
+       cmp     %rsp,%rbp
+       cmove   $len,%rax
+       mov     $out,%r8                # save parameters
+       mov     $len,$chunk
+       sub     %rax,%rsp               # alloca
+       shr     \$3,$len
+       lea     (%rsp),$out
+       .byte   0xf3,0x48,0xa5          # rep movsq
+       mov     %rsp,$inp
+       mov     %r8, $out               # restore parameters
+       mov     $chunk,$len
+       jmp     .L${mode}_loop
+.align 16
+.L${mode}_break:
+___
+                                       }
+$code.=<<___;
+       cmp     %rbp,%rsp
        je      .L${mode}_done
 
        pxor    %xmm0,%xmm0
@@ -400,70 +451,87 @@ $code.=<<___;
 .L${mode}_done:
        lea     (%rbp),%rsp
        jmp     .L${mode}_exit
-___
-$code.=<<___ if ($PADLOCK_MARGIN{$mode});
-.align 16
-.L${mode}_short:
-       mov     %rsp,%rbp
-       sub     $len,%rsp
-       xor     $chunk,$chunk
-.L${mode}_short_copy:
-       movups  ($inp,$chunk),%xmm0
-       lea     16($chunk),$chunk
-       cmp     $chunk,$len
-       movaps  %xmm0,-16(%rsp,$chunk)
-       ja      .L${mode}_short_copy
-       mov     %rsp,$inp
-       mov     $len,$chunk
-       jmp     .L${mode}_`${mode} eq "ctr32"?"reenter":"loop"`
-___
-$code.=<<___;
+
 .align 16
 .L${mode}_aligned:
 ___
 $code.=<<___                           if ($mode eq "ctr32");
        mov     -4($ctx),%eax           # pull 32-bit counter
-       mov     \$`16*0x10000`,$chunk
        bswap   %eax
-       cmp     $len,$chunk
-       cmova   $len,$chunk
        neg     %eax
        and     \$0xffff,%eax
-       jz      .L${mode}_aligned_loop
+       mov     \$`16*0x10000`,$chunk
        shl     \$4,%eax
+       cmovz   $chunk,%rax
        cmp     %rax,$len
        cmova   %rax,$chunk             # don't let counter cross 2^16
-       jmp     .L${mode}_aligned_loop
-.align 16
+       cmovbe  $len,$chunk
+       jbe     .L${mode}_aligned_skip
+
 .L${mode}_aligned_loop:
-       cmp     $len,$chunk
-       cmova   $len,$chunk
        mov     $len,%r10               # save parameters
        mov     $chunk,$len
        mov     $chunk,%r11
-___
-$code.=<<___;
+
        lea     -16($ctx),%rax          # ivp
        lea     16($ctx),%rbx           # key
        shr     \$4,$len                # len/=AES_BLOCK_SIZE
        .byte   0xf3,0x0f,0xa7,$opcode  # rep xcrypt*
-___
-$code.=<<___                           if ($mode !~ /ecb|ctr/);
-       movdqa  (%rax),%xmm0
-       movdqa  %xmm0,-16($ctx)         # copy [or refresh] iv
-___
-$code.=<<___                           if ($mode eq "ctr32");
+
        mov     -4($ctx),%eax           # pull 32-bit counter
        bswap   %eax
        add     \$0x10000,%eax
        bswap   %eax
        mov     %eax,-4($ctx)
 
-       mov     %r11,$chunk             # restore paramters
-       mov     %r10,$len
-       sub     $chunk,$len
+       mov     %r10,$len               # restore paramters
+       sub     %r11,$len
        mov     \$`16*0x10000`,$chunk
-       jnz     .L${mode}_aligned_loop
+       jz      .L${mode}_exit
+       cmp     $chunk,$len
+       jae     .L${mode}_aligned_loop
+
+.L${mode}_aligned_skip:
+___
+$code.=<<___                           if ($PADLOCK_PREFETCH{$mode});
+       lea     ($inp,$len),%rbp
+       neg     %rbp
+       and     \$0xfff,%rbp            # distance to page boundary
+       xor     %eax,%eax
+       cmp     \$$PADLOCK_PREFETCH{$mode},%rbp
+       mov     \$$PADLOCK_PREFETCH{$mode}-1,%rbp
+       cmovae  %rax,%rbp
+       and     $len,%rbp               # remainder
+       sub     %rbp,$len
+       jz      .L${mode}_aligned_tail
+___
+$code.=<<___;
+       lea     -16($ctx),%rax          # ivp
+       lea     16($ctx),%rbx           # key
+       shr     \$4,$len                # len/=AES_BLOCK_SIZE
+       .byte   0xf3,0x0f,0xa7,$opcode  # rep xcrypt*
+___
+$code.=<<___                           if ($mode !~ /ecb|ctr/);
+       movdqa  (%rax),%xmm0
+       movdqa  %xmm0,-16($ctx)         # copy [or refresh] iv
+___
+$code.=<<___                           if ($PADLOCK_PREFETCH{$mode});
+       test    %rbp,%rbp               # check remainder
+       jz      .L${mode}_exit
+
+.L${mode}_aligned_tail:
+       mov     $out,%r8
+       mov     %rbp,$chunk
+       mov     %rbp,$len
+       lea     (%rsp),%rbp
+       sub     $len,%rsp
+       shr     \$3,$len
+       lea     (%rsp),$out
+       .byte   0xf3,0x48,0xa5          # rep movsq     
+       lea     (%r8),$out
+       lea     (%rsp),$inp
+       mov     $chunk,$len
+       jmp     .L${mode}_loop
 ___
 $code.=<<___;
 .L${mode}_exit:
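
The new ${mode}_unaligned_tail / ${mode}_aligned_tail paths handle the bytes flagged by that test: instead of copying every "short" input up front, as the removed .L${mode}_short code did, only the risky tail is bounced through an aligned scratch area on the stack, processed there, and copied back. A minimal C sketch of the idea follows; xcrypt_blocks is a placeholder for the "rep xcrypt" sequence, not a real PadLock call.

#include <stdint.h>
#include <string.h>
#include <stdalign.h>

/* Placeholder for the PadLock "rep xcrypt" instruction; it just copies here
 * so the sketch stays runnable. */
static void xcrypt_blocks(uint8_t *out, const uint8_t *in, size_t len)
{
    memmove(out, in, len);
}

/* Bounce the last `len` bytes (len <= sizeof bounce) through stack scratch so
 * the engine's prefetch stays inside memory we own. */
static void encrypt_tail_bounced(uint8_t *out, const uint8_t *in, size_t len)
{
    alignas(16) uint8_t bounce[512];    /* PADLOCK_CHUNK-sized scratch */

    memcpy(bounce, in, len);            /* "rep movs" into the scratch area */
    xcrypt_blocks(bounce, bounce, len);
    memcpy(out, bounce, len);           /* back out if the caller's buffer was unaligned */
    memset(bounce, 0, sizeof(bounce));  /* mirrors the bzero of the stack region */
}

int main(void)
{
    uint8_t in[80] = {0}, result[80];
    encrypt_tail_bounced(result, in, sizeof(in));
    return 0;
}
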
diff --git a/devel/perlasm/ghash-x86.pl b/devel/perlasm/ghash-x86.pl
index 1b9adfb..2a1819c 100644
--- a/devel/perlasm/ghash-x86.pl
+++ b/devel/perlasm/ghash-x86.pl
@@ -12,14 +12,14 @@
 # The module implements "4-bit" GCM GHASH function and underlying
 # single multiplication operation in GF(2^128). "4-bit" means that it
 # uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
-# code paths: vanilla x86 and vanilla MMX. Former will be executed on
-# 486 and Pentium, latter on all others. MMX GHASH features so called
+# code paths: vanilla x86 and vanilla SSE. Former will be executed on
+# 486 and Pentium, latter on all others. SSE GHASH features so called
 # "528B" variant of "4-bit" method utilizing additional 256+16 bytes
 # of per-key storage [+512 bytes shared table]. Performance results
 # are for streamed GHASH subroutine and are expressed in cycles per
 # processed byte, less is better:
 #
-#              gcc 2.95.3(*)   MMX assembler   x86 assembler
+#              gcc 2.95.3(*)   SSE assembler   x86 assembler
 #
 # Pentium      105/111(**)     -               50
 # PIII         68 /75          12.2            24
@@ -30,7 +30,7 @@
 # (*)  gcc 3.4.x was observed to generate few percent slower code,
 #      which is one of reasons why 2.95.3 results were chosen,
 #      another reason is lack of 3.4.x results for older CPUs;
-#      comparison with MMX results is not completely fair, because C
+#      comparison with SSE results is not completely fair, because C
 #      results are for vanilla "256B" implementation, while
 #      assembler results are for "528B";-)
 # (**) second number is result for code compiled with -fPIC flag,
@@ -40,8 +40,8 @@
 #
 # To summarize, it's >2-5 times faster than gcc-generated code. To
 # anchor it to something else SHA1 assembler processes one byte in
-# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
-# particular, see comment at the end of the file...
+# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE
+# in particular, see comment at the end of the file...
 
 # May 2010
 #
@@ -331,7 +331,7 @@ if (!$x86only) {{{
 
 &static_label("rem_4bit");
 
-if (0) {{      # "May" MMX version is kept for reference...
+if (!$sse2) {{ # pure-MMX "May" version...
 
 $S=12;         # shift factor for rem_4bit
 
@@ -1273,13 +1273,6 @@ my ($Xhi,$Xi)=@_;
 &set_label("bswap",64);
        &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
        &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
-}}     # $sse2
-
-&set_label("rem_4bit",64);
-       &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
-       &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
-       &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
-       &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
 &set_label("rem_8bit",64);
        &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
        &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
@@ -1313,6 +1306,13 @@ my ($Xhi,$Xi)=@_;
        &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
        &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
        &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
+}}     # $sse2
+
+&set_label("rem_4bit",64);
+       &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
+       &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
+       &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
+       &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
 }}}    # !$x86only
 
 &asciz("GHASH for x86, CRYPTOGAMS by <address@hidden>");
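
Besides the MMX-to-SSE renaming in the comments, these hunks re-enable the pure-MMX "May" implementation when SSE is unavailable (the "if (0)" guard becomes "if (!$sse2)") and move the rem_4bit table after the rem_8bit data, outside the block closed by "}} # $sse2". All sixteen rem_4bit constants derive from 0x1C20 (0xE1 << 5, the reflected GCM polynomial positioned for the 4-bit method); the short stand-alone C generator below, which is not part of the patch, reproduces them. The script then shifts each entry left by $S (12) and pairs it with a zero word when emitting the table.

#include <stdio.h>
#include <stdint.h>

/* Reproduce the rem_4bit constants from ghash-x86.pl: entry n is the XOR of
 * 0x1C20 shifted left by each set bit of n. */
int main(void)
{
    for (unsigned n = 0; n < 16; n++) {
        uint32_t r = 0;
        for (unsigned b = 0; b < 4; b++)
            if (n & (1u << b))
                r ^= 0x1C20u << b;
        printf("0x%04X%c", r, (n % 4 == 3) ? '\n' : ' ');
    }
    return 0;
}
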
diff --git a/lib/accelerated/x86/README b/lib/accelerated/x86/README
index 0dd5cb9..ca3c546 100644
--- a/lib/accelerated/x86/README
+++ b/lib/accelerated/x86/README
@@ -1,4 +1,4 @@
-The AES-NI and Padlock implementation by Andy Polyakov is not part of the 
-GnuTLS library, but is used with GnuTLS. Its license is included in 
+The AES-NI and Padlock implementation by Andy Polyakov are not part of the 
+GnuTLS library, but is used with GnuTLS. Their license is included in 
 license.txt.
 
diff --git a/lib/accelerated/x86/coff/padlock-x86-64-coff.s b/lib/accelerated/x86/coff/padlock-x86-64-coff.s
index b69b332..9f658ee 100644
--- a/lib/accelerated/x86/coff/padlock-x86-64-coff.s
+++ b/lib/accelerated/x86/coff/padlock-x86-64-coff.s
@@ -354,8 +354,6 @@ padlock_ecb_encrypt:
        leaq    16(%rdx),%rdx
        xorl    %eax,%eax
        xorl    %ebx,%ebx
-       cmpq    $128,%rcx
-       jbe     .Lecb_short
        testl   $32,(%rdx)
        jnz     .Lecb_aligned
        testq   $15,%rdi
@@ -375,6 +373,21 @@ padlock_ecb_encrypt:
        negq    %rax
        andq    $512-1,%rbx
        leaq    (%rax,%rbp,1),%rsp
+       movq    $512,%rax
+       cmovzq  %rax,%rbx
+       cmpq    %rbx,%rcx
+       ja      .Lecb_loop
+       movq    %rsi,%rax
+       cmpq    %rsp,%rbp
+       cmoveq  %rdi,%rax
+       addq    %rcx,%rax
+       negq    %rax
+       andq    $4095,%rax
+       cmpq    $128,%rax
+       movq    $-128,%rax
+       cmovaeq %rbx,%rax
+       andq    %rax,%rbx
+       jz      .Lecb_unaligned_tail
        jmp     .Lecb_loop
 .p2align       4
 .Lecb_loop:
@@ -404,8 +417,8 @@ padlock_ecb_encrypt:
        testq   $15,%rdi
        jz      .Lecb_out_aligned
        movq    %rbx,%rcx
-       shrq    $3,%rcx
        leaq    (%rsp),%rsi
+       shrq    $3,%rcx
 .byte  0xf3,0x48,0xa5          
        subq    %rbx,%rdi
 .Lecb_out_aligned:
@@ -415,9 +428,26 @@ padlock_ecb_encrypt:
        addq    %rbx,%rsi
        subq    %rbx,%rcx
        movq    $512,%rbx
-       jnz     .Lecb_loop
-
+       jz      .Lecb_break
+       cmpq    %rbx,%rcx
+       jae     .Lecb_loop
+.Lecb_unaligned_tail:
+       xorl    %eax,%eax
        cmpq    %rsp,%rbp
+       cmoveq  %rcx,%rax
+       movq    %rdi,%r8
+       movq    %rcx,%rbx
+       subq    %rax,%rsp
+       shrq    $3,%rcx
+       leaq    (%rsp),%rdi
+.byte  0xf3,0x48,0xa5          
+       movq    %rsp,%rsi
+       movq    %r8,%rdi
+       movq    %rbx,%rcx
+       jmp     .Lecb_loop
+.p2align       4
+.Lecb_break:
+       cmpq    %rbp,%rsp
        je      .Lecb_done
 
        pxor    %xmm0,%xmm0
@@ -431,26 +461,39 @@ padlock_ecb_encrypt:
 .Lecb_done:
        leaq    (%rbp),%rsp
        jmp     .Lecb_exit
-.p2align       4
-.Lecb_short:
-       movq    %rsp,%rbp
-       subq    %rcx,%rsp
-       xorq    %rbx,%rbx
-.Lecb_short_copy:
-       movups  (%rsi,%rbx,1),%xmm0
-       leaq    16(%rbx),%rbx
-       cmpq    %rbx,%rcx
-       movaps  %xmm0,-16(%rsp,%rbx,1)
-       ja      .Lecb_short_copy
-       movq    %rsp,%rsi
-       movq    %rcx,%rbx
-       jmp     .Lecb_loop
+
 .p2align       4
 .Lecb_aligned:
+       leaq    (%rsi,%rcx,1),%rbp
+       negq    %rbp
+       andq    $4095,%rbp
+       xorl    %eax,%eax
+       cmpq    $128,%rbp
+       movq    $128-1,%rbp
+       cmovaeq %rax,%rbp
+       andq    %rcx,%rbp
+       subq    %rbp,%rcx
+       jz      .Lecb_aligned_tail
        leaq    -16(%rdx),%rax
        leaq    16(%rdx),%rbx
        shrq    $4,%rcx
 .byte  0xf3,0x0f,0xa7,200      
+       testq   %rbp,%rbp
+       jz      .Lecb_exit
+
+.Lecb_aligned_tail:
+       movq    %rdi,%r8
+       movq    %rbp,%rbx
+       movq    %rbp,%rcx
+       leaq    (%rsp),%rbp
+       subq    %rcx,%rsp
+       shrq    $3,%rcx
+       leaq    (%rsp),%rdi
+.byte  0xf3,0x48,0xa5          
+       leaq    (%r8),%rdi
+       leaq    (%rsp),%rsi
+       movq    %rbx,%rcx
+       jmp     .Lecb_loop
 .Lecb_exit:
        movl    $1,%eax
        leaq    8(%rsp),%rsp
@@ -489,8 +532,6 @@ padlock_cbc_encrypt:
        leaq    16(%rdx),%rdx
        xorl    %eax,%eax
        xorl    %ebx,%ebx
-       cmpq    $64,%rcx
-       jbe     .Lcbc_short
        testl   $32,(%rdx)
        jnz     .Lcbc_aligned
        testq   $15,%rdi
@@ -510,6 +551,21 @@ padlock_cbc_encrypt:
        negq    %rax
        andq    $512-1,%rbx
        leaq    (%rax,%rbp,1),%rsp
+       movq    $512,%rax
+       cmovzq  %rax,%rbx
+       cmpq    %rbx,%rcx
+       ja      .Lcbc_loop
+       movq    %rsi,%rax
+       cmpq    %rsp,%rbp
+       cmoveq  %rdi,%rax
+       addq    %rcx,%rax
+       negq    %rax
+       andq    $4095,%rax
+       cmpq    $64,%rax
+       movq    $-64,%rax
+       cmovaeq %rbx,%rax
+       andq    %rax,%rbx
+       jz      .Lcbc_unaligned_tail
        jmp     .Lcbc_loop
 .p2align       4
 .Lcbc_loop:
@@ -541,8 +597,8 @@ padlock_cbc_encrypt:
        testq   $15,%rdi
        jz      .Lcbc_out_aligned
        movq    %rbx,%rcx
-       shrq    $3,%rcx
        leaq    (%rsp),%rsi
+       shrq    $3,%rcx
 .byte  0xf3,0x48,0xa5          
        subq    %rbx,%rdi
 .Lcbc_out_aligned:
@@ -552,9 +608,26 @@ padlock_cbc_encrypt:
        addq    %rbx,%rsi
        subq    %rbx,%rcx
        movq    $512,%rbx
-       jnz     .Lcbc_loop
-
+       jz      .Lcbc_break
+       cmpq    %rbx,%rcx
+       jae     .Lcbc_loop
+.Lcbc_unaligned_tail:
+       xorl    %eax,%eax
        cmpq    %rsp,%rbp
+       cmoveq  %rcx,%rax
+       movq    %rdi,%r8
+       movq    %rcx,%rbx
+       subq    %rax,%rsp
+       shrq    $3,%rcx
+       leaq    (%rsp),%rdi
+.byte  0xf3,0x48,0xa5          
+       movq    %rsp,%rsi
+       movq    %r8,%rdi
+       movq    %rbx,%rcx
+       jmp     .Lcbc_loop
+.p2align       4
+.Lcbc_break:
+       cmpq    %rbp,%rsp
        je      .Lcbc_done
 
        pxor    %xmm0,%xmm0
@@ -568,28 +641,41 @@ padlock_cbc_encrypt:
 .Lcbc_done:
        leaq    (%rbp),%rsp
        jmp     .Lcbc_exit
-.p2align       4
-.Lcbc_short:
-       movq    %rsp,%rbp
-       subq    %rcx,%rsp
-       xorq    %rbx,%rbx
-.Lcbc_short_copy:
-       movups  (%rsi,%rbx,1),%xmm0
-       leaq    16(%rbx),%rbx
-       cmpq    %rbx,%rcx
-       movaps  %xmm0,-16(%rsp,%rbx,1)
-       ja      .Lcbc_short_copy
-       movq    %rsp,%rsi
-       movq    %rcx,%rbx
-       jmp     .Lcbc_loop
+
 .p2align       4
 .Lcbc_aligned:
+       leaq    (%rsi,%rcx,1),%rbp
+       negq    %rbp
+       andq    $4095,%rbp
+       xorl    %eax,%eax
+       cmpq    $64,%rbp
+       movq    $64-1,%rbp
+       cmovaeq %rax,%rbp
+       andq    %rcx,%rbp
+       subq    %rbp,%rcx
+       jz      .Lcbc_aligned_tail
        leaq    -16(%rdx),%rax
        leaq    16(%rdx),%rbx
        shrq    $4,%rcx
 .byte  0xf3,0x0f,0xa7,208      
        movdqa  (%rax),%xmm0
        movdqa  %xmm0,-16(%rdx)
+       testq   %rbp,%rbp
+       jz      .Lcbc_exit
+
+.Lcbc_aligned_tail:
+       movq    %rdi,%r8
+       movq    %rbp,%rbx
+       movq    %rbp,%rcx
+       leaq    (%rsp),%rbp
+       subq    %rcx,%rsp
+       shrq    $3,%rcx
+       leaq    (%rsp),%rdi
+.byte  0xf3,0x48,0xa5          
+       leaq    (%r8),%rdi
+       leaq    (%rsp),%rsi
+       movq    %rbx,%rcx
+       jmp     .Lcbc_loop
 .Lcbc_exit:
        movl    $1,%eax
        leaq    8(%rsp),%rsp
diff --git a/lib/accelerated/x86/coff/padlock-x86-coff.s b/lib/accelerated/x86/coff/padlock-x86-coff.s
index b068083..69eb468 100644
--- a/lib/accelerated/x86/coff/padlock-x86-coff.s
+++ b/lib/accelerated/x86/coff/padlock-x86-coff.s
@@ -180,16 +180,14 @@ _padlock_ecb_encrypt:
        leal    16(%edx),%edx
        xorl    %eax,%eax
        xorl    %ebx,%ebx
-       cmpl    $128,%ecx
-       jbe     .L006ecb_short
        testl   $32,(%edx)
-       jnz     .L007ecb_aligned
+       jnz     .L006ecb_aligned
        testl   $15,%edi
        setz    %al
        testl   $15,%esi
        setz    %bl
        testl   %ebx,%eax
-       jnz     .L007ecb_aligned
+       jnz     .L006ecb_aligned
        negl    %eax
        movl    $512,%ebx
        notl    %eax
@@ -201,10 +199,28 @@ _padlock_ecb_encrypt:
        negl    %eax
        andl    $511,%ebx
        leal    (%eax,%ebp,1),%esp
+       movl    $512,%eax
+       cmovzl  %eax,%ebx
+       movl    %ebp,%eax
+       andl    $-16,%ebp
        andl    $-16,%esp
-       jmp     .L008ecb_loop
+       movl    %eax,16(%ebp)
+       cmpl    %ebx,%ecx
+       ja      .L007ecb_loop
+       movl    %esi,%eax
+       cmpl    %esp,%ebp
+       cmovel  %edi,%eax
+       addl    %ecx,%eax
+       negl    %eax
+       andl    $4095,%eax
+       cmpl    $128,%eax
+       movl    $-128,%eax
+       cmovael %ebx,%eax
+       andl    %eax,%ebx
+       jz      .L008ecb_unaligned_tail
+       jmp     .L007ecb_loop
 .align 16
-.L008ecb_loop:
+.L007ecb_loop:
        movl    %edi,(%ebp)
        movl    %esi,4(%ebp)
        movl    %ecx,8(%ebp)
@@ -229,8 +245,8 @@ _padlock_ecb_encrypt:
        testl   $15,%edi
        jz      .L010ecb_out_aligned
        movl    %ebx,%ecx
-       shrl    $2,%ecx
        leal    (%esp),%esi
+       shrl    $2,%ecx
 .byte  243,165
        subl    %ebx,%edi
 .L010ecb_out_aligned:
@@ -240,43 +256,75 @@ _padlock_ecb_encrypt:
        addl    %ebx,%esi
        subl    %ebx,%ecx
        movl    $512,%ebx
-       jnz     .L008ecb_loop
+       jz      .L011ecb_break
+       cmpl    %ebx,%ecx
+       jae     .L007ecb_loop
+.L008ecb_unaligned_tail:
+       xorl    %eax,%eax
+       cmpl    %ebp,%esp
+       cmovel  %ecx,%eax
+       subl    %eax,%esp
+       movl    %edi,%eax
+       movl    %ecx,%ebx
+       shrl    $2,%ecx
+       leal    (%esp),%edi
+.byte  243,165
+       movl    %esp,%esi
+       movl    %eax,%edi
+       movl    %ebx,%ecx
+       jmp     .L007ecb_loop
+.align 16
+.L011ecb_break:
        cmpl    %ebp,%esp
-       je      .L011ecb_done
+       je      .L012ecb_done
        pxor    %xmm0,%xmm0
        leal    (%esp),%eax
-.L012ecb_bzero:
+.L013ecb_bzero:
        movaps  %xmm0,(%eax)
        leal    16(%eax),%eax
        cmpl    %eax,%ebp
-       ja      .L012ecb_bzero
-.L011ecb_done:
+       ja      .L013ecb_bzero
+.L012ecb_done:
+       movl    16(%ebp),%ebp
        leal    24(%ebp),%esp
-       jmp     .L013ecb_exit
+       jmp     .L014ecb_exit
 .align 16
-.L006ecb_short:
+.L006ecb_aligned:
+       leal    (%esi,%ecx,1),%ebp
+       negl    %ebp
+       andl    $4095,%ebp
        xorl    %eax,%eax
-       leal    -24(%esp),%ebp
-       subl    %ecx,%eax
-       leal    (%eax,%ebp,1),%esp
-       andl    $-16,%esp
-       xorl    %ebx,%ebx
-.L014ecb_short_copy:
-       movups  (%esi,%ebx,1),%xmm0
-       leal    16(%ebx),%ebx
-       cmpl    %ebx,%ecx
-       movaps  %xmm0,-16(%esp,%ebx,1)
-       ja      .L014ecb_short_copy
-       movl    %esp,%esi
-       movl    %ecx,%ebx
-       jmp     .L008ecb_loop
-.align 16
-.L007ecb_aligned:
+       cmpl    $128,%ebp
+       movl    $127,%ebp
+       cmovael %eax,%ebp
+       andl    %ecx,%ebp
+       subl    %ebp,%ecx
+       jz      .L015ecb_aligned_tail
        leal    -16(%edx),%eax
        leal    16(%edx),%ebx
        shrl    $4,%ecx
 .byte  243,15,167,200
-.L013ecb_exit:
+       testl   %ebp,%ebp
+       jz      .L014ecb_exit
+.L015ecb_aligned_tail:
+       movl    %ebp,%ecx
+       leal    -24(%esp),%ebp
+       movl    %ebp,%esp
+       movl    %ebp,%eax
+       subl    %ecx,%esp
+       andl    $-16,%ebp
+       andl    $-16,%esp
+       movl    %eax,16(%ebp)
+       movl    %edi,%eax
+       movl    %ecx,%ebx
+       shrl    $2,%ecx
+       leal    (%esp),%edi
+.byte  243,165
+       movl    %esp,%esi
+       movl    %eax,%edi
+       movl    %ebx,%ecx
+       jmp     .L007ecb_loop
+.L014ecb_exit:
        movl    $1,%eax
        leal    4(%esp),%esp
 .L004ecb_abort:
@@ -299,19 +347,17 @@ _padlock_cbc_encrypt:
        movl    28(%esp),%edx
        movl    32(%esp),%ecx
        testl   $15,%edx
-       jnz     .L015cbc_abort
+       jnz     .L016cbc_abort
        testl   $15,%ecx
-       jnz     .L015cbc_abort
+       jnz     .L016cbc_abort
        leal    .Lpadlock_saved_context,%eax
        pushfl
        cld
        call    __padlock_verify_ctx
-.L016cbc_pic_point:
+.L017cbc_pic_point:
        leal    16(%edx),%edx
        xorl    %eax,%eax
        xorl    %ebx,%ebx
-       cmpl    $64,%ecx
-       jbe     .L017cbc_short
        testl   $32,(%edx)
        jnz     .L018cbc_aligned
        testl   $15,%edi
@@ -331,7 +377,25 @@ _padlock_cbc_encrypt:
        negl    %eax
        andl    $511,%ebx
        leal    (%eax,%ebp,1),%esp
+       movl    $512,%eax
+       cmovzl  %eax,%ebx
+       movl    %ebp,%eax
+       andl    $-16,%ebp
        andl    $-16,%esp
+       movl    %eax,16(%ebp)
+       cmpl    %ebx,%ecx
+       ja      .L019cbc_loop
+       movl    %esi,%eax
+       cmpl    %esp,%ebp
+       cmovel  %edi,%eax
+       addl    %ecx,%eax
+       negl    %eax
+       andl    $4095,%eax
+       cmpl    $64,%eax
+       movl    $-64,%eax
+       cmovael %ebx,%eax
+       andl    %eax,%ebx
+       jz      .L020cbc_unaligned_tail
        jmp     .L019cbc_loop
 .align 16
 .L019cbc_loop:
@@ -343,13 +407,13 @@ _padlock_cbc_encrypt:
        testl   $15,%edi
        cmovnzl %esp,%edi
        testl   $15,%esi
-       jz      .L020cbc_inp_aligned
+       jz      .L021cbc_inp_aligned
        shrl    $2,%ecx
 .byte  243,165
        subl    %ebx,%edi
        movl    %ebx,%ecx
        movl    %edi,%esi
-.L020cbc_inp_aligned:
+.L021cbc_inp_aligned:
        leal    -16(%edx),%eax
        leal    16(%edx),%ebx
        shrl    $4,%ecx
@@ -359,61 +423,93 @@ _padlock_cbc_encrypt:
        movl    (%ebp),%edi
        movl    12(%ebp),%ebx
        testl   $15,%edi
-       jz      .L021cbc_out_aligned
+       jz      .L022cbc_out_aligned
        movl    %ebx,%ecx
-       shrl    $2,%ecx
        leal    (%esp),%esi
+       shrl    $2,%ecx
 .byte  243,165
        subl    %ebx,%edi
-.L021cbc_out_aligned:
+.L022cbc_out_aligned:
        movl    4(%ebp),%esi
        movl    8(%ebp),%ecx
        addl    %ebx,%edi
        addl    %ebx,%esi
        subl    %ebx,%ecx
        movl    $512,%ebx
-       jnz     .L019cbc_loop
+       jz      .L023cbc_break
+       cmpl    %ebx,%ecx
+       jae     .L019cbc_loop
+.L020cbc_unaligned_tail:
+       xorl    %eax,%eax
+       cmpl    %ebp,%esp
+       cmovel  %ecx,%eax
+       subl    %eax,%esp
+       movl    %edi,%eax
+       movl    %ecx,%ebx
+       shrl    $2,%ecx
+       leal    (%esp),%edi
+.byte  243,165
+       movl    %esp,%esi
+       movl    %eax,%edi
+       movl    %ebx,%ecx
+       jmp     .L019cbc_loop
+.align 16
+.L023cbc_break:
        cmpl    %ebp,%esp
-       je      .L022cbc_done
+       je      .L024cbc_done
        pxor    %xmm0,%xmm0
        leal    (%esp),%eax
-.L023cbc_bzero:
+.L025cbc_bzero:
        movaps  %xmm0,(%eax)
        leal    16(%eax),%eax
        cmpl    %eax,%ebp
-       ja      .L023cbc_bzero
-.L022cbc_done:
+       ja      .L025cbc_bzero
+.L024cbc_done:
+       movl    16(%ebp),%ebp
        leal    24(%ebp),%esp
-       jmp     .L024cbc_exit
-.align 16
-.L017cbc_short:
-       xorl    %eax,%eax
-       leal    -24(%esp),%ebp
-       subl    %ecx,%eax
-       leal    (%eax,%ebp,1),%esp
-       andl    $-16,%esp
-       xorl    %ebx,%ebx
-.L025cbc_short_copy:
-       movups  (%esi,%ebx,1),%xmm0
-       leal    16(%ebx),%ebx
-       cmpl    %ebx,%ecx
-       movaps  %xmm0,-16(%esp,%ebx,1)
-       ja      .L025cbc_short_copy
-       movl    %esp,%esi
-       movl    %ecx,%ebx
-       jmp     .L019cbc_loop
+       jmp     .L026cbc_exit
 .align 16
 .L018cbc_aligned:
+       leal    (%esi,%ecx,1),%ebp
+       negl    %ebp
+       andl    $4095,%ebp
+       xorl    %eax,%eax
+       cmpl    $64,%ebp
+       movl    $63,%ebp
+       cmovael %eax,%ebp
+       andl    %ecx,%ebp
+       subl    %ebp,%ecx
+       jz      .L027cbc_aligned_tail
        leal    -16(%edx),%eax
        leal    16(%edx),%ebx
        shrl    $4,%ecx
 .byte  243,15,167,208
        movaps  (%eax),%xmm0
        movaps  %xmm0,-16(%edx)
-.L024cbc_exit:
+       testl   %ebp,%ebp
+       jz      .L026cbc_exit
+.L027cbc_aligned_tail:
+       movl    %ebp,%ecx
+       leal    -24(%esp),%ebp
+       movl    %ebp,%esp
+       movl    %ebp,%eax
+       subl    %ecx,%esp
+       andl    $-16,%ebp
+       andl    $-16,%esp
+       movl    %eax,16(%ebp)
+       movl    %edi,%eax
+       movl    %ecx,%ebx
+       shrl    $2,%ecx
+       leal    (%esp),%edi
+.byte  243,165
+       movl    %esp,%esi
+       movl    %eax,%edi
+       movl    %ebx,%ecx
+       jmp     .L019cbc_loop
+.L026cbc_exit:
        movl    $1,%eax
        leal    4(%esp),%esp
-.L015cbc_abort:
+.L016cbc_abort:
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -437,10 +533,10 @@ __win32_segv_handler:
        movl    4(%esp),%edx
        movl    12(%esp),%ecx
        cmpl    $3221225477,(%edx)
-       jne     .L026ret
+       jne     .L028ret
        addl    $4,184(%ecx)
        movl    $0,%eax
-.L026ret:
+.L028ret:
        ret
 .globl _padlock_sha1_oneshot
 .def   _padlock_sha1_oneshot;  .scl    2;      .type   32;     .endef
diff --git a/lib/accelerated/x86/elf/padlock-x86-64.s b/lib/accelerated/x86/elf/padlock-x86-64.s
index bf5f626..4709ac2 100644
--- a/lib/accelerated/x86/elf/padlock-x86-64.s
+++ b/lib/accelerated/x86/elf/padlock-x86-64.s
@@ -276,8 +276,6 @@ padlock_ecb_encrypt:
        leaq    16(%rdx),%rdx
        xorl    %eax,%eax
        xorl    %ebx,%ebx
-       cmpq    $128,%rcx
-       jbe     .Lecb_short
        testl   $32,(%rdx)
        jnz     .Lecb_aligned
        testq   $15,%rdi
@@ -297,6 +295,21 @@ padlock_ecb_encrypt:
        negq    %rax
        andq    $512-1,%rbx
        leaq    (%rax,%rbp,1),%rsp
+       movq    $512,%rax
+       cmovzq  %rax,%rbx
+       cmpq    %rbx,%rcx
+       ja      .Lecb_loop
+       movq    %rsi,%rax
+       cmpq    %rsp,%rbp
+       cmoveq  %rdi,%rax
+       addq    %rcx,%rax
+       negq    %rax
+       andq    $4095,%rax
+       cmpq    $128,%rax
+       movq    $-128,%rax
+       cmovaeq %rbx,%rax
+       andq    %rax,%rbx
+       jz      .Lecb_unaligned_tail
        jmp     .Lecb_loop
 .align 16
 .Lecb_loop:
@@ -326,8 +339,8 @@ padlock_ecb_encrypt:
        testq   $15,%rdi
        jz      .Lecb_out_aligned
        movq    %rbx,%rcx
-       shrq    $3,%rcx
        leaq    (%rsp),%rsi
+       shrq    $3,%rcx
 .byte  0xf3,0x48,0xa5          
        subq    %rbx,%rdi
 .Lecb_out_aligned:
@@ -337,9 +350,26 @@ padlock_ecb_encrypt:
        addq    %rbx,%rsi
        subq    %rbx,%rcx
        movq    $512,%rbx
-       jnz     .Lecb_loop
-
+       jz      .Lecb_break
+       cmpq    %rbx,%rcx
+       jae     .Lecb_loop
+.Lecb_unaligned_tail:
+       xorl    %eax,%eax
        cmpq    %rsp,%rbp
+       cmoveq  %rcx,%rax
+       movq    %rdi,%r8
+       movq    %rcx,%rbx
+       subq    %rax,%rsp
+       shrq    $3,%rcx
+       leaq    (%rsp),%rdi
+.byte  0xf3,0x48,0xa5          
+       movq    %rsp,%rsi
+       movq    %r8,%rdi
+       movq    %rbx,%rcx
+       jmp     .Lecb_loop
+.align 16
+.Lecb_break:
+       cmpq    %rbp,%rsp
        je      .Lecb_done
 
        pxor    %xmm0,%xmm0
@@ -353,26 +383,39 @@ padlock_ecb_encrypt:
 .Lecb_done:
        leaq    (%rbp),%rsp
        jmp     .Lecb_exit
-.align 16
-.Lecb_short:
-       movq    %rsp,%rbp
-       subq    %rcx,%rsp
-       xorq    %rbx,%rbx
-.Lecb_short_copy:
-       movups  (%rsi,%rbx,1),%xmm0
-       leaq    16(%rbx),%rbx
-       cmpq    %rbx,%rcx
-       movaps  %xmm0,-16(%rsp,%rbx,1)
-       ja      .Lecb_short_copy
-       movq    %rsp,%rsi
-       movq    %rcx,%rbx
-       jmp     .Lecb_loop
+
 .align 16
 .Lecb_aligned:
+       leaq    (%rsi,%rcx,1),%rbp
+       negq    %rbp
+       andq    $4095,%rbp
+       xorl    %eax,%eax
+       cmpq    $128,%rbp
+       movq    $128-1,%rbp
+       cmovaeq %rax,%rbp
+       andq    %rcx,%rbp
+       subq    %rbp,%rcx
+       jz      .Lecb_aligned_tail
        leaq    -16(%rdx),%rax
        leaq    16(%rdx),%rbx
        shrq    $4,%rcx
 .byte  0xf3,0x0f,0xa7,200      
+       testq   %rbp,%rbp
+       jz      .Lecb_exit
+
+.Lecb_aligned_tail:
+       movq    %rdi,%r8
+       movq    %rbp,%rbx
+       movq    %rbp,%rcx
+       leaq    (%rsp),%rbp
+       subq    %rcx,%rsp
+       shrq    $3,%rcx
+       leaq    (%rsp),%rdi
+.byte  0xf3,0x48,0xa5          
+       leaq    (%r8),%rdi
+       leaq    (%rsp),%rsi
+       movq    %rbx,%rcx
+       jmp     .Lecb_loop
 .Lecb_exit:
        movl    $1,%eax
        leaq    8(%rsp),%rsp
@@ -400,8 +443,6 @@ padlock_cbc_encrypt:
        leaq    16(%rdx),%rdx
        xorl    %eax,%eax
        xorl    %ebx,%ebx
-       cmpq    $64,%rcx
-       jbe     .Lcbc_short
        testl   $32,(%rdx)
        jnz     .Lcbc_aligned
        testq   $15,%rdi
@@ -421,6 +462,21 @@ padlock_cbc_encrypt:
        negq    %rax
        andq    $512-1,%rbx
        leaq    (%rax,%rbp,1),%rsp
+       movq    $512,%rax
+       cmovzq  %rax,%rbx
+       cmpq    %rbx,%rcx
+       ja      .Lcbc_loop
+       movq    %rsi,%rax
+       cmpq    %rsp,%rbp
+       cmoveq  %rdi,%rax
+       addq    %rcx,%rax
+       negq    %rax
+       andq    $4095,%rax
+       cmpq    $64,%rax
+       movq    $-64,%rax
+       cmovaeq %rbx,%rax
+       andq    %rax,%rbx
+       jz      .Lcbc_unaligned_tail
        jmp     .Lcbc_loop
 .align 16
 .Lcbc_loop:
@@ -452,8 +508,8 @@ padlock_cbc_encrypt:
        testq   $15,%rdi
        jz      .Lcbc_out_aligned
        movq    %rbx,%rcx
-       shrq    $3,%rcx
        leaq    (%rsp),%rsi
+       shrq    $3,%rcx
 .byte  0xf3,0x48,0xa5          
        subq    %rbx,%rdi
 .Lcbc_out_aligned:
@@ -463,9 +519,26 @@ padlock_cbc_encrypt:
        addq    %rbx,%rsi
        subq    %rbx,%rcx
        movq    $512,%rbx
-       jnz     .Lcbc_loop
-
+       jz      .Lcbc_break
+       cmpq    %rbx,%rcx
+       jae     .Lcbc_loop
+.Lcbc_unaligned_tail:
+       xorl    %eax,%eax
        cmpq    %rsp,%rbp
+       cmoveq  %rcx,%rax
+       movq    %rdi,%r8
+       movq    %rcx,%rbx
+       subq    %rax,%rsp
+       shrq    $3,%rcx
+       leaq    (%rsp),%rdi
+.byte  0xf3,0x48,0xa5          
+       movq    %rsp,%rsi
+       movq    %r8,%rdi
+       movq    %rbx,%rcx
+       jmp     .Lcbc_loop
+.align 16
+.Lcbc_break:
+       cmpq    %rbp,%rsp
        je      .Lcbc_done
 
        pxor    %xmm0,%xmm0
@@ -479,28 +552,41 @@ padlock_cbc_encrypt:
 .Lcbc_done:
        leaq    (%rbp),%rsp
        jmp     .Lcbc_exit
-.align 16
-.Lcbc_short:
-       movq    %rsp,%rbp
-       subq    %rcx,%rsp
-       xorq    %rbx,%rbx
-.Lcbc_short_copy:
-       movups  (%rsi,%rbx,1),%xmm0
-       leaq    16(%rbx),%rbx
-       cmpq    %rbx,%rcx
-       movaps  %xmm0,-16(%rsp,%rbx,1)
-       ja      .Lcbc_short_copy
-       movq    %rsp,%rsi
-       movq    %rcx,%rbx
-       jmp     .Lcbc_loop
+
 .align 16
 .Lcbc_aligned:
+       leaq    (%rsi,%rcx,1),%rbp
+       negq    %rbp
+       andq    $4095,%rbp
+       xorl    %eax,%eax
+       cmpq    $64,%rbp
+       movq    $64-1,%rbp
+       cmovaeq %rax,%rbp
+       andq    %rcx,%rbp
+       subq    %rbp,%rcx
+       jz      .Lcbc_aligned_tail
        leaq    -16(%rdx),%rax
        leaq    16(%rdx),%rbx
        shrq    $4,%rcx
 .byte  0xf3,0x0f,0xa7,208      
        movdqa  (%rax),%xmm0
        movdqa  %xmm0,-16(%rdx)
+       testq   %rbp,%rbp
+       jz      .Lcbc_exit
+
+.Lcbc_aligned_tail:
+       movq    %rdi,%r8
+       movq    %rbp,%rbx
+       movq    %rbp,%rcx
+       leaq    (%rsp),%rbp
+       subq    %rcx,%rsp
+       shrq    $3,%rcx
+       leaq    (%rsp),%rdi
+.byte  0xf3,0x48,0xa5          
+       leaq    (%r8),%rdi
+       leaq    (%rsp),%rsi
+       movq    %rbx,%rcx
+       jmp     .Lcbc_loop
 .Lcbc_exit:
        movl    $1,%eax
        leaq    8(%rsp),%rsp
diff --git a/lib/accelerated/x86/license.txt b/lib/accelerated/x86/license.txt
index c87ba42..929ddd5 100755
--- a/lib/accelerated/x86/license.txt
+++ b/lib/accelerated/x86/license.txt
@@ -5,7 +5,7 @@ CRYPTOGAMS licenses depending on where you obtain it. For further
 details see http://www.openssl.org/~appro/cryptogams/.
 ====================================================================
 
-Copyright (c) 2006, CRYPTOGAMS by <address@hidden>
+Copyright (c) 2006-2012, CRYPTOGAMS by <address@hidden>
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s
index 9b912f9..dbd89da 100644
--- a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s
+++ b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s
@@ -276,8 +276,6 @@ _padlock_ecb_encrypt:
        leaq    16(%rdx),%rdx
        xorl    %eax,%eax
        xorl    %ebx,%ebx
-       cmpq    $128,%rcx
-       jbe     L$ecb_short
        testl   $32,(%rdx)
        jnz     L$ecb_aligned
        testq   $15,%rdi
@@ -297,6 +295,21 @@ _padlock_ecb_encrypt:
        negq    %rax
        andq    $512-1,%rbx
        leaq    (%rax,%rbp,1),%rsp
+       movq    $512,%rax
+       cmovzq  %rax,%rbx
+       cmpq    %rbx,%rcx
+       ja      L$ecb_loop
+       movq    %rsi,%rax
+       cmpq    %rsp,%rbp
+       cmoveq  %rdi,%rax
+       addq    %rcx,%rax
+       negq    %rax
+       andq    $4095,%rax
+       cmpq    $128,%rax
+       movq    $-128,%rax
+       cmovaeq %rbx,%rax
+       andq    %rax,%rbx
+       jz      L$ecb_unaligned_tail
        jmp     L$ecb_loop
 .p2align       4
 L$ecb_loop:
@@ -326,8 +339,8 @@ L$ecb_inp_aligned:
        testq   $15,%rdi
        jz      L$ecb_out_aligned
        movq    %rbx,%rcx
-       shrq    $3,%rcx
        leaq    (%rsp),%rsi
+       shrq    $3,%rcx
 .byte  0xf3,0x48,0xa5          
        subq    %rbx,%rdi
 L$ecb_out_aligned:
@@ -337,9 +350,26 @@ L$ecb_out_aligned:
        addq    %rbx,%rsi
        subq    %rbx,%rcx
        movq    $512,%rbx
-       jnz     L$ecb_loop
-
+       jz      L$ecb_break
+       cmpq    %rbx,%rcx
+       jae     L$ecb_loop
+L$ecb_unaligned_tail:
+       xorl    %eax,%eax
        cmpq    %rsp,%rbp
+       cmoveq  %rcx,%rax
+       movq    %rdi,%r8
+       movq    %rcx,%rbx
+       subq    %rax,%rsp
+       shrq    $3,%rcx
+       leaq    (%rsp),%rdi
+.byte  0xf3,0x48,0xa5          
+       movq    %rsp,%rsi
+       movq    %r8,%rdi
+       movq    %rbx,%rcx
+       jmp     L$ecb_loop
+.p2align       4
+L$ecb_break:
+       cmpq    %rbp,%rsp
        je      L$ecb_done
 
        pxor    %xmm0,%xmm0
@@ -353,26 +383,39 @@ L$ecb_bzero:
 L$ecb_done:
        leaq    (%rbp),%rsp
        jmp     L$ecb_exit
-.p2align       4
-L$ecb_short:
-       movq    %rsp,%rbp
-       subq    %rcx,%rsp
-       xorq    %rbx,%rbx
-L$ecb_short_copy:
-       movups  (%rsi,%rbx,1),%xmm0
-       leaq    16(%rbx),%rbx
-       cmpq    %rbx,%rcx
-       movaps  %xmm0,-16(%rsp,%rbx,1)
-       ja      L$ecb_short_copy
-       movq    %rsp,%rsi
-       movq    %rcx,%rbx
-       jmp     L$ecb_loop
+
 .p2align       4
 L$ecb_aligned:
+       leaq    (%rsi,%rcx,1),%rbp
+       negq    %rbp
+       andq    $4095,%rbp
+       xorl    %eax,%eax
+       cmpq    $128,%rbp
+       movq    $128-1,%rbp
+       cmovaeq %rax,%rbp
+       andq    %rcx,%rbp
+       subq    %rbp,%rcx
+       jz      L$ecb_aligned_tail
        leaq    -16(%rdx),%rax
        leaq    16(%rdx),%rbx
        shrq    $4,%rcx
 .byte  0xf3,0x0f,0xa7,200      
+       testq   %rbp,%rbp
+       jz      L$ecb_exit
+
+L$ecb_aligned_tail:
+       movq    %rdi,%r8
+       movq    %rbp,%rbx
+       movq    %rbp,%rcx
+       leaq    (%rsp),%rbp
+       subq    %rcx,%rsp
+       shrq    $3,%rcx
+       leaq    (%rsp),%rdi
+.byte  0xf3,0x48,0xa5          
+       leaq    (%r8),%rdi
+       leaq    (%rsp),%rsi
+       movq    %rbx,%rcx
+       jmp     L$ecb_loop
 L$ecb_exit:
        movl    $1,%eax
        leaq    8(%rsp),%rsp
@@ -400,8 +443,6 @@ _padlock_cbc_encrypt:
        leaq    16(%rdx),%rdx
        xorl    %eax,%eax
        xorl    %ebx,%ebx
-       cmpq    $64,%rcx
-       jbe     L$cbc_short
        testl   $32,(%rdx)
        jnz     L$cbc_aligned
        testq   $15,%rdi
@@ -421,6 +462,21 @@ _padlock_cbc_encrypt:
        negq    %rax
        andq    $512-1,%rbx
        leaq    (%rax,%rbp,1),%rsp
+       movq    $512,%rax
+       cmovzq  %rax,%rbx
+       cmpq    %rbx,%rcx
+       ja      L$cbc_loop
+       movq    %rsi,%rax
+       cmpq    %rsp,%rbp
+       cmoveq  %rdi,%rax
+       addq    %rcx,%rax
+       negq    %rax
+       andq    $4095,%rax
+       cmpq    $64,%rax
+       movq    $-64,%rax
+       cmovaeq %rbx,%rax
+       andq    %rax,%rbx
+       jz      L$cbc_unaligned_tail
        jmp     L$cbc_loop
 .p2align       4
 L$cbc_loop:
@@ -452,8 +508,8 @@ L$cbc_inp_aligned:
        testq   $15,%rdi
        jz      L$cbc_out_aligned
        movq    %rbx,%rcx
-       shrq    $3,%rcx
        leaq    (%rsp),%rsi
+       shrq    $3,%rcx
 .byte  0xf3,0x48,0xa5          
        subq    %rbx,%rdi
 L$cbc_out_aligned:
@@ -463,9 +519,26 @@ L$cbc_out_aligned:
        addq    %rbx,%rsi
        subq    %rbx,%rcx
        movq    $512,%rbx
-       jnz     L$cbc_loop
-
+       jz      L$cbc_break
+       cmpq    %rbx,%rcx
+       jae     L$cbc_loop
+L$cbc_unaligned_tail:
+       xorl    %eax,%eax
        cmpq    %rsp,%rbp
+       cmoveq  %rcx,%rax
+       movq    %rdi,%r8
+       movq    %rcx,%rbx
+       subq    %rax,%rsp
+       shrq    $3,%rcx
+       leaq    (%rsp),%rdi
+.byte  0xf3,0x48,0xa5          
+       movq    %rsp,%rsi
+       movq    %r8,%rdi
+       movq    %rbx,%rcx
+       jmp     L$cbc_loop
+.p2align       4
+L$cbc_break:
+       cmpq    %rbp,%rsp
        je      L$cbc_done
 
        pxor    %xmm0,%xmm0
@@ -479,28 +552,41 @@ L$cbc_bzero:
 L$cbc_done:
        leaq    (%rbp),%rsp
        jmp     L$cbc_exit
-.p2align       4
-L$cbc_short:
-       movq    %rsp,%rbp
-       subq    %rcx,%rsp
-       xorq    %rbx,%rbx
-L$cbc_short_copy:
-       movups  (%rsi,%rbx,1),%xmm0
-       leaq    16(%rbx),%rbx
-       cmpq    %rbx,%rcx
-       movaps  %xmm0,-16(%rsp,%rbx,1)
-       ja      L$cbc_short_copy
-       movq    %rsp,%rsi
-       movq    %rcx,%rbx
-       jmp     L$cbc_loop
+
 .p2align       4
 L$cbc_aligned:
+       leaq    (%rsi,%rcx,1),%rbp
+       negq    %rbp
+       andq    $4095,%rbp
+       xorl    %eax,%eax
+       cmpq    $64,%rbp
+       movq    $64-1,%rbp
+       cmovaeq %rax,%rbp
+       andq    %rcx,%rbp
+       subq    %rbp,%rcx
+       jz      L$cbc_aligned_tail
        leaq    -16(%rdx),%rax
        leaq    16(%rdx),%rbx
        shrq    $4,%rcx
 .byte  0xf3,0x0f,0xa7,208      
        movdqa  (%rax),%xmm0
        movdqa  %xmm0,-16(%rdx)
+       testq   %rbp,%rbp
+       jz      L$cbc_exit
+
+L$cbc_aligned_tail:
+       movq    %rdi,%r8
+       movq    %rbp,%rbx
+       movq    %rbp,%rcx
+       leaq    (%rsp),%rbp
+       subq    %rcx,%rsp
+       shrq    $3,%rcx
+       leaq    (%rsp),%rdi
+.byte  0xf3,0x48,0xa5          
+       leaq    (%r8),%rdi
+       leaq    (%rsp),%rsi
+       movq    %rbx,%rcx
+       jmp     L$cbc_loop
 L$cbc_exit:
        movl    $1,%eax
        leaq    8(%rsp),%rsp
diff --git a/lib/accelerated/x86/macosx/padlock-x86-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-macosx.s
index 02b427e..40cfce9 100644
--- a/lib/accelerated/x86/macosx/padlock-x86-macosx.s
+++ b/lib/accelerated/x86/macosx/padlock-x86-macosx.s
@@ -174,16 +174,14 @@ L005ecb_pic_point:
        leal    16(%edx),%edx
        xorl    %eax,%eax
        xorl    %ebx,%ebx
-       cmpl    $128,%ecx
-       jbe     L006ecb_short
        testl   $32,(%edx)
-       jnz     L007ecb_aligned
+       jnz     L006ecb_aligned
        testl   $15,%edi
        setz    %al
        testl   $15,%esi
        setz    %bl
        testl   %ebx,%eax
-       jnz     L007ecb_aligned
+       jnz     L006ecb_aligned
        negl    %eax
        movl    $512,%ebx
        notl    %eax
@@ -195,10 +193,28 @@ L005ecb_pic_point:
        negl    %eax
        andl    $511,%ebx
        leal    (%eax,%ebp,1),%esp
+       movl    $512,%eax
+       cmovzl  %eax,%ebx
+       movl    %ebp,%eax
+       andl    $-16,%ebp
        andl    $-16,%esp
-       jmp     L008ecb_loop
+       movl    %eax,16(%ebp)
+       cmpl    %ebx,%ecx
+       ja      L007ecb_loop
+       movl    %esi,%eax
+       cmpl    %esp,%ebp
+       cmovel  %edi,%eax
+       addl    %ecx,%eax
+       negl    %eax
+       andl    $4095,%eax
+       cmpl    $128,%eax
+       movl    $-128,%eax
+       cmovael %ebx,%eax
+       andl    %eax,%ebx
+       jz      L008ecb_unaligned_tail
+       jmp     L007ecb_loop
 .align 4,0x90
-L008ecb_loop:
+L007ecb_loop:
        movl    %edi,(%ebp)
        movl    %esi,4(%ebp)
        movl    %ecx,8(%ebp)
@@ -223,8 +239,8 @@ L009ecb_inp_aligned:
        testl   $15,%edi
        jz      L010ecb_out_aligned
        movl    %ebx,%ecx
-       shrl    $2,%ecx
        leal    (%esp),%esi
+       shrl    $2,%ecx
 .byte  243,165
        subl    %ebx,%edi
 L010ecb_out_aligned:
@@ -234,43 +250,75 @@ L010ecb_out_aligned:
        addl    %ebx,%esi
        subl    %ebx,%ecx
        movl    $512,%ebx
-       jnz     L008ecb_loop
+       jz      L011ecb_break
+       cmpl    %ebx,%ecx
+       jae     L007ecb_loop
+L008ecb_unaligned_tail:
+       xorl    %eax,%eax
+       cmpl    %ebp,%esp
+       cmovel  %ecx,%eax
+       subl    %eax,%esp
+       movl    %edi,%eax
+       movl    %ecx,%ebx
+       shrl    $2,%ecx
+       leal    (%esp),%edi
+.byte  243,165
+       movl    %esp,%esi
+       movl    %eax,%edi
+       movl    %ebx,%ecx
+       jmp     L007ecb_loop
+.align 4,0x90
+L011ecb_break:
        cmpl    %ebp,%esp
-       je      L011ecb_done
+       je      L012ecb_done
        pxor    %xmm0,%xmm0
        leal    (%esp),%eax
-L012ecb_bzero:
+L013ecb_bzero:
        movaps  %xmm0,(%eax)
        leal    16(%eax),%eax
        cmpl    %eax,%ebp
-       ja      L012ecb_bzero
-L011ecb_done:
+       ja      L013ecb_bzero
+L012ecb_done:
+       movl    16(%ebp),%ebp
        leal    24(%ebp),%esp
-       jmp     L013ecb_exit
+       jmp     L014ecb_exit
 .align 4,0x90
-L006ecb_short:
+L006ecb_aligned:
+       leal    (%esi,%ecx,1),%ebp
+       negl    %ebp
+       andl    $4095,%ebp
        xorl    %eax,%eax
-       leal    -24(%esp),%ebp
-       subl    %ecx,%eax
-       leal    (%eax,%ebp,1),%esp
-       andl    $-16,%esp
-       xorl    %ebx,%ebx
-L014ecb_short_copy:
-       movups  (%esi,%ebx,1),%xmm0
-       leal    16(%ebx),%ebx
-       cmpl    %ebx,%ecx
-       movaps  %xmm0,-16(%esp,%ebx,1)
-       ja      L014ecb_short_copy
-       movl    %esp,%esi
-       movl    %ecx,%ebx
-       jmp     L008ecb_loop
-.align 4,0x90
-L007ecb_aligned:
+       cmpl    $128,%ebp
+       movl    $127,%ebp
+       cmovael %eax,%ebp
+       andl    %ecx,%ebp
+       subl    %ebp,%ecx
+       jz      L015ecb_aligned_tail
        leal    -16(%edx),%eax
        leal    16(%edx),%ebx
        shrl    $4,%ecx
 .byte  243,15,167,200
-L013ecb_exit:
+       testl   %ebp,%ebp
+       jz      L014ecb_exit
+L015ecb_aligned_tail:
+       movl    %ebp,%ecx
+       leal    -24(%esp),%ebp
+       movl    %ebp,%esp
+       movl    %ebp,%eax
+       subl    %ecx,%esp
+       andl    $-16,%ebp
+       andl    $-16,%esp
+       movl    %eax,16(%ebp)
+       movl    %edi,%eax
+       movl    %ecx,%ebx
+       shrl    $2,%ecx
+       leal    (%esp),%edi
+.byte  243,165
+       movl    %esp,%esi
+       movl    %eax,%edi
+       movl    %ebx,%ecx
+       jmp     L007ecb_loop
+L014ecb_exit:
        movl    $1,%eax
        leal    4(%esp),%esp
 L004ecb_abort:
@@ -292,19 +340,17 @@ L_padlock_cbc_encrypt_begin:
        movl    28(%esp),%edx
        movl    32(%esp),%ecx
        testl   $15,%edx
-       jnz     L015cbc_abort
+       jnz     L016cbc_abort
        testl   $15,%ecx
-       jnz     L015cbc_abort
-       leal    Lpadlock_saved_context-L016cbc_pic_point,%eax
+       jnz     L016cbc_abort
+       leal    Lpadlock_saved_context-L017cbc_pic_point,%eax
        pushfl
        cld
        call    __padlock_verify_ctx
-L016cbc_pic_point:
+L017cbc_pic_point:
        leal    16(%edx),%edx
        xorl    %eax,%eax
        xorl    %ebx,%ebx
-       cmpl    $64,%ecx
-       jbe     L017cbc_short
        testl   $32,(%edx)
        jnz     L018cbc_aligned
        testl   $15,%edi
@@ -324,7 +370,25 @@ L016cbc_pic_point:
        negl    %eax
        andl    $511,%ebx
        leal    (%eax,%ebp,1),%esp
+       movl    $512,%eax
+       cmovzl  %eax,%ebx
+       movl    %ebp,%eax
+       andl    $-16,%ebp
        andl    $-16,%esp
+       movl    %eax,16(%ebp)
+       cmpl    %ebx,%ecx
+       ja      L019cbc_loop
+       movl    %esi,%eax
+       cmpl    %esp,%ebp
+       cmovel  %edi,%eax
+       addl    %ecx,%eax
+       negl    %eax
+       andl    $4095,%eax
+       cmpl    $64,%eax
+       movl    $-64,%eax
+       cmovael %ebx,%eax
+       andl    %eax,%ebx
+       jz      L020cbc_unaligned_tail
        jmp     L019cbc_loop
 .align 4,0x90
 L019cbc_loop:
@@ -336,13 +400,13 @@ L019cbc_loop:
        testl   $15,%edi
        cmovnzl %esp,%edi
        testl   $15,%esi
-       jz      L020cbc_inp_aligned
+       jz      L021cbc_inp_aligned
        shrl    $2,%ecx
 .byte  243,165
        subl    %ebx,%edi
        movl    %ebx,%ecx
        movl    %edi,%esi
-L020cbc_inp_aligned:
+L021cbc_inp_aligned:
        leal    -16(%edx),%eax
        leal    16(%edx),%ebx
        shrl    $4,%ecx
@@ -352,61 +416,93 @@ L020cbc_inp_aligned:
        movl    (%ebp),%edi
        movl    12(%ebp),%ebx
        testl   $15,%edi
-       jz      L021cbc_out_aligned
+       jz      L022cbc_out_aligned
        movl    %ebx,%ecx
-       shrl    $2,%ecx
        leal    (%esp),%esi
+       shrl    $2,%ecx
 .byte  243,165
        subl    %ebx,%edi
-L021cbc_out_aligned:
+L022cbc_out_aligned:
        movl    4(%ebp),%esi
        movl    8(%ebp),%ecx
        addl    %ebx,%edi
        addl    %ebx,%esi
        subl    %ebx,%ecx
        movl    $512,%ebx
-       jnz     L019cbc_loop
+       jz      L023cbc_break
+       cmpl    %ebx,%ecx
+       jae     L019cbc_loop
+L020cbc_unaligned_tail:
+       xorl    %eax,%eax
+       cmpl    %ebp,%esp
+       cmovel  %ecx,%eax
+       subl    %eax,%esp
+       movl    %edi,%eax
+       movl    %ecx,%ebx
+       shrl    $2,%ecx
+       leal    (%esp),%edi
+.byte  243,165
+       movl    %esp,%esi
+       movl    %eax,%edi
+       movl    %ebx,%ecx
+       jmp     L019cbc_loop
+.align 4,0x90
+L023cbc_break:
        cmpl    %ebp,%esp
-       je      L022cbc_done
+       je      L024cbc_done
        pxor    %xmm0,%xmm0
        leal    (%esp),%eax
-L023cbc_bzero:
+L025cbc_bzero:
        movaps  %xmm0,(%eax)
        leal    16(%eax),%eax
        cmpl    %eax,%ebp
-       ja      L023cbc_bzero
-L022cbc_done:
+       ja      L025cbc_bzero
+L024cbc_done:
+       movl    16(%ebp),%ebp
        leal    24(%ebp),%esp
-       jmp     L024cbc_exit
-.align 4,0x90
-L017cbc_short:
-       xorl    %eax,%eax
-       leal    -24(%esp),%ebp
-       subl    %ecx,%eax
-       leal    (%eax,%ebp,1),%esp
-       andl    $-16,%esp
-       xorl    %ebx,%ebx
-L025cbc_short_copy:
-       movups  (%esi,%ebx,1),%xmm0
-       leal    16(%ebx),%ebx
-       cmpl    %ebx,%ecx
-       movaps  %xmm0,-16(%esp,%ebx,1)
-       ja      L025cbc_short_copy
-       movl    %esp,%esi
-       movl    %ecx,%ebx
-       jmp     L019cbc_loop
+       jmp     L026cbc_exit
 .align 4,0x90
 L018cbc_aligned:
+       leal    (%esi,%ecx,1),%ebp
+       negl    %ebp
+       andl    $4095,%ebp
+       xorl    %eax,%eax
+       cmpl    $64,%ebp
+       movl    $63,%ebp
+       cmovael %eax,%ebp
+       andl    %ecx,%ebp
+       subl    %ebp,%ecx
+       jz      L027cbc_aligned_tail
        leal    -16(%edx),%eax
        leal    16(%edx),%ebx
        shrl    $4,%ecx
 .byte  243,15,167,208
        movaps  (%eax),%xmm0
        movaps  %xmm0,-16(%edx)
-L024cbc_exit:
+       testl   %ebp,%ebp
+       jz      L026cbc_exit
+L027cbc_aligned_tail:
+       movl    %ebp,%ecx
+       leal    -24(%esp),%ebp
+       movl    %ebp,%esp
+       movl    %ebp,%eax
+       subl    %ecx,%esp
+       andl    $-16,%ebp
+       andl    $-16,%esp
+       movl    %eax,16(%ebp)
+       movl    %edi,%eax
+       movl    %ecx,%ebx
+       shrl    $2,%ecx
+       leal    (%esp),%edi
+.byte  243,165
+       movl    %esp,%esi
+       movl    %eax,%edi
+       movl    %ebx,%ecx
+       jmp     L019cbc_loop
+L026cbc_exit:
        movl    $1,%eax
        leal    4(%esp),%esp
-L015cbc_abort:
+L016cbc_abort:
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -428,10 +524,10 @@ __win32_segv_handler:
        movl    4(%esp),%edx
        movl    12(%esp),%ecx
        cmpl    $3221225477,(%edx)
-       jne     L026ret
+       jne     L028ret
        addl    $4,184(%ecx)
        movl    $0,%eax
-L026ret:
+L028ret:
        ret
 .globl _padlock_sha1_oneshot
 .align 4


hooks/post-receive
-- 
GNU gnutls


