$ cat /proc/cpuinfo processor : 0 vendor_id : GenuineIntel cpu family : 6 model : 42 model name : Genuine Intel(R) CPU 0 @ 2.50GHz
stepping : 5 cpu MHz : 800.000 cache size : 3072 KB physical id : 0 siblings : 4 core id : 0 cpu cores : 2 apicid : 0 initial apicid : 0 fpu : yes
fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 x2apic popcnt aes xsave avx lahf_lm ida arat pln pts dts tpr_shadow vnmi flexpriority ept vpid
bogomips : 4988.10 clflush size : 64 cache_alignment : 64 address sizes : 36 bits physical, 48 bits virtual power management:
You can see that it is working in 64 mode.
Checkout coreutils git lastest version and compile, here is original speed:
$ \time ./src/sha1sum < big_zero 1dd775261d7abab0b66910acc1d827a2c3799eaf - 3.71user 0.20system 0:03.92elapsed 99%CPU (0avgtext+0avgdata 2528maxresident)k 0inputs+0outputs (0major+195minor)pagefaults 0swaps
Using -O3 to compile: (make CFLAGS="-g -O3") instead of default -O2: $ \time ./src/sha1sum < big_zero 1dd775261d7abab0b66910acc1d827a2c3799eaf - 3.47user 0.14system 0:03.62elapsed 99%CPU (0avgtext+0avgdata 2512maxresident)k
0inputs+0outputs (0major+195minor)pagefaults 0swaps
Just changing SWAP to ntohl does have a significant effect on my platform (the bswap instruction is used). According to Linus comment, this should be done only in platform having fast unalinged load:
$ diff -uN lib/sha1.c.orig lib/sha1.c --- lib/sha1.c.orig 2011-09-05 23:31:23.128552012 +0200 +++ lib/sha1.c 2011-09-05 23:29:58.988552016 +0200 @@ -29,6 +29,7 @@ #include <stddef.h> #include <stdlib.h>
#include <string.h> +#include <arpa/inet.h>
#if USE_UNLOCKED_IO # include "unlocked-io.h" @@ -333,7 +334,7 @@ int t; for (t = 0; t < 16; t++) {
- x[t] = SWAP (*words); + x[t] = ntohl(*words); words++; }
-#define R(A,B,C,D,E,F,K,M) do { E += rol( A, 5 ) \ +#define R(A,B,C,D,E,F,K,M,I) do { unsigned int TEMP = M(I); x[I&0x0f] = TEMP;\ + E += rol( A, 5 ) \ + F( B, C, D ) \
+ K \ - + M; \ + + TEMP; \ B = rol( B, 30 ); \
} while(0)
+ while (words < endp) { - uint32_t tm; - int t; - for (t = 0; t < 16; t++) - { - x[t] = ntohl(*words);
- words++; - } - - R( a, b, c, d, e, F1, K1, x[ 0] ); - R( e, a, b, c, d, F1, K1, x[ 1] ); - R( d, e, a, b, c, F1, K1, x[ 2] ); - R( c, d, e, a, b, F1, K1, x[ 3] );
- R( b, c, d, e, a, F1, K1, x[ 4] ); - R( a, b, c, d, e, F1, K1, x[ 5] ); - R( e, a, b, c, d, F1, K1, x[ 6] ); - R( d, e, a, b, c, F1, K1, x[ 7] ); - R( c, d, e, a, b, F1, K1, x[ 8] );
- R( b, c, d, e, a, F1, K1, x[ 9] ); - R( a, b, c, d, e, F1, K1, x[10] ); - R( e, a, b, c, d, F1, K1, x[11] ); - R( d, e, a, b, c, F1, K1, x[12] ); - R( c, d, e, a, b, F1, K1, x[13] );
- R( b, c, d, e, a, F1, K1, x[14] ); - R( a, b, c, d, e, F1, K1, x[15] ); - R( e, a, b, c, d, F1, K1, M(16) ); - R( d, e, a, b, c, F1, K1, M(17) ); - R( c, d, e, a, b, F1, K1, M(18) );
- R( b, c, d, e, a, F1, K1, M(19) ); - R( a, b, c, d, e, F2, K2, M(20) ); - R( e, a, b, c, d, F2, K2, M(21) ); - R( d, e, a, b, c, F2, K2, M(22) ); - R( c, d, e, a, b, F2, K2, M(23) );
- R( b, c, d, e, a, F2, K2, M(24) ); - R( a, b, c, d, e, F2, K2, M(25) ); - R( e, a, b, c, d, F2, K2, M(26) ); - R( d, e, a, b, c, F2, K2, M(27) ); - R( c, d, e, a, b, F2, K2, M(28) );
- R( b, c, d, e, a, F2, K2, M(29) ); - R( a, b, c, d, e, F2, K2, M(30) ); - R( e, a, b, c, d, F2, K2, M(31) ); - R( d, e, a, b, c, F2, K2, M(32) ); - R( c, d, e, a, b, F2, K2, M(33) );
- R( b, c, d, e, a, F2, K2, M(34) ); - R( a, b, c, d, e, F2, K2, M(35) ); - R( e, a, b, c, d, F2, K2, M(36) ); - R( d, e, a, b, c, F2, K2, M(37) ); - R( c, d, e, a, b, F2, K2, M(38) );
- R( b, c, d, e, a, F2, K2, M(39) ); - R( a, b, c, d, e, F3, K3, M(40) ); - R( e, a, b, c, d, F3, K3, M(41) ); - R( d, e, a, b, c, F3, K3, M(42) ); - R( c, d, e, a, b, F3, K3, M(43) );
- R( b, c, d, e, a, F3, K3, M(44) ); - R( a, b, c, d, e, F3, K3, M(45) ); - R( e, a, b, c, d, F3, K3, M(46) ); - R( d, e, a, b, c, F3, K3, M(47) ); - R( c, d, e, a, b, F3, K3, M(48) );
- R( b, c, d, e, a, F3, K3, M(49) ); - R( a, b, c, d, e, F3, K3, M(50) ); - R( e, a, b, c, d, F3, K3, M(51) ); - R( d, e, a, b, c, F3, K3, M(52) ); - R( c, d, e, a, b, F3, K3, M(53) );
- R( b, c, d, e, a, F3, K3, M(54) ); - R( a, b, c, d, e, F3, K3, M(55) ); - R( e, a, b, c, d, F3, K3, M(56) ); - R( d, e, a, b, c, F3, K3, M(57) ); - R( c, d, e, a, b, F3, K3, M(58) );
- R( b, c, d, e, a, F3, K3, M(59) ); - R( a, b, c, d, e, F4, K4, M(60) ); - R( e, a, b, c, d, F4, K4, M(61) ); - R( d, e, a, b, c, F4, K4, M(62) ); - R( c, d, e, a, b, F4, K4, M(63) );
- R( b, c, d, e, a, F4, K4, M(64) ); - R( a, b, c, d, e, F4, K4, M(65) ); - R( e, a, b, c, d, F4, K4, M(66) ); - R( d, e, a, b, c, F4, K4, M(67) ); - R( c, d, e, a, b, F4, K4, M(68) );
- R( b, c, d, e, a, F4, K4, M(69) ); - R( a, b, c, d, e, F4, K4, M(70) ); - R( e, a, b, c, d, F4, K4, M(71) ); - R( d, e, a, b, c, F4, K4, M(72) ); - R( c, d, e, a, b, F4, K4, M(73) );
- R( b, c, d, e, a, F4, K4, M(74) ); - R( a, b, c, d, e, F4, K4, M(75) ); - R( e, a, b, c, d, F4, K4, M(76) ); - R( d, e, a, b, c, F4, K4, M(77) ); - R( c, d, e, a, b, F4, K4, M(78) );
- R( b, c, d, e, a, F4, K4, M(79) ); + R( a, b, c, d, e, F1, K1, X, 0); + R( e, a, b, c, d, F1, K1, X, 1); + R( d, e, a, b, c, F1, K1, X, 2); + R( c, d, e, a, b, F1, K1, X, 3);
+ R( b, c, d, e, a, F1, K1, X, 4); + R( a, b, c, d, e, F1, K1, X, 5); + R( e, a, b, c, d, F1, K1, X, 6); + R( d, e, a, b, c, F1, K1, X, 7); + R( c, d, e, a, b, F1, K1, X, 8);
+ R( b, c, d, e, a, F1, K1, X, 9);
+ R( a, b, c, d, e, F1, K1, X, 10); + R( e, a, b, c, d, F1, K1, X, 11); + R( d, e, a, b, c, F1, K1, X, 12); + R( c, d, e, a, b, F1, K1, X, 13); + R( b, c, d, e, a, F1, K1, X, 14);
+ R( a, b, c, d, e, F1, K1, X, 15);
+ R( e, a, b, c, d, F1, K1, M, 16); + R( d, e, a, b, c, F1, K1, M, 17); + R( c, d, e, a, b, F1, K1, M, 18); + R( b, c, d, e, a, F1, K1, M, 19); + R( a, b, c, d, e, F2, K2, M, 20);
+ R( e, a, b, c, d, F2, K2, M, 21);
+ R( d, e, a, b, c, F2, K2, M, 22); + R( c, d, e, a, b, F2, K2, M, 23); + R( b, c, d, e, a, F2, K2, M, 24); + R( a, b, c, d, e, F2, K2, M, 25); + R( e, a, b, c, d, F2, K2, M, 26);
+ R( d, e, a, b, c, F2, K2, M, 27);
+ R( c, d, e, a, b, F2, K2, M, 28); + R( b, c, d, e, a, F2, K2, M, 29); + R( a, b, c, d, e, F2, K2, M, 30); + R( e, a, b, c, d, F2, K2, M, 31); + R( d, e, a, b, c, F2, K2, M, 32);
+ R( c, d, e, a, b, F2, K2, M, 33);
+ R( b, c, d, e, a, F2, K2, M, 34); + R( a, b, c, d, e, F2, K2, M, 35); + R( e, a, b, c, d, F2, K2, M, 36); + R( d, e, a, b, c, F2, K2, M, 37); + R( c, d, e, a, b, F2, K2, M, 38);
+ R( b, c, d, e, a, F2, K2, M, 39);
+ R( a, b, c, d, e, F3, K3, M, 40); + R( e, a, b, c, d, F3, K3, M, 41); + R( d, e, a, b, c, F3, K3, M, 42); + R( c, d, e, a, b, F3, K3, M, 43); + R( b, c, d, e, a, F3, K3, M, 44);
+ R( a, b, c, d, e, F3, K3, M, 45);
+ R( e, a, b, c, d, F3, K3, M, 46); + R( d, e, a, b, c, F3, K3, M, 47); + R( c, d, e, a, b, F3, K3, M, 48); + R( b, c, d, e, a, F3, K3, M, 49); + R( a, b, c, d, e, F3, K3, M, 50);
+ R( e, a, b, c, d, F3, K3, M, 51);
+ R( d, e, a, b, c, F3, K3, M, 52); + R( c, d, e, a, b, F3, K3, M, 53); + R( b, c, d, e, a, F3, K3, M, 54); + R( a, b, c, d, e, F3, K3, M, 55); + R( e, a, b, c, d, F3, K3, M, 56);
+ R( d, e, a, b, c, F3, K3, M, 57);
+ R( c, d, e, a, b, F3, K3, M, 58); + R( b, c, d, e, a, F3, K3, M, 59); + R( a, b, c, d, e, F4, K4, M, 60); + R( e, a, b, c, d, F4, K4, M, 61); + R( d, e, a, b, c, F4, K4, M, 62);
+ R( c, d, e, a, b, F4, K4, M, 63);
+ R( b, c, d, e, a, F4, K4, M, 64); + R( a, b, c, d, e, F4, K4, M, 65); + R( e, a, b, c, d, F4, K4, M, 66); + R( d, e, a, b, c, F4, K4, M, 67); + R( c, d, e, a, b, F4, K4, M, 68);
+ R( b, c, d, e, a, F4, K4, M, 69);
+ R( a, b, c, d, e, F4, K4, M, 70); + R( e, a, b, c, d, F4, K4, M, 71); + R( d, e, a, b, c, F4, K4, M, 72); + R( c, d, e, a, b, F4, K4, M, 73); + R( b, c, d, e, a, F4, K4, M, 74);
+ R( a, b, c, d, e, F4, K4, M, 75);
+ R( e, a, b, c, d, F4, K4, M, 76); + R( d, e, a, b, c, F4, K4, M, 77); + R( c, d, e, a, b, F4, K4, M, 78); + R( b, c, d, e, a, F4, K4, M, 79);
a = ctx->A += a; b = ctx->B += b;
c = ctx->C += c; d = ctx->D += d; e = ctx->E += e; + words += 16; } }
I can't explain exactly how but this does have an effect: $ \time ./src/sha1sum < big_zero
1dd775261d7abab0b66910acc1d827a2c3799eaf - 3.06user 0.11system 0:03.17elapsed 99%CPU (0avgtext+0avgdata 2544maxresident)k 0inputs+0outputs (0major+197minor)pagefaults 0swaps
Then, the volatile addition which, according to Linus, is working great on architecture with not enough registers to accomodate the whole x[] array by preventing gcc from missing with the registers. This is a small patch:
-#define R(A,B,C,D,E,F,K,M,I) do { unsigned int TEMP = M(I); x[I&0x0f] = TEMP;\ +#define R(A,B,C,D,E,F,K,M,I) do { unsigned int TEMP = M(I); *(volatile unsigned int *)&x[I&0x0f] = TEMP;\ E += rol( A, 5 ) \
+ F( B, C, D ) \ + K \
This does have a very big impact on speed: $ \time ./src/sha1sum < big_zero 1dd775261d7abab0b66910acc1d827a2c3799eaf -
2.57user 0.11system 0:02.69elapsed 99%CPU (0avgtext+0avgdata 2544maxresident)k 0inputs+0outputs (0major+196minor)pagefaults 0swaps
Eventually, the last performance stuff in Linus' sha1 is the use of inline asm for ror and rol instruction, but it seems it has no impact on performance for me, so I don't include it here.
Clearly, those patches are not clean, but I can had the necessary stuff so that ntohl and volatile write access are used when necessary by compilation directives like it is done in Linus' sha1.c.
My question is: would you consider such a patch for inclusion? is it the correct direction? Maybe, a better alternative is to just include block-sha1 from Linus directly as it seems that there is no license problems (http://lists.gnu.org/archive/html/bug-coreutils/2009-08/msg00136.html).
Also note that similar work can be done for sha256sum and sha512sum, I could work on this if it has a chance to be included.