[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[lmi-commits] [lmi] master e5850c2a 12/13: Import Naohiko Shimizu's NetB
From: |
Greg Chicares |
Subject: |
[lmi-commits] [lmi] master e5850c2a 12/13: Import Naohiko Shimizu's NetBSD performance improvements |
Date: |
Mon, 23 May 2022 17:40:27 -0400 (EDT) |
branch: master
commit e5850c2a03126de2ad62545a3b673266a3efa591
Author: Gregory W. Chicares <gchicares@sbcglobal.net>
Commit: Gregory W. Chicares <gchicares@sbcglobal.net>
Import Naohiko Shimizu's NetBSD performance improvements
The canonical netlib.org sources don't incorporate these changes,
but they're in glibc. They account for the discrepancies observed
between the netlib.org originals and glibc:
x86_64, sse, gcc, posix
test_expm1_log1p: 1000000 trials
0 errors [worst 0.00000000000000000000000 ulp] in expm1()
0 errors [worst 0.00000000000000000000000 ulp] in log1p()
0 errors [worst 0.00000000000000000000000 ulp] in monthly int
0 errors [worst 0.00000000000000000000000 ulp] in daily int
and also for the speed difference, which was formerly:
lmi::expm1() 8.430e-04 s mean; 819 us least of 100 runs
std::expm1() 8.169e-04 s mean; 751 us least of 100 runs
lmi::log1p() 8.999e-04 s mean; 871 us least of 100 runs
std::log1p() 8.085e-04 s mean; 796 us least of 100 runs
but is now negligible:
lmi::expm1() 7.583e-04 s mean; 751 us least of 100 runs
std::expm1() 7.855e-04 s mean; 762 us least of 100 runs
lmi::log1p() 7.789e-04 s mean; 770 us least of 100 runs
std::log1p() 7.998e-04 s mean; 797 us least of 100 runs
---
fdlibm_expm1.c | 35 +++++++++++++++++++++++++++--------
fdlibm_log1p.c | 40 +++++++++++++++++++++++++++++++---------
2 files changed, 58 insertions(+), 17 deletions(-)
diff --git a/fdlibm_expm1.c b/fdlibm_expm1.c
index 3fcec962..db3c7a45 100644
--- a/fdlibm_expm1.c
+++ b/fdlibm_expm1.c
@@ -48,6 +48,12 @@
* ====================================================
*/
+// $NetBSD: s_expm1.c,v 1.8 1995/05/10 20:47:09 jtc Exp $
+/* Modified by Naohiko Shimizu/Tokai University, Japan 1997/08/25,
+ for performance improvement on pipelined processors.
+//
https://sourceware.org/git/?p=glibc.git;a=blobdiff;f=sysdeps/libm-ieee754/s_expm1.c;h=ed1aba527be837d4ed7e65e9c86b3fc3e14e65d7;hp=c8354b7262686370a761494dc785bb59fdf45477;hb=923609d1497f3116d57b297e3e84fc07b2b15b20;hpb=0d9f67937f0c9329c35c2c0d15848ab8316dc520
+*/
+
/* expm1(x)
* Returns exp(x)-1, the exponential of x minus 1.
*
@@ -144,7 +150,6 @@
*/
static const double
-one = 1.0,
huge = 1.0e+300,
tiny = 1.0e-300,
o_threshold = 7.09782712893383973096e+02,/* 0x40862E42, 0xFEFA39EF */
@@ -152,15 +157,24 @@ ln2_hi = 6.93147180369123816490e-01,/* 0x3fe62e42,
0xfee00000 */
ln2_lo = 1.90821492927058770002e-10,/* 0x3dea39ef, 0x35793c76 */
invln2 = 1.44269504088896338700e+00,/* 0x3ff71547, 0x652b82fe */
/* scaled coefficients related to expm1 */
-Q1 = -3.33333333333331316428e-02, /* BFA11111 111110F4 */
-Q2 = 1.58730158725481460165e-03, /* 3F5A01A0 19FE5585 */
-Q3 = -7.93650757867487942473e-05, /* BF14CE19 9EAADBB7 */
-Q4 = 4.00821782732936239552e-06, /* 3ED0CFCA 86E65239 */
-Q5 = -2.01099218183624371326e-07; /* BE8AFDB7 6E09C32D */
+Q[] = { 1.0
+ ,-3.33333333333331316428e-02 /* BFA11111 111110F4 */
+ , 1.58730158725481460165e-03 /* 3F5A01A0 19FE5585 */
+ ,-7.93650757867487942473e-05 /* BF14CE19 9EAADBB7 */
+ , 4.00821782732936239552e-06 /* 3ED0CFCA 86E65239 */
+ ,-2.01099218183624371326e-07 /* BE8AFDB7 6E09C32D */
+};
+
+#define one Q[0]
+#define Q1 Q[1]
+#define Q2 Q[2]
+#define Q3 Q[3]
+#define Q4 Q[4]
+#define Q5 Q[5]
double fdlibm_expm1(double x)
{
- double y,hi,lo,c,t,e,hxs,hfx,r1;
+ double y,hi,lo,c,t,e,hxs,hfx,r1,h2,h4,R1,R2,R3;
int32_t k,xsb;
uint32_t hx;
@@ -210,7 +224,12 @@ double fdlibm_expm1(double x)
/* x is now in primary range */
hfx = 0.5*x;
hxs = x*hfx;
- r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
+// performance improvement: Naohiko Shimizu 19970825
+// r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
+ R1 = one+hxs*Q1; h2 = hxs*hxs;
+ R2 = Q2+hxs*Q3; h4 = h2*h2;
+ R3 = Q4+hxs*Q5;
+ r1 = R1 + h2*R2 + h4*R3;
t = 3.0-r1*hfx;
e = hxs*((r1-t)/(6.0 - x*t));
if(k==0) return x - (x*e-hxs); /* c is 0 */
diff --git a/fdlibm_log1p.c b/fdlibm_log1p.c
index ec6a1499..608ba12b 100644
--- a/fdlibm_log1p.c
+++ b/fdlibm_log1p.c
@@ -47,6 +47,12 @@
* ====================================================
*/
+// $NetBSD: s_log1p.c,v 1.8 1995/05/10 20:47:46 jtc Exp $
+/* Modified by Naohiko Shimizu/Tokai University, Japan 1997/08/25,
+ for performance improvement on pipelined processors.
+//
https://sourceware.org/git/?p=glibc.git;a=blobdiff;f=sysdeps/libm-ieee754/s_log1p.c;h=4ca01b1ff555693a3fa3459df82bef1c24c82668;hp=086c0dce6c118e2364aa883d0997d32493d43a97;hb=923609d1497f3116d57b297e3e84fc07b2b15b20;hpb=0d9f67937f0c9329c35c2c0d15848ab8316dc520
+*/
+
/* double log1p(double x)
*
* Method :
@@ -116,19 +122,29 @@ static const double
ln2_hi = 6.93147180369123816490e-01, /* 3fe62e42 fee00000 */
ln2_lo = 1.90821492927058770002e-10, /* 3dea39ef 35793c76 */
two54 = 1.80143985094819840000e+16, /* 43500000 00000000 */
-Lp1 = 6.666666666666735130e-01, /* 3FE55555 55555593 */
-Lp2 = 3.999999999940941908e-01, /* 3FD99999 9997FA04 */
-Lp3 = 2.857142874366239149e-01, /* 3FD24924 94229359 */
-Lp4 = 2.222219843214978396e-01, /* 3FCC71C5 1D8E78AF */
-Lp5 = 1.818357216161805012e-01, /* 3FC74664 96CB03DE */
-Lp6 = 1.531383769920937332e-01, /* 3FC39A09 D078C69F */
-Lp7 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */
+Lp[] = {0.0 // not used
+ ,6.666666666666735130e-01 /* 3FE55555 55555593 */
+ ,3.999999999940941908e-01 /* 3FD99999 9997FA04 */
+ ,2.857142874366239149e-01 /* 3FD24924 94229359 */
+ ,2.222219843214978396e-01 /* 3FCC71C5 1D8E78AF */
+ ,1.818357216161805012e-01 /* 3FC74664 96CB03DE */
+ ,1.531383769920937332e-01 /* 3FC39A09 D078C69F */
+ ,1.479819860511658591e-01 /* 3FC2F112 DF3E5244 */
+};
+
+#define Lp1 Lp[1]
+#define Lp2 Lp[2]
+#define Lp3 Lp[3]
+#define Lp4 Lp[4]
+#define Lp5 Lp[5]
+#define Lp6 Lp[6]
+#define Lp7 Lp[7]
static double zero = 0.0;
double fdlibm_log1p(double x)
{
- double hfsq,f,c,s,z,R,u;
+ double hfsq,f,c,s,z,R,u,z2,z4,z6,R1,R2,R3,R4;
int32_t k,hx,hu,ax;
hx = FDLIBM_HI(x); /* high word of x */
@@ -187,7 +203,13 @@ double fdlibm_log1p(double x)
}
s = f/(2.0+f);
z = s*s;
- R = z*(Lp1+z*(Lp2+z*(Lp3+z*(Lp4+z*(Lp5+z*(Lp6+z*Lp7))))));
+// performance improvement: Naohiko Shimizu 19970825
+// R = z*(Lp1+z*(Lp2+z*(Lp3+z*(Lp4+z*(Lp5+z*(Lp6+z*Lp7))))));
+ R1 = z*Lp1; z2=z*z;
+ R2 = Lp2+z*Lp3; z4=z2*z2;
+ R3 = Lp4+z*Lp5; z6=z4*z2;
+ R4 = Lp6+z*Lp7;
+ R = R1 + z2*R2 + z4*R3 + z6*R4;
if(k==0) return f-(hfsq-s*(hfsq+R)); else
return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f);
}
- [lmi-commits] [lmi] master updated (ad1f8a2f -> 599c44b7), Greg Chicares, 2022/05/23
- [lmi-commits] [lmi] master 38dd359d 04/13: Apply 'fdlibm*.c' changes to 'fdlibm.h', Greg Chicares, 2022/05/23
- [lmi-commits] [lmi] master a456d40d 03/13: Expunge unwanted part of fdlibm header, Greg Chicares, 2022/05/23
- [lmi-commits] [lmi] master 9d7e3be1 02/13: Import header from fdlibm, Greg Chicares, 2022/05/23
- [lmi-commits] [lmi] master e5850c2a 12/13: Import Naohiko Shimizu's NetBSD performance improvements,
Greg Chicares <=
- [lmi-commits] [lmi] master bfb73bb2 09/13: Use fdlibm header more broadly, Greg Chicares, 2022/05/23
- [lmi-commits] [lmi] master 599c44b7 13/13: Draw attention to atrocious type puns, Greg Chicares, 2022/05/23
- [lmi-commits] [lmi] master 164db076 10/13: Assume __STDC__ for fdlibm, Greg Chicares, 2022/05/23
- [lmi-commits] [lmi] master 7d5c739b 01/13: Expunge unused declarations and macro, Greg Chicares, 2022/05/23
- [lmi-commits] [lmi] master d7294669 06/13: Add include guards; include a header required by lmi, Greg Chicares, 2022/05/23
- [lmi-commits] [lmi] master adf263fe 11/13: Assume __STDC__ everywhere appropriate, Greg Chicares, 2022/05/23
- [lmi-commits] [lmi] master edb56e9b 05/13: Add license and copyright boilerplate, Greg Chicares, 2022/05/23
- [lmi-commits] [lmi] master 50b2645a 08/13: Use fdlibm header, Greg Chicares, 2022/05/23
- [lmi-commits] [lmi] master 89966e80 07/13: Rename fdlibm header, Greg Chicares, 2022/05/23