Re: [PATCH] ppc: Use hard-float in ppc fp_hlper as early as possible. Th

Benchmark results:

original:
-> FLOPS 3.00
GCC version: 4.3.3
Ops count: 1073217024
Time spent: 27.768 sec
MFLOPS: 38.65
FLOPS 3.00
GCC version: 4.3.3
Ops count: 1073217024
Time spent: 28.359 sec
MFLOPS: 37.84

soft-hard-float:

GCC version: 4.3.3
Ops count: 1073217024
Time spent: 14.874 sec
MFLOPS: 72.15
FLOPS 3.00
GCC version: 4.3.3
Ops count: 1073217024
Time spent: 14.249 sec
MFLOPS: 75.32

direct-hard-float:

-> FLOPS 3.00
GCC version: 4.3.3
Ops count: 1073217024
Time spent: 13.021 sec
MFLOPS: 82.42
FLOPS 3.00
GCC version: 4.3.3
Ops count: 1073217024
Time spent: 12.472 sec
MFLOPS: 86.05
FLOPS 3.00
GCC version: 4.3.3
Ops count: 1073217024
Time spent: 11.803 sec
MFLOPS: 90.93
FLOPS 3.00
GCC version: 4.3.3
Ops count: 1073217024
Time spent: 11.945 sec
MFLOPS: 89.85

bench program:

```

#include <stdio.h>
#include <stdlib.h>
#ifdef __vxworks
#include <sys/resource.h>
#include <vxworks.h>
#include <timers.h>
#include <time.h>
#elif defined(_MSC_VER)
#include <Windows.h>
#include <time.h>
#else
#include <time.h>
#endif
/*
cl -O2 test_flops.c
gcc -O2 test_flops.c -o test_flops

*/
/*
 * Matrix dimension and floating-point operation count.
 *
 * Default build (DIM not predefined): DIM is 1024 and nop is the
 * precomputed number of operations one dge() pass performs at that size.
 * dge() updates 1023 + 1022 + ... + 1 = 523776 rows, and each row update
 * costs 1 division plus 2 operations per column (2049 in total), giving
 * 523776 x 2049 = 1073217024.
 *
 * Custom build (DIM supplied on the command line, e.g. -DDIM=256): COUNT
 * is defined, nop starts at zero, and dge() tallies the operations into
 * nop while it runs.
 */
#ifndef DIM
#define DIM 1024
const long long int nop = 1073217024;
#else
#define COUNT
long long int nop = 0;
#endif

/*
 * Write the matrix to stdout, one text line per matrix row.
 * Every element is formatted with "%6.3f"; no extra separator is emitted
 * between elements.
 */
void printm(double A[DIM][DIM])
{
    int row = 0;

    while (row < DIM) {
        const double *cells = A[row];
        int col;

        for (col = 0; col < DIM; ++col) {
            printf("%6.3f", cells[col]);
        }
        printf("\n");
        ++row;
    }
}

/*
 * Fill the matrix with pseudo-random values in the range [-0.5, 0.5].
 * rand() is re-seeded with a fixed value on every call, and elements are
 * assigned in row-major order, so the contents depend only on the C
 * library's rand() implementation.
 */
void initm(double A[DIM][DIM])
{
    int row;

    srand(38741);
    for (row = 0; row < DIM; ++row) {
        double *cells = A[row];
        int col = 0;

        while (col < DIM) {
            /* rand() / RAND_MAX lies in [0, 1]; shift it to be centred on 0. */
            cells[col] = (double)rand() / (double)RAND_MAX - 0.5;
            ++col;
        }
    }
}

/*
 * Benchmark kernel: in-place forward elimination without row exchanges.
 *
 * For each pivot row p (0 .. DIM-2), every row below it has
 * (row[p] / pivot[p]) * pivot subtracted from it, across all DIM columns.
 * The pivot element is not checked for zero.
 *
 * When COUNT is defined, the global nop is advanced by 1 for each division
 * and by 2 for each multiply/subtract pair.
 */
void dge(double A[DIM][DIM])
{
    int pivot;

    for (pivot = 0; pivot + 1 < DIM; ++pivot) {
        double *pivot_row = A[pivot];
        int row;

        for (row = pivot + 1; row < DIM; ++row) {
            double *target = A[row];
            double factor = target[pivot] / pivot_row[pivot];
            int col;
#ifdef COUNT
            nop += 1;
#endif
            for (col = 0; col < DIM; ++col) {
                target[col] -= factor * pivot_row[col];
#ifdef COUNT
                nop += 2;
#endif
            }
        }
    }
}

/* Work matrix of the benchmark: main() fills it with initm() and reduces it with dge(). */
double X[DIM][DIM];

/*
 * Return a timestamp in seconds with sub-second precision.
 *
 * Only the difference between two return values is meaningful: the
 * underlying counters start at an unspecified point.
 *
 * Clock used:
 *   - MSVC: QueryPerformanceCounter scaled by QueryPerformanceFrequency
 *     (the frequency is queried once and cached).
 *   - Elsewhere: CLOCK_MONOTONIC when the platform defines it, so that a
 *     wall-clock adjustment during the measurement cannot distort the
 *     elapsed time; otherwise the wall clock, read through clock_gettime()
 *     or, when the POSIX clock ids are not visible (e.g. strict -std=c11),
 *     through C11 timespec_get().
 *
 * Returns 0.0 if no clock could be read.
 */
double get_seconds(void)
{
#ifdef _MSC_VER
    static LARGE_INTEGER frequency;
    LARGE_INTEGER now;

    if (frequency.QuadPart == 0)
        QueryPerformanceFrequency(&frequency);
    QueryPerformanceCounter(&now);
    return (now.QuadPart * 1.0) / frequency.QuadPart;
#else
    struct timespec now;

#if defined(CLOCK_MONOTONIC)
    if (clock_gettime(CLOCK_MONOTONIC, &now) == 0)
        return now.tv_sec + now.tv_nsec * 1e-9;
#endif
#if !defined(CLOCK_REALTIME) && defined(TIME_UTC)
    /* POSIX clock ids hidden by the feature-test macros: use the C11 call. */
    if (timespec_get(&now, TIME_UTC) != 0)
        return now.tv_sec + now.tv_nsec * 1e-9;
#else
    if (clock_gettime(CLOCK_REALTIME, &now) == 0)
        return now.tv_sec + now.tv_nsec * 1e-9;
#endif
    return 0.0;
#endif
}

/*
 * Benchmark driver.
 *
 * Prints the sum 1.0 + 2.0 and the compiler version, initialises X, runs
 * dge(X) `count` times and reports the operation count, the elapsed time
 * and the resulting MFLOPS. With PRINTM defined the final matrix is dumped
 * as well.
 *
 * argv[1] (ignored on VxWorks) is the optional number of dge() passes;
 * it defaults to 1 and must be a positive integer.
 *
 * Returns 0 on success, 1 if the repetition count is invalid.
 */
int main (int argc, char **argv)
{
    double a = 1.0;
    double b = 2.0;
    double c = a + b;
    double t;
    long long total_ops;
    int count = 1;
    int i;

    printf("FLOPS %.2lf\n", c);
#ifdef _MSC_VER
    printf("MSC_VER version: %d\n", _MSC_VER);
#else
    printf("GCC version: " __VERSION__ "\n");
#endif
    initm(X);
#ifndef __vxworks
    /* Parse before starting the clock so only dge() is timed. */
    if (argc > 1 && (sscanf(argv[1], "%d", &count) != 1 || count < 1)) {
        fprintf(stderr, "invalid repetition count: %s\n", argv[1]);
        return 1;
    }
#endif
    t = get_seconds();
    for (i = 0; i < count; i += 1) {
        dge(X);
    }
    t = get_seconds() - t;
#ifdef COUNT
    /* dge() has already accumulated the operations of every pass into nop. */
    total_ops = nop;
#else
    /* nop is the precomputed cost of a single pass. */
    total_ops = nop * count;
#endif
    printf("Ops count: %lld\n", total_ops);
    printf("Time spent: %.3lf sec\n", t);
    printf("MFLOPS: %.2f\n", 1e-6 * total_ops / t );
#ifdef PRINTM
    printm(X);
#endif
    return 0;
}

```

On Tue, May 5, 2020 at 3:30 AM <address@hidden> wrote:

From: Yonggang Luo <address@hidden>

Just posting this as an idea to improve PPC FP performance.
With this approach, there is no need to adjust the order of the helpers.

Signed-off-by: Yonggang Luo <address@hidden>
---
target/ppc/fpu_helper.c | 44 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 44 insertions(+)

diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index 2bd49a2cdf..79051e4540 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -926,6 +926,17 @@ static void float_invalid_op_addsub(CPUPPCState *env, bool set_fpcc,
/* fadd - fadd. */
float64 helper_fadd(CPUPPCState *env, float64 arg1, float64 arg2)
{
+ CPU_DoubleU u1, u2;
+
+ u1.d = arg1;
+ u2.d = arg2;
+ CPU_DoubleU retDouble;
+ retDouble.nd = u1.nd + u2.nd;
+ if (likely(float64_is_zero_or_normal(retDouble.d)))
+ {
+ /* TODO: Handling inexact */
+ return retDouble.d;
+ }
float64 ret = float64_add(arg1, arg2, &env->fp_status);
int status = get_float_exception_flags(&env->fp_status);

@@ -941,6 +952,17 @@ float64 helper_fadd(CPUPPCState *env, float64 arg1, float64 arg2)
/* fsub - fsub. */
float64 helper_fsub(CPUPPCState *env, float64 arg1, float64 arg2)
{
+ CPU_DoubleU u1, u2;
+
+ u1.d = arg1;
+ u2.d = arg2;
+ CPU_DoubleU retDouble;
+ retDouble.nd = u1.nd - u2.nd;
+ if (likely(float64_is_zero_or_normal(retDouble.d)))
+ {
+ /* TODO: Handling inexact */
+ return retDouble.d;
+ }
float64 ret = float64_sub(arg1, arg2, &env->fp_status);
int status = get_float_exception_flags(&env->fp_status);

@@ -967,6 +989,17 @@ static void float_invalid_op_mul(CPUPPCState *env, bool set_fprc,
/* fmul - fmul. */
float64 helper_fmul(CPUPPCState *env, float64 arg1, float64 arg2)
{
+ CPU_DoubleU u1, u2;
+
+ u1.d = arg1;
+ u2.d = arg2;
+ CPU_DoubleU retDouble;
+ retDouble.nd = u1.nd * u2.nd;
+ if (likely(float64_is_zero_or_normal(retDouble.d)))
+ {
+ /* TODO: Handling inexact */
+ return retDouble.d;
+ }
float64 ret = float64_mul(arg1, arg2, &env->fp_status);
int status = get_float_exception_flags(&env->fp_status);

@@ -997,6 +1030,17 @@ static void float_invalid_op_div(CPUPPCState *env, bool set_fprc,
/* fdiv - fdiv. */
float64 helper_fdiv(CPUPPCState *env, float64 arg1, float64 arg2)
{
+ CPU_DoubleU u1, u2;
+
+ u1.d = arg1;
+ u2.d = arg2;
+ CPU_DoubleU retDouble;
+ retDouble.nd = u1.nd / u2.nd;
+ if (likely(float64_is_zero_or_normal(retDouble.d)))
+ {
+ /* TODO: Handling inexact */
+ return retDouble.d;
+ }
float64 ret = float64_div(arg1, arg2, &env->fp_status);
int status = get_float_exception_flags(&env->fp_status);

--
2.23.0.windows.1

From:	Yonggang Luo
Subject:	Re: [PATCH] ppc: Use hard-float in ppc fp_hlper as early as possible. This would increase the performance better than enable hard-float it in soft-float.c; Just using fadd fsub fmul fdiv as a simple bench demo. With this patch, performance are increased 2x. and 1.3x than the one enable hard-float in soft-float.c Both version are not considerate inexact fp exception yet.
Date:	Tue, 5 May 2020 04:02:12 +0800