freetype-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[freetype2] master 7b308a29d: * src/smooth/ftgrays.c (gray_render_conic) [SSE2]: Improve flow.


From: Werner Lemberg
Subject: [freetype2] master 7b308a29d: * src/smooth/ftgrays.c (gray_render_conic) [SSE2]: Improve flow.
Date: Thu, 28 Sep 2023 22:58:46 -0400 (EDT)

branch: master
commit 7b308a29dd105074eea9c8d5953a182d325f74f1
Author: Alexei Podtelezhnikov <apodtele@gmail.com>
Commit: Alexei Podtelezhnikov <apodtele@gmail.com>

    * src/smooth/ftgrays.c (gray_render_conic) [SSE2]: Improve flow.
---
 src/smooth/ftgrays.c | 50 +++++++++++++++++++++++++-------------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/src/smooth/ftgrays.c b/src/smooth/ftgrays.c
index 0918272f8..fbf1541ef 100644
--- a/src/smooth/ftgrays.c
+++ b/src/smooth/ftgrays.c
@@ -1095,16 +1095,17 @@ typedef ptrdiff_t  FT_PtrDist;
       return;
     }
 
-    /* We can calculate the number of necessary bisections because  */
+    /* We can calculate the number of necessary segments because    */
     /* each bisection predictably reduces deviation exactly 4-fold. */
     /* Even 32-bit deviation would vanish after 16 bisections.      */
-    shift = 0;
+    shift = 16;
     do
     {
-      dx   >>= 2;
-      shift += 1;
+      dx >>= 2;
+      shift--;
 
     } while ( dx > ONE_PIXEL / 4 );
+    count = 0x10000U >> shift;
 
     /*
      * The (P0,P1,P2) arc equation, for t in [0,1] range:
@@ -1151,9 +1152,8 @@ typedef ptrdiff_t  FT_PtrDist;
      */
 
 #if FT_SSE2
-    /* Experience shows that for small shift values, */
-    /* SSE2 is actually slower.                      */
-    if ( shift > 2 )
+    /* Experience shows that for small counts, SSE2 is actually slower. */
+    if ( count > 4 )
     {
       union
       {
@@ -1169,9 +1169,7 @@ typedef ptrdiff_t  FT_PtrDist;
 
       } v;
 
-      __m128i  a, b;
-      __m128i  r, q, q2;
-      __m128i  p;
+      __m128i  p, q, r;
 
 
       u.i.ax = ax;
@@ -1179,14 +1177,13 @@ typedef ptrdiff_t  FT_PtrDist;
       u.i.bx = bx;
       u.i.by = by;
 
-      a = _mm_load_si128( &u.vec.a );
-      b = _mm_load_si128( &u.vec.b );
+      q = _mm_load_si128( &u.vec.b );
+      r = _mm_load_si128( &u.vec.a );
 
-      r  = _mm_slli_epi64( a, 33 - 2 * shift );
-      q  = _mm_slli_epi64( b, 33 - shift );
-      q2 = _mm_slli_epi64( a, 32 - 2 * shift );
-
-      q = _mm_add_epi64( q2, q );
+      q = _mm_slli_epi64( q, shift + 17);
+      r = _mm_slli_epi64( r, shift + shift );
+      q = _mm_add_epi64( q, r );
+      r = _mm_add_epi64( r, r );
 
       v.i.px_lo = 0;
       v.i.px_hi = p0.x;
@@ -1195,7 +1192,7 @@ typedef ptrdiff_t  FT_PtrDist;
 
       p = _mm_load_si128( &v.vec );
 
-      for ( count = 1U << shift; count > 0; count-- )
+      do
       {
         p = _mm_add_epi64( p, q );
         q = _mm_add_epi64( q, r );
@@ -1203,22 +1200,25 @@ typedef ptrdiff_t  FT_PtrDist;
         _mm_store_si128( &v.vec, p );
 
         gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi );
-      }
+      } while ( --count );
 
       return;
     }
 #endif  /* FT_SSE2 */
 
-    rx = LEFT_SHIFT( ax, 33 - 2 * shift );
-    ry = LEFT_SHIFT( ay, 33 - 2 * shift );
+    rx = LEFT_SHIFT( ax, shift + shift );
+    ry = LEFT_SHIFT( ay, shift + shift );
+
+    qx = LEFT_SHIFT( bx, shift + 17 ) + rx;
+    qy = LEFT_SHIFT( by, shift + 17 ) + ry;
 
-    qx = LEFT_SHIFT( bx, 33 - shift ) + LEFT_SHIFT( ax, 32 - 2 * shift );
-    qy = LEFT_SHIFT( by, 33 - shift ) + LEFT_SHIFT( ay, 32 - 2 * shift );
+    rx *= 2;
+    ry *= 2;
 
     px = LEFT_SHIFT( p0.x, 32 );
     py = LEFT_SHIFT( p0.y, 32 );
 
-    for ( count = 1U << shift; count > 0; count-- )
+    do
     {
       px += qx;
       py += qy;
@@ -1227,7 +1227,7 @@ typedef ptrdiff_t  FT_PtrDist;
 
       gray_render_line( RAS_VAR_ (FT_Pos)( px >> 32 ),
                                  (FT_Pos)( py >> 32 ) );
-    }
+    } while ( --count );
   }
 
 #else  /* !BEZIER_USE_DDA */



reply via email to

[Prev in Thread] Current Thread [Next in Thread]