freetype-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Git][freetype/freetype][master] * src/smooth/ftgrays.c (gray_render_conic) [SSE2]: Improve flow.


From: Alexei Podtelezhnikov (@apodtele)
Subject: [Git][freetype/freetype][master] * src/smooth/ftgrays.c (gray_render_conic) [SSE2]: Improve flow.
Date: Fri, 29 Sep 2023 02:58:40 +0000

Alexei Podtelezhnikov pushed to branch master at FreeType / FreeType

Commits:

  • 7b308a29
    by Alexei Podtelezhnikov at 2023-09-28T22:56:15-04:00
    * src/smooth/ftgrays.c (gray_render_conic) [SSE2]: Improve flow.
    

1 changed file:

Changes:

  • src/smooth/ftgrays.c
    ... ... @@ -1095,16 +1095,17 @@ typedef ptrdiff_t FT_PtrDist;
    1095 1095
           return;
    
    1096 1096
         }
    
    1097 1097
     
    
    1098
    -    /* We can calculate the number of necessary bisections because  */
    
    1098
    +    /* We can calculate the number of necessary segments because    */
    
    1099 1099
         /* each bisection predictably reduces deviation exactly 4-fold. */
    
    1100 1100
         /* Even 32-bit deviation would vanish after 16 bisections.      */
    
    1101
    -    shift = 0;
    
    1101
    +    shift = 16;
    
    1102 1102
         do
    
    1103 1103
         {
    
    1104
    -      dx   >>= 2;
    
    1105
    -      shift += 1;
    
    1104
    +      dx >>= 2;
    
    1105
    +      shift--;
    
    1106 1106
     
    
    1107 1107
         } while ( dx > ONE_PIXEL / 4 );
    
    1108
    +    count = 0x10000U >> shift;
    
    1108 1109
     
    
    1109 1110
         /*
    
    1110 1111
          * The (P0,P1,P2) arc equation, for t in [0,1] range:
    
    ... ... @@ -1151,9 +1152,8 @@ typedef ptrdiff_t FT_PtrDist;
    1151 1152
          */
    
    1152 1153
     
    
    1153 1154
     #if FT_SSE2
    
    1154
    -    /* Experience shows that for small shift values, */
    
    1155
    -    /* SSE2 is actually slower.                      */
    
    1156
    -    if ( shift > 2 )
    
    1155
    +    /* Experience shows that for small counts, SSE2 is actually slower. */
    
    1156
    +    if ( count > 4 )
    
    1157 1157
         {
    
    1158 1158
           union
    
    1159 1159
           {
    
    ... ... @@ -1169,9 +1169,7 @@ typedef ptrdiff_t FT_PtrDist;
    1169 1169
     
    
    1170 1170
           } v;
    
    1171 1171
     
    
    1172
    -      __m128i  a, b;
    
    1173
    -      __m128i  r, q, q2;
    
    1174
    -      __m128i  p;
    
    1172
    +      __m128i  p, q, r;
    
    1175 1173
     
    
    1176 1174
     
    
    1177 1175
           u.i.ax = ax;
    
    ... ... @@ -1179,14 +1177,13 @@ typedef ptrdiff_t FT_PtrDist;
    1179 1177
           u.i.bx = bx;
    
    1180 1178
           u.i.by = by;
    
    1181 1179
     
    
    1182
    -      a = _mm_load_si128( &u.vec.a );
    
    1183
    -      b = _mm_load_si128( &u.vec.b );
    
    1180
    +      q = _mm_load_si128( &u.vec.b );
    
    1181
    +      r = _mm_load_si128( &u.vec.a );
    
    1184 1182
     
    
    1185
    -      r  = _mm_slli_epi64( a, 33 - 2 * shift );
    
    1186
    -      q  = _mm_slli_epi64( b, 33 - shift );
    
    1187
    -      q2 = _mm_slli_epi64( a, 32 - 2 * shift );
    
    1188
    -
    
    1189
    -      q = _mm_add_epi64( q2, q );
    
    1183
    +      q = _mm_slli_epi64( q, shift + 17);
    
    1184
    +      r = _mm_slli_epi64( r, shift + shift );
    
    1185
    +      q = _mm_add_epi64( q, r );
    
    1186
    +      r = _mm_add_epi64( r, r );
    
    1190 1187
     
    
    1191 1188
           v.i.px_lo = 0;
    
    1192 1189
           v.i.px_hi = p0.x;
    
    ... ... @@ -1195,7 +1192,7 @@ typedef ptrdiff_t FT_PtrDist;
    1195 1192
     
    
    1196 1193
           p = _mm_load_si128( &v.vec );
    
    1197 1194
     
    
    1198
    -      for ( count = 1U << shift; count > 0; count-- )
    
    1195
    +      do
    
    1199 1196
           {
    
    1200 1197
             p = _mm_add_epi64( p, q );
    
    1201 1198
             q = _mm_add_epi64( q, r );
    
    ... ... @@ -1203,22 +1200,25 @@ typedef ptrdiff_t FT_PtrDist;
    1203 1200
             _mm_store_si128( &v.vec, p );
    
    1204 1201
     
    
    1205 1202
             gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi );
    
    1206
    -      }
    
    1203
    +      } while ( --count );
    
    1207 1204
     
    
    1208 1205
           return;
    
    1209 1206
         }
    
    1210 1207
     #endif  /* FT_SSE2 */
    
    1211 1208
     
    
    1212
    -    rx = LEFT_SHIFT( ax, 33 - 2 * shift );
    
    1213
    -    ry = LEFT_SHIFT( ay, 33 - 2 * shift );
    
    1209
    +    rx = LEFT_SHIFT( ax, shift + shift );
    
    1210
    +    ry = LEFT_SHIFT( ay, shift + shift );
    
    1211
    +
    
    1212
    +    qx = LEFT_SHIFT( bx, shift + 17 ) + rx;
    
    1213
    +    qy = LEFT_SHIFT( by, shift + 17 ) + ry;
    
    1214 1214
     
    
    1215
    -    qx = LEFT_SHIFT( bx, 33 - shift ) + LEFT_SHIFT( ax, 32 - 2 * shift );
    
    1216
    -    qy = LEFT_SHIFT( by, 33 - shift ) + LEFT_SHIFT( ay, 32 - 2 * shift );
    
    1215
    +    rx *= 2;
    
    1216
    +    ry *= 2;
    
    1217 1217
     
    
    1218 1218
         px = LEFT_SHIFT( p0.x, 32 );
    
    1219 1219
         py = LEFT_SHIFT( p0.y, 32 );
    
    1220 1220
     
    
    1221
    -    for ( count = 1U << shift; count > 0; count-- )
    
    1221
    +    do
    
    1222 1222
         {
    
    1223 1223
           px += qx;
    
    1224 1224
           py += qy;
    
    ... ... @@ -1227,7 +1227,7 @@ typedef ptrdiff_t FT_PtrDist;
    1227 1227
     
    
    1228 1228
           gray_render_line( RAS_VAR_ (FT_Pos)( px >> 32 ),
    
    1229 1229
                                      (FT_Pos)( py >> 32 ) );
    
    1230
    -    }
    
    1230
    +    } while ( --count );
    
    1231 1231
       }
    
    1232 1232
     
    
    1233 1233
     #else  /* !BEZIER_USE_DDA */
    


  • reply via email to

    [Prev in Thread] Current Thread [Next in Thread]