... |
... |
@@ -1095,16 +1095,17 @@ typedef ptrdiff_t FT_PtrDist; |
1095
|
1095
|
return;
|
1096
|
1096
|
}
|
1097
|
1097
|
|
1098
|
|
- /* We can calculate the number of necessary bisections because */
|
|
1098
|
+ /* We can calculate the number of necessary segments because */
|
1099
|
1099
|
/* each bisection predictably reduces deviation exactly 4-fold. */
|
1100
|
1100
|
/* Even 32-bit deviation would vanish after 16 bisections. */
|
1101
|
|
- shift = 0;
|
|
1101
|
+ shift = 16;
|
1102
|
1102
|
do
|
1103
|
1103
|
{
|
1104
|
|
- dx >>= 2;
|
1105
|
|
- shift += 1;
|
|
1104
|
+ dx >>= 2;
|
|
1105
|
+ shift--;
|
1106
|
1106
|
|
1107
|
1107
|
} while ( dx > ONE_PIXEL / 4 );
|
|
1108
|
+ count = 0x10000U >> shift;
|
1108
|
1109
|
|
1109
|
1110
|
/*
|
1110
|
1111
|
* The (P0,P1,P2) arc equation, for t in [0,1] range:
|
... |
... |
@@ -1151,9 +1152,8 @@ typedef ptrdiff_t FT_PtrDist; |
1151
|
1152
|
*/
|
1152
|
1153
|
|
1153
|
1154
|
#if FT_SSE2
|
1154
|
|
- /* Experience shows that for small shift values, */
|
1155
|
|
- /* SSE2 is actually slower. */
|
1156
|
|
- if ( shift > 2 )
|
|
1155
|
+ /* Experience shows that for small counts, SSE2 is actually slower. */
|
|
1156
|
+ if ( count > 4 )
|
1157
|
1157
|
{
|
1158
|
1158
|
union
|
1159
|
1159
|
{
|
... |
... |
@@ -1169,9 +1169,7 @@ typedef ptrdiff_t FT_PtrDist; |
1169
|
1169
|
|
1170
|
1170
|
} v;
|
1171
|
1171
|
|
1172
|
|
- __m128i a, b;
|
1173
|
|
- __m128i r, q, q2;
|
1174
|
|
- __m128i p;
|
|
1172
|
+ __m128i p, q, r;
|
1175
|
1173
|
|
1176
|
1174
|
|
1177
|
1175
|
u.i.ax = ax;
|
... |
... |
@@ -1179,14 +1177,13 @@ typedef ptrdiff_t FT_PtrDist; |
1179
|
1177
|
u.i.bx = bx;
|
1180
|
1178
|
u.i.by = by;
|
1181
|
1179
|
|
1182
|
|
- a = _mm_load_si128( &u.vec.a );
|
1183
|
|
- b = _mm_load_si128( &u.vec.b );
|
|
1180
|
+ q = _mm_load_si128( &u.vec.b );
|
|
1181
|
+ r = _mm_load_si128( &u.vec.a );
|
1184
|
1182
|
|
1185
|
|
- r = _mm_slli_epi64( a, 33 - 2 * shift );
|
1186
|
|
- q = _mm_slli_epi64( b, 33 - shift );
|
1187
|
|
- q2 = _mm_slli_epi64( a, 32 - 2 * shift );
|
1188
|
|
-
|
1189
|
|
- q = _mm_add_epi64( q2, q );
|
|
1183
|
+ q = _mm_slli_epi64( q, shift + 17);
|
|
1184
|
+ r = _mm_slli_epi64( r, shift + shift );
|
|
1185
|
+ q = _mm_add_epi64( q, r );
|
|
1186
|
+ r = _mm_add_epi64( r, r );
|
1190
|
1187
|
|
1191
|
1188
|
v.i.px_lo = 0;
|
1192
|
1189
|
v.i.px_hi = p0.x;
|
... |
... |
@@ -1195,7 +1192,7 @@ typedef ptrdiff_t FT_PtrDist; |
1195
|
1192
|
|
1196
|
1193
|
p = _mm_load_si128( &v.vec );
|
1197
|
1194
|
|
1198
|
|
- for ( count = 1U << shift; count > 0; count-- )
|
|
1195
|
+ do
|
1199
|
1196
|
{
|
1200
|
1197
|
p = _mm_add_epi64( p, q );
|
1201
|
1198
|
q = _mm_add_epi64( q, r );
|
... |
... |
@@ -1203,22 +1200,25 @@ typedef ptrdiff_t FT_PtrDist; |
1203
|
1200
|
_mm_store_si128( &v.vec, p );
|
1204
|
1201
|
|
1205
|
1202
|
gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi );
|
1206
|
|
- }
|
|
1203
|
+ } while ( --count );
|
1207
|
1204
|
|
1208
|
1205
|
return;
|
1209
|
1206
|
}
|
1210
|
1207
|
#endif /* FT_SSE2 */
|
1211
|
1208
|
|
1212
|
|
- rx = LEFT_SHIFT( ax, 33 - 2 * shift );
|
1213
|
|
- ry = LEFT_SHIFT( ay, 33 - 2 * shift );
|
|
1209
|
+ rx = LEFT_SHIFT( ax, shift + shift );
|
|
1210
|
+ ry = LEFT_SHIFT( ay, shift + shift );
|
|
1211
|
+
|
|
1212
|
+ qx = LEFT_SHIFT( bx, shift + 17 ) + rx;
|
|
1213
|
+ qy = LEFT_SHIFT( by, shift + 17 ) + ry;
|
1214
|
1214
|
|
1215
|
|
- qx = LEFT_SHIFT( bx, 33 - shift ) + LEFT_SHIFT( ax, 32 - 2 * shift );
|
1216
|
|
- qy = LEFT_SHIFT( by, 33 - shift ) + LEFT_SHIFT( ay, 32 - 2 * shift );
|
|
1215
|
+ rx *= 2;
|
|
1216
|
+ ry *= 2;
|
1217
|
1217
|
|
1218
|
1218
|
px = LEFT_SHIFT( p0.x, 32 );
|
1219
|
1219
|
py = LEFT_SHIFT( p0.y, 32 );
|
1220
|
1220
|
|
1221
|
|
- for ( count = 1U << shift; count > 0; count-- )
|
|
1221
|
+ do
|
1222
|
1222
|
{
|
1223
|
1223
|
px += qx;
|
1224
|
1224
|
py += qy;
|
... |
... |
@@ -1227,7 +1227,7 @@ typedef ptrdiff_t FT_PtrDist; |
1227
|
1227
|
|
1228
|
1228
|
gray_render_line( RAS_VAR_ (FT_Pos)( px >> 32 ),
|
1229
|
1229
|
(FT_Pos)( py >> 32 ) );
|
1230
|
|
- }
|
|
1230
|
+ } while ( --count );
|
1231
|
1231
|
}
|
1232
|
1232
|
|
1233
|
1233
|
#else /* !BEZIER_USE_DDA */
|