@@ -317,20 +317,19 @@ static void SDL_TARGETING("sse") calculate_distance_attenuation_and_angle_sse(co
317
317
#if defined(SDL_NEON_INTRINSICS )
318
318
/*
 * Permute the lanes of v from {x, y, z, w} to {y, z, x, w}.
 *
 * Built only from intrinsics that exist on both ARMv7 NEON and AArch64
 * (vcopyq_laneq_f32 is AArch64-only, so it is deliberately avoided).
 */
static float32x4_t rotate_yzx_neon(const float32x4_t v)
{
    /* Rotate left by one lane: {x, y, z, w} -> {y, z, w, x}. */
    const float32x4_t yzwx = vextq_f32(v, v, 1);
    /* Keep the low pair {y, z}; swap the high pair {w, x} -> {x, w}. */
    return vcombine_f32(vget_low_f32(yzwx), vrev64_f32(vget_high_f32(yzwx)));
}

/*
 * Cross product of the x/y/z lanes (lanes 0..2) of a and b.
 *
 * Returns a vector whose lanes 0..2 hold the cross product and whose
 * lane 3 is always 0.0f. The incoming lane 3 of a and b does not
 * influence lanes 0..2 of the result.
 */
static float32x4_t xyzzy_neon(const float32x4_t a, const float32x4_t b)
{
    const float32x4_t a_yzx = rotate_yzx_neon(a);
    const float32x4_t b_yzx = rotate_yzx_neon(b);

    /* c = { ax*by - bx*ay, ay*bz - by*az, az*bx - bz*ax, aw*bw - bw*aw },
       i.e. the cross product stored in {z, x, y} order. */
    const float32x4_t c = vsubq_f32(vmulq_f32(a, b_yzx), vmulq_f32(b, a_yzx));

    /* One more rotation puts the components back in {x, y, z} order. */
    const float32x4_t r = rotate_yzx_neon(c);

    /* Force w to zero: aw*bw - bw*aw is NaN when w is inf/NaN, and
       reductions over all four lanes (such as dotproduct_neon) would
       otherwise pick that up. */
    return vsetq_lane_f32(0.0f, r, 3);
}
327
326
328
327
/*
 * Four-lane dot product: multiplies a and b lane by lane and returns the
 * sum of all four products (lane 3 is included in the sum).
 */
static float dotproduct_neon(const float32x4_t a, const float32x4_t b)
{
    const float32x4_t products = vmulq_f32(a, b);

    /* Add each lane to its neighbour inside the same 64-bit half:
       { p0+p1, p1+p0, p2+p3, p3+p2 }. */
    const float32x4_t pair_sums = vaddq_f32(products, vrev64q_f32(products));

    /* Rotating by two lanes swaps the halves, so after this add every
       lane carries the full four-element total. */
    const float32x4_t halves_swapped = vextq_f32(pair_sums, pair_sums, 2);
    const float32x4_t totals = vaddq_f32(pair_sums, halves_swapped);

    return vgetq_lane_f32(totals, 3);
}
335
334
336
335
static float magnitude_neon (const float32x4_t v )
0 commit comments