 // - Henry de Valence <hdevalence@hdevalence.ca>
 // - Robrecht Blancquaert <Robrecht.Simon.Blancquaert@vub.be>

-//! More details on the algorithms can be found in the `avx2`
-//! module. Here comments are mostly added only when needed
-//! to explain differenes between the 'base' avx2 version and
+//! More details on the algorithms can be found in the `avx2`
+//! module. Here comments are mostly added only when needed
+//! to explain differences between the 'base' avx2 version and
 //! this re-implementation for arm neon.

 //! The most major difference is the split of one vector of 8
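
The split the doc comment describes is visible throughout the hunks below: every 256-bit AVX2 vector of eight 32-bit lanes becomes a pair of 128-bit NEON vectors. A minimal sketch of the resulting shape, assuming the packed_simd types used in this diff (the actual struct definition lies outside the hunks shown):

    use packed_simd::{u32x4, u32x8};

    // AVX2 backend: five vectors of eight 32-bit lanes hold the ten
    // radix-2^25.5 limbs of four field elements A, B, C, D.
    type Avx2Repr = [u32x8; 5];

    // NEON backend: each u32x8 splits into a (low, high) pair of u32x4
    // halves, accessed as `self.0[i].0` / `self.0[i].1` below.
    type NeonRepr = [(u32x4, u32x4); 5];
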
@@ -61,10 +61,10 @@ fn repack_pair(x: (u32x4, u32x4), y: (u32x4, u32x4)) -> (u32x4, u32x4) {
         use core::arch::aarch64::vgetq_lane_u32;

         (vcombine_u32(
-            vset_lane_u32(vgetq_lane_u32(x.0.into_bits(), 2), vget_low_u32(x.0.into_bits()), 1),
-            vset_lane_u32(vgetq_lane_u32(y.0.into_bits(), 2), vget_low_u32(y.0.into_bits()), 1)).into_bits(),
+            vset_lane_u32(vgetq_lane_u32(x.0.into_bits(), 2), vget_low_u32(x.0.into_bits()), 1),
+            vset_lane_u32(vgetq_lane_u32(y.0.into_bits(), 2), vget_low_u32(y.0.into_bits()), 1)).into_bits(),
         vcombine_u32(
-            vset_lane_u32(vgetq_lane_u32(x.1.into_bits(), 2), vget_low_u32(x.1.into_bits()), 1),
+            vset_lane_u32(vgetq_lane_u32(x.1.into_bits(), 2), vget_low_u32(x.1.into_bits()), 1),
             vset_lane_u32(vgetq_lane_u32(y.1.into_bits(), 2), vget_low_u32(y.1.into_bits()), 1)).into_bits())
     }
 }
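
The hunk above is easier to check against a scalar model: vget_low_u32 takes lanes (0, 1), vset_lane_u32 overwrites lane 1 with lane 2 of the full vector, and vcombine_u32 concatenates the two results. Under the assumption that the inputs carry data in their even lanes, repack_pair computes, per half:

    // Hypothetical scalar model of the lane movement in repack_pair:
    // gather lanes 0 and 2 of x, then lanes 0 and 2 of y.
    fn repack_model(x: [u32; 4], y: [u32; 4]) -> [u32; 4] {
        [x[0], x[2], y[0], y[2]]
    }
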
@@ -100,16 +100,16 @@ macro_rules! lane_shuffle {
         unsafe {
             use core::arch::aarch64::vgetq_lane_u32;
             const c: [i32; 8] = [$l0, $l1, $l2, $l3, $l4, $l5, $l6, $l7];
-            (u32x4::new(if c[0] < 4 { vgetq_lane_u32($x.0.into_bits(), c[0]) } else { vgetq_lane_u32($x.1.into_bits(), c[0] - 4) },
-                        if c[1] < 4 { vgetq_lane_u32($x.0.into_bits(), c[1]) } else { vgetq_lane_u32($x.1.into_bits(), c[1] - 4) },
-                        if c[2] < 4 { vgetq_lane_u32($x.0.into_bits(), c[2]) } else { vgetq_lane_u32($x.1.into_bits(), c[2] - 4) },
+            (u32x4::new(if c[0] < 4 { vgetq_lane_u32($x.0.into_bits(), c[0]) } else { vgetq_lane_u32($x.1.into_bits(), c[0] - 4) },
+                        if c[1] < 4 { vgetq_lane_u32($x.0.into_bits(), c[1]) } else { vgetq_lane_u32($x.1.into_bits(), c[1] - 4) },
+                        if c[2] < 4 { vgetq_lane_u32($x.0.into_bits(), c[2]) } else { vgetq_lane_u32($x.1.into_bits(), c[2] - 4) },
                         if c[3] < 4 { vgetq_lane_u32($x.0.into_bits(), c[3]) } else { vgetq_lane_u32($x.1.into_bits(), c[3] - 4) }),
-             u32x4::new(if c[4] < 4 { vgetq_lane_u32($x.0.into_bits(), c[4]) } else { vgetq_lane_u32($x.1.into_bits(), c[4] - 4) },
-                        if c[5] < 4 { vgetq_lane_u32($x.0.into_bits(), c[5]) } else { vgetq_lane_u32($x.1.into_bits(), c[5] - 4) },
-                        if c[6] < 4 { vgetq_lane_u32($x.0.into_bits(), c[6]) } else { vgetq_lane_u32($x.1.into_bits(), c[6] - 4) },
+             u32x4::new(if c[4] < 4 { vgetq_lane_u32($x.0.into_bits(), c[4]) } else { vgetq_lane_u32($x.1.into_bits(), c[4] - 4) },
+                        if c[5] < 4 { vgetq_lane_u32($x.0.into_bits(), c[5]) } else { vgetq_lane_u32($x.1.into_bits(), c[5] - 4) },
+                        if c[6] < 4 { vgetq_lane_u32($x.0.into_bits(), c[6]) } else { vgetq_lane_u32($x.1.into_bits(), c[6] - 4) },
                         if c[7] < 4 { vgetq_lane_u32($x.0.into_bits(), c[7]) } else { vgetq_lane_u32($x.1.into_bits(), c[7] - 4) }))
         }
-
+
     }
 }

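
lane_shuffle! emulates an eight-lane AVX2-style shuffle over the split representation: each constant index below 4 reads from the low half $x.0, and indices 4 through 7 read from the high half $x.1 at index - 4. A hedged usage sketch, assuming the argument order shown in the macro body above:

    let x: (u32x4, u32x4) = (u32x4::new(0, 1, 2, 3), u32x4::new(4, 5, 6, 7));
    // Swap the two halves: logical lane order (4,5,6,7,0,1,2,3).
    let y = lane_shuffle!(4, 5, 6, 7, 0, 1, 2, 3, x);
    assert_eq!(y, (u32x4::new(4, 5, 6, 7), u32x4::new(0, 1, 2, 3)));
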
@@ -161,14 +161,14 @@ impl FieldElement2625x4 {
     pub fn split(&self) -> [FieldElement51; 4] {
         let mut out = [FieldElement51::zero(); 4];
         for i in 0..5 {
-            let a_2i = self.0[i].0.extract(0) as u64;
-            let b_2i = self.0[i].0.extract(1) as u64;
-            let a_2i_1 = self.0[i].0.extract(2) as u64;
+            let a_2i = self.0[i].0.extract(0) as u64;
+            let b_2i = self.0[i].0.extract(1) as u64;
+            let a_2i_1 = self.0[i].0.extract(2) as u64;
             let b_2i_1 = self.0[i].0.extract(3) as u64;
             let c_2i = self.0[i].1.extract(0) as u64;
-            let d_2i = self.0[i].1.extract(1) as u64;
-            let c_2i_1 = self.0[i].1.extract(2) as u64;
-            let d_2i_1 = self.0[i].1.extract(3) as u64;
+            let d_2i = self.0[i].1.extract(1) as u64;
+            let c_2i_1 = self.0[i].1.extract(2) as u64;
+            let d_2i_1 = self.0[i].1.extract(3) as u64;

             out[0].0[i] = a_2i + (a_2i_1 << 26);
             out[1].0[i] = b_2i + (b_2i_1 << 26);
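
The extracts above document the lane layout: vector i packs the limbs of four field elements A, B, C, D as (a_2i, b_2i, a_2i+1, b_2i+1) in the first half and (c_2i, d_2i, c_2i+1, d_2i+1) in the second. split() then recombines an even limb (at most 26 bits) with the following odd limb into one limb of FieldElement51:

    // Scalar model of the recombination in split(): even limb in the
    // low 26 bits, odd limb shifted in above it.
    fn combine(even: u64, odd: u64) -> u64 {
        even + (odd << 26)
    }
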
@@ -212,33 +212,28 @@ impl FieldElement2625x4 {
     #[inline(always)]
     fn blend_lanes(x: (u32x4, u32x4), y: (u32x4, u32x4), control: Lanes) -> (u32x4, u32x4) {
         unsafe {
-            use core::arch::aarch64::vqtbx1q_u8;
+            use packed_simd::shuffle;
             match control {
                 Lanes::C => {
-                    (x.0,
-                     vqtbx1q_u8(x.1.into_bits(), y.1.into_bits(), u8x16::new( 0, 1, 2, 3, 16, 16, 16, 16, 8, 9, 10, 11, 16, 16, 16, 16).into_bits()).into_bits())
+                    (x.0, shuffle!(y.1, x.1, [0, 5, 2, 7]))
                 }
                 Lanes::D => {
-                    (x.0,
-                     vqtbx1q_u8(x.1.into_bits(), y.1.into_bits(), u8x16::new(16, 16, 16, 16, 4, 5, 6, 7, 16, 16, 16, 16, 12, 13, 14, 15).into_bits()).into_bits())
+                    (x.0, shuffle!(y.1, x.1, [4, 1, 6, 3]))
                 }
                 Lanes::AD => {
-                    (vqtbx1q_u8(x.0.into_bits(), y.0.into_bits(), u8x16::new( 0, 1, 2, 3, 16, 16, 16, 16, 8, 9, 10, 11, 16, 16, 16, 16).into_bits()).into_bits(),
-                     vqtbx1q_u8(x.1.into_bits(), y.1.into_bits(), u8x16::new(16, 16, 16, 16, 4, 5, 6, 7, 16, 16, 16, 16, 12, 13, 14, 15).into_bits()).into_bits())
+                    (shuffle!(y.0, x.0, [0, 5, 2, 7]), shuffle!(y.1, x.1, [4, 1, 6, 3]))
                 }
                 Lanes::AB => {
                     (y.0, x.1)
                 }
                 Lanes::AC => {
-                    (vqtbx1q_u8(x.0.into_bits(), y.0.into_bits(), u8x16::new( 0, 1, 2, 3, 16, 16, 16, 16, 8, 9, 10, 11, 16, 16, 16, 16).into_bits()).into_bits(),
-                     vqtbx1q_u8(x.1.into_bits(), y.1.into_bits(), u8x16::new( 0, 1, 2, 3, 16, 16, 16, 16, 8, 9, 10, 11, 16, 16, 16, 16).into_bits()).into_bits())
+                    (shuffle!(y.0, x.0, [0, 5, 2, 7]), shuffle!(y.1, x.1, [0, 5, 2, 7]))
                 }
                 Lanes::CD => {
-                    (x.0, y.1)
+                    (x.0, y.1)
                 }
                 Lanes::BC => {
-                    (vqtbx1q_u8(x.0.into_bits(), y.0.into_bits(), u8x16::new(16, 16, 16, 16, 4, 5, 6, 7, 16, 16, 16, 16, 12, 13, 14, 15).into_bits()).into_bits(),
-                     vqtbx1q_u8(x.1.into_bits(), y.1.into_bits(), u8x16::new( 0, 1, 2, 3, 16, 16, 16, 16, 8, 9, 10, 11, 16, 16, 16, 16).into_bits()).into_bits())
+                    (shuffle!(y.0, x.0, [4, 1, 6, 3]), shuffle!(y.1, x.1, [0, 5, 2, 7]))
                 }
                 Lanes::ABCD => {
                     y
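
This is the one substantive hunk: the byte-table lookups (vqtbx1q_u8, where index 16 falls outside the table and keeps the destination byte) are replaced by packed_simd's shuffle! macro, which selects 32-bit lanes from the concatenation of its two arguments: indices 0 through 3 pick from the first argument, 4 through 7 from the second. A small sketch of the convention, assuming the packed_simd crate:

    use packed_simd::{shuffle, u32x4};

    let x = u32x4::new(10, 11, 12, 13); // current lanes
    let y = u32x4::new(20, 21, 22, 23); // incoming lanes
    // [0, 5, 2, 7]: even lanes taken from y, odd lanes kept from x,
    // which is exactly the Lanes::C blend above.
    let blended: u32x4 = shuffle!(y, x, [0, 5, 2, 7]);
    assert_eq!(blended, u32x4::new(20, 11, 22, 13));
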
@@ -333,7 +328,7 @@ impl FieldElement2625x4 {
             use core::arch::aarch64::vget_high_u32;
             use core::arch::aarch64::vcombine_u32;

-            let c: (u32x4, u32x4) = (vqshlq_u32(v.0.into_bits(), shifts.0.into_bits()).into_bits(),
+            let c: (u32x4, u32x4) = (vqshlq_u32(v.0.into_bits(), shifts.0.into_bits()).into_bits(),
                                      vqshlq_u32(v.1.into_bits(), shifts.1.into_bits()).into_bits());
             (vcombine_u32(vget_high_u32(c.0.into_bits()), vget_low_u32(c.0.into_bits())).into_bits(),
              vcombine_u32(vget_high_u32(c.1.into_bits()), vget_low_u32(c.1.into_bits())).into_bits())
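
A reading of the hunk above (mine, not the diff's): vqshlq_u32 shifts each lane left by a per-lane signed amount taken from shifts, and the vcombine_u32/vget_high_u32/vget_low_u32 sequence then swaps the 64-bit halves of each result, rotating its four lanes by two positions. The swap in isolation:

    // Minimal aarch64 sketch: rotate the four u32 lanes of v by two
    // positions by swapping its low and high 64-bit halves.
    #[cfg(target_arch = "aarch64")]
    unsafe fn swap_halves(
        v: core::arch::aarch64::uint32x4_t,
    ) -> core::arch::aarch64::uint32x4_t {
        use core::arch::aarch64::{vcombine_u32, vget_high_u32, vget_low_u32};
        vcombine_u32(vget_high_u32(v), vget_low_u32(v))
    }
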
@@ -377,7 +372,7 @@ impl FieldElement2625x4 {
             use core::arch::aarch64::vmulq_n_u32;
             use core::arch::aarch64::vget_low_u32;
             use core::arch::aarch64::vcombine_u32;
-
+
             let c9_19_spread: (u32x4, u32x4) = (vmulq_n_u32(c98.0.into_bits(), 19).into_bits(), vmulq_n_u32(c98.1.into_bits(), 19).into_bits());

             (vcombine_u32(vget_low_u32(c9_19_spread.0.into_bits()), u32x2::splat(0).into_bits()).into_bits(),
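
The constant 19 here and in the next hunk comes from the field prime: with p = 2^255 - 19, we have 2^255 ≡ 19 (mod p), so a carry c out of the top limb folds back into the bottom as 19·c. vmulq_n_u32 multiplies every lane by that scalar in one instruction; a scalar model:

    // Scalar model of the carry fold for p = 2^255 - 19:
    // c * 2^255 ≡ c * 19 (mod p).
    fn fold_carry(c: u32) -> u64 {
        (c as u64) * 19
    }
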
@@ -423,9 +418,9 @@ impl FieldElement2625x4 {
         unsafe {
             use core::arch::aarch64::vmulq_n_u32;

-            c0 = (vmulq_n_u32(c0.0.into_bits(), 19).into_bits(),
+            c0 = (vmulq_n_u32(c0.0.into_bits(), 19).into_bits(),
                   vmulq_n_u32(c0.1.into_bits(), 19).into_bits());
-            c1 = (vmulq_n_u32(c1.0.into_bits(), 19).into_bits(),
+            c1 = (vmulq_n_u32(c1.0.into_bits(), 19).into_bits(),
                   vmulq_n_u32(c1.1.into_bits(), 19).into_bits());
         }

@@ -457,8 +452,8 @@ impl FieldElement2625x4 {
     #[inline(always)]
     fn m_lo(x: (u32x2, u32x2), y: (u32x2, u32x2)) -> (u32x2, u32x2) {
         use core::arch::aarch64::vmull_u32;
-        unsafe {
-            let x: (u32x4, u32x4) = (vmull_u32(x.0.into_bits(), y.0.into_bits()).into_bits(),
+        unsafe {
+            let x: (u32x4, u32x4) = (vmull_u32(x.0.into_bits(), y.0.into_bits()).into_bits(),
                                      vmull_u32(x.1.into_bits(), y.1.into_bits()).into_bits());
             (u32x2::new(x.0.extract(0), x.0.extract(2)), u32x2::new(x.1.extract(0), x.1.extract(2)))
         }
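
For the hunk above: vmull_u32 is NEON's widening multiply, u32x2 × u32x2 → u64x2. Reinterpreting the two 64-bit products as four u32 lanes and extracting lanes 0 and 2 keeps the low 32 bits of each product, since aarch64 is little-endian. Scalar equivalent:

    // Scalar model of m_lo: the low 32 bits of the widening product.
    fn m_lo_scalar(x: u32, y: u32) -> u32 {
        ((x as u64) * (y as u64)) as u32
    }
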
@@ -497,7 +492,7 @@ impl FieldElement2625x4 {
         let mut z7 = m(x0_2,x7) + m(x1_2,x6) + m(x2_2,x5) + m(x3_2,x4) + ((m(x8,x9_19)) << 1);
         let mut z8 = m(x0_2,x8) + m(x1_2,x7_2) + m(x2_2,x6) + m(x3_2,x5_2) + m(x4,x4) + ((m(x9,x9_19)) << 1);
         let mut z9 = m(x0_2,x9) + m(x1_2,x8) + m(x2_2,x7) + m(x3_2,x6) + m(x4_2,x5);
-
+

         let low__p37 = u64x4::splat(0x3ffffed << 37);
         let even_p37 = u64x4::splat(0x3ffffff << 37);
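
On the splatted constants above (my reading): 0x3ffffed and 0x3ffffff are the low and generic even limbs of p = 2^255 - 19 in radix 2^25.5, and the << 37 scales them so that a large multiple of p can be added before subtraction without any lane underflowing; the odd-limb constant presumably follows outside the hunk shown. The limb values check out directly:

    // Low limb of p = 2^255 - 19 in radix 2^25.5:
    assert_eq!(0x3ffffedu32, (1u32 << 26) - 19);
    // Generic even limb (a full 26-bit limb of all ones):
    assert_eq!(0x3ffffffu32, (1u32 << 26) - 1);
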
@@ -609,8 +604,8 @@ impl<'a, 'b> Mul<&'b FieldElement2625x4> for &'a FieldElement2625x4 {
     #[inline(always)]
     fn m_lo(x: (u32x2, u32x2), y: (u32x2, u32x2)) -> (u32x2, u32x2) {
         use core::arch::aarch64::vmull_u32;
-        unsafe {
-            let x: (u32x4, u32x4) = (vmull_u32(x.0.into_bits(), y.0.into_bits()).into_bits(),
+        unsafe {
+            let x: (u32x4, u32x4) = (vmull_u32(x.0.into_bits(), y.0.into_bits()).into_bits(),
                                      vmull_u32(x.1.into_bits(), y.1.into_bits()).into_bits());
             (u32x2::new(x.0.extract(0), x.0.extract(2)), u32x2::new(x.1.extract(0), x.1.extract(2)))
         }