Skip to content

Commit 0d0fe30

Browse files
committed
simd
1 parent 23962ec commit 0d0fe30

File tree

4 files changed

+193
-11
lines changed

4 files changed

+193
-11
lines changed

Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ exclude = ["/.github/*", "*.ipynb", "./scripts/*", "examples/*", "tests/*", "dat
1515
[dependencies]
1616
image = "0.25.6"
1717
nalgebra = "0.33.2"
18+
pulp = { version = "0.21.5", features = ["macro"] }
1819
rayon = "1.10.0"
1920
serde = { version = "1.0.219", features = ["derive"] }
2021
serde_json = "1.0.140"
@@ -25,6 +26,9 @@ name = "remap"
2526
[[example]]
2627
name = "stereo_rectify"
2728

29+
[[example]]
30+
name = "project_points"
31+
2832
[[bench]]
2933
name = "bench"
3034
harness = false

benches/bench.rs

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
1+
use std::arch::aarch64::*;
2+
13
use camera_intrinsic_model::{compute_for_fast_remap, fast_remap, model_from_json, remap};
24
use diol::prelude::*;
35
use image::{DynamicImage, ImageReader};
6+
use nalgebra::{self as na};
7+
use rayon::{
8+
iter::{IndexedParallelIterator, ParallelIterator},
9+
slice::{ParallelSlice, ParallelSliceMut},
10+
};
411

512
fn main() -> eyre::Result<()> {
613
let bench = Bench::from_args()?;
@@ -14,10 +21,95 @@ fn main() -> eyre::Result<()> {
1421
],
1522
[None],
1623
);
24+
bench.register_many(
25+
"project points",
26+
list![
27+
bench_eucm_project_points.with_name("eucm"),
28+
bench_eucm_simd.with_name("eucm simd"),
29+
],
30+
[None],
31+
);
1732
bench.run()?;
1833
Ok(())
1934
}
2035

36+
#[inline(always)]
37+
fn bench_eucm_simd_impl(
38+
params: &na::DVector<f32>,
39+
p3ds: &[na::Vector3<f32>],
40+
) -> Vec<na::Vector2<f32>> {
41+
let fx = params[0];
42+
let fy = params[1];
43+
let cx = params[2];
44+
let cy = params[3];
45+
let alpha = params[4];
46+
let beta = params[5];
47+
48+
let fx = unsafe { vdupq_n_f32(fx) };
49+
let fy = unsafe { vdupq_n_f32(fy) };
50+
let cx = unsafe { vdupq_n_f32(cx) };
51+
let cy = unsafe { vdupq_n_f32(cy) };
52+
let alpha = unsafe { vdupq_n_f32(alpha) };
53+
let beta = unsafe { vdupq_n_f32(beta) };
54+
let one = unsafe { vdupq_n_f32(1.0) };
55+
let one_alpha = unsafe { vsubq_f32(one, alpha) };
56+
57+
let mut p2ds = vec![na::Vector2::<f32>::zeros(); p3ds.len()];
58+
let chunk_size = 4;
59+
p3ds.par_chunks(chunk_size)
60+
.zip(p2ds.par_chunks_mut(chunk_size))
61+
.for_each(|(p3dss, p2dss)| {
62+
let x = [p3dss[0].x, p3dss[1].x, p3dss[2].x, p3dss[3].x];
63+
let y = [p3dss[0].y, p3dss[1].y, p3dss[2].y, p3dss[3].y];
64+
let z = [p3dss[0].z, p3dss[1].z, p3dss[2].z, p3dss[3].z];
65+
let mut outx = [0.0f32; 4];
66+
let mut outy = [0.0f32; 4];
67+
unsafe {
68+
let x = vld1q_f32(x.as_ptr());
69+
let y = vld1q_f32(y.as_ptr());
70+
let z = vld1q_f32(z.as_ptr());
71+
72+
let xx = vmulq_f32(x, x);
73+
let yy = vmulq_f32(y, y);
74+
75+
let rho2 = vfmaq_f32(vmulq_f32(z, z), beta, vaddq_f32(xx, yy));
76+
let rho = vsqrtq_f32(rho2);
77+
78+
let ar = vmulq_f32(alpha, rho);
79+
let norm = vfmaq_f32(ar, one_alpha, z);
80+
let mx = vdivq_f32(x, norm);
81+
let my = vdivq_f32(y, norm);
82+
83+
let px = vfmaq_f32(cx, fx, mx);
84+
let py = vfmaq_f32(cy, fy, my);
85+
vst1q_f32(outx.as_mut_ptr(), px);
86+
vst1q_f32(outy.as_mut_ptr(), py);
87+
for i in 0..4 {
88+
p2dss[i].x = outx[i];
89+
p2dss[i].y = outy[i];
90+
}
91+
}
92+
});
93+
p2ds
94+
}
95+
fn bench_eucm_simd(bencher: Bencher, _dummy: Option<bool>) {
96+
let model1 = model_from_json("data/eucm0.json");
97+
let params = model1.params().cast::<f32>();
98+
let p3ds = vec![na::Vector3::new(0.5, 0.5, 2.0); 3856 * 176];
99+
bencher.bench(|| {
100+
let _p2ds = bench_eucm_simd_impl(&params, &p3ds);
101+
// std::hint::black_box(p2ds);
102+
});
103+
}
104+
105+
fn bench_eucm_project_points(bencher: Bencher, _dummy: Option<bool>) {
106+
let model1 = model_from_json("data/eucm0.json").cast::<f32>();
107+
let p3ds = vec![na::Vector3::new(0.5, 0.5, 2.0); 3856 * 176];
108+
bencher.bench(|| {
109+
let _p2ds = model1.project(&p3ds);
110+
});
111+
}
112+
21113
fn bench_remap(bencher: Bencher, _dummy: Option<bool>) {
22114
let model1 = model_from_json("data/eucm0.json");
23115
let new_w_h = 1024;

examples/project_points.rs

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
use camera_intrinsic_model::model_from_json;
2+
use nalgebra as na;
3+
use rand::random;
4+
use rayon::prelude::*;
5+
use std::arch::aarch64::*;
6+
7+
#[inline(always)]
8+
fn bench_eucm_simd_impl(
9+
params: &na::DVector<f32>,
10+
p3ds: &[na::Vector3<f32>],
11+
) -> Vec<na::Vector2<f32>> {
12+
let fx = params[0];
13+
let fy = params[1];
14+
let cx = params[2];
15+
let cy = params[3];
16+
let alpha = params[4];
17+
let beta = params[5];
18+
19+
let fx = unsafe { vdupq_n_f32(fx) };
20+
let fy = unsafe { vdupq_n_f32(fy) };
21+
let cx = unsafe { vdupq_n_f32(cx) };
22+
let cy = unsafe { vdupq_n_f32(cy) };
23+
let alpha = unsafe { vdupq_n_f32(alpha) };
24+
let beta = unsafe { vdupq_n_f32(beta) };
25+
let one = unsafe { vdupq_n_f32(1.0) };
26+
let one_alpha = unsafe { vsubq_f32(one, alpha) };
27+
28+
let mut p2ds = vec![na::Vector2::<f32>::zeros(); p3ds.len()];
29+
let chunk_size = 4;
30+
p3ds.par_chunks(chunk_size)
31+
.zip(p2ds.par_chunks_mut(chunk_size))
32+
.for_each(|(p3dss, p2dss)| {
33+
let x = [p3dss[0].x, p3dss[1].x, p3dss[2].x, p3dss[3].x];
34+
let y = [p3dss[0].y, p3dss[1].y, p3dss[2].y, p3dss[3].y];
35+
let z = [p3dss[0].z, p3dss[1].z, p3dss[2].z, p3dss[3].z];
36+
let mut outx = [0.0f32; 4];
37+
let mut outy = [0.0f32; 4];
38+
unsafe {
39+
let x = vld1q_f32(x.as_ptr());
40+
let y = vld1q_f32(y.as_ptr());
41+
let z = vld1q_f32(z.as_ptr());
42+
43+
let xx = vmulq_f32(x, x);
44+
let yy = vmulq_f32(y, y);
45+
46+
let rho2 = vfmaq_f32(vmulq_f32(z, z), beta, vaddq_f32(xx, yy));
47+
let rho = vsqrtq_f32(rho2);
48+
49+
let ar = vmulq_f32(alpha, rho);
50+
let norm = vfmaq_f32(ar, one_alpha, z);
51+
// let mx = vdivq_f32(x, norm);
52+
// let my = vdivq_f32(y, norm);
53+
let norm_recip = vrecpeq_f32(norm); // 初步估算 reciprocal
54+
let norm_recip = vmulq_f32(vrecpsq_f32(norm, norm_recip), norm_recip); // refine 1
55+
let mx = vmulq_f32(x, norm_recip);
56+
let my = vmulq_f32(y, norm_recip);
57+
58+
let px = vfmaq_f32(cx, fx, mx);
59+
let py = vfmaq_f32(cy, fy, my);
60+
vst1q_f32(outx.as_mut_ptr(), px);
61+
vst1q_f32(outy.as_mut_ptr(), py);
62+
p2dss[0].x = outx[0];
63+
p2dss[0].y = outy[0];
64+
p2dss[1].x = outx[1];
65+
p2dss[1].y = outy[1];
66+
p2dss[2].x = outx[2];
67+
p2dss[2].y = outy[2];
68+
p2dss[3].x = outx[3];
69+
p2dss[3].y = outy[3];
70+
}
71+
});
72+
p2ds
73+
}
74+
75+
fn main() {
76+
let model1 = model_from_json("data/eucm0.json").cast();
77+
let params = model1.params().cast::<f32>();
78+
79+
let mut p3ds = Vec::new();
80+
for _ in 0..1000 {
81+
p3ds.push(na::Vector3::new(rand::random(), random(), 1.0));
82+
}
83+
let p2ds0 = bench_eucm_simd_impl(&params, &p3ds);
84+
let p2ds1 = model1.project(&p3ds);
85+
p2ds0.iter().zip(p2ds1).for_each(|(p0, p1)| {
86+
if let Some(p1) = p1 {
87+
println!("p0\t{}\t{}", p0.x, p0.y);
88+
println!("p1\t{}\t{}\n", p0.x, p1.y);
89+
}
90+
});
91+
}

src/generic_functions.rs

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ pub fn init_undistort_map(
4545
.into_par_iter()
4646
.flat_map(|y| {
4747
(0..new_w_h.0)
48-
.into_iter()
4948
.map(|x| {
5049
rmat_inv * na::Vector3::new((x as f64 - cx) / fx, (y as f64 - cy) / fy, 1.0)
5150
})
@@ -70,10 +69,10 @@ pub fn init_undistort_map(
7069

7170
#[inline]
7271
fn interpolate_bilinear_weight(x: f32, y: f32) -> (u32, u32) {
73-
if x < 0.0 || x > 65535.0 {
72+
if !(0.0..=65535.0).contains(&x) {
7473
panic!("x not in [0-65535]");
7574
}
76-
if y < 0.0 || y > 65535.0 {
75+
if !(0.0..=65535.0).contains(&y) {
7776
panic!("y not in [0-65535]");
7877
}
7978
const UPPER: f32 = u8::MAX as f32;
@@ -125,10 +124,8 @@ pub fn fast_remap(
125124
let xw1 = 255 - xw0;
126125
let yw1 = 255 - yw0;
127126
const UPPER_UPPER: u32 = 255 * 255;
128-
let p =
129-
((p00 * xw0 * yw0 + p10 * xw1 * yw0 + p01 * xw0 * yw1 + p11 * xw1 * yw1)
130-
/ UPPER_UPPER) as u8;
131-
p
127+
((p00 * xw0 * yw0 + p10 * xw1 * yw0 + p01 * xw0 * yw1 + p11 * xw1 * yw1)
128+
/ UPPER_UPPER) as u8
132129
})
133130
.collect();
134131
let img = GrayImage::from_vec(new_w_h.0, new_w_h.1, val).unwrap();
@@ -171,10 +168,8 @@ pub fn fast_remap(
171168
let xw1 = 255 - xw0;
172169
let yw1 = 255 - yw0;
173170
const UPPER_UPPER: u32 = 255 * 255;
174-
let p =
175-
((p00 * xw0 * yw0 + p10 * xw1 * yw0 + p01 * xw0 * yw1 + p11 * xw1 * yw1)
176-
/ UPPER_UPPER) as u16;
177-
p
171+
((p00 * xw0 * yw0 + p10 * xw1 * yw0 + p01 * xw0 * yw1 + p11 * xw1 * yw1)
172+
/ UPPER_UPPER) as u16
178173
})
179174
.collect();
180175
let img = ImageBuffer::from_vec(new_w_h.0, new_w_h.1, val).unwrap();

0 commit comments

Comments
 (0)