Skip to content

Commit ee47fc7

Browse files
author
desaulov
committed
support for FRAME_FMT_YUV422PLANAR16LE with AVX512
1 parent 4706531 commit ee47fc7

File tree

13 files changed

+687
-40
lines changed

13 files changed

+687
-40
lines changed

app/meson.build

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,21 @@ executable('PerfP10LeToRfc4175422be10', perf_p10le_to_rfc4175_422be10_sources,
130130
dependencies: [asan_dep, mtl, libpthread, ws2_32_dep]
131131
)
132132

133+
executable('PerfRfc4175422be10ToP16Le', perf_rfc4175_422be10_to_p16le_sources,
134+
c_args : app_c_args,
135+
link_args: app_ld_args,
136+
# asan should be always the first dep
137+
dependencies: [asan_dep, mtl, libpthread, ws2_32_dep]
138+
)
139+
140+
executable('PerfP16LeToRfc4175422be10', perf_p16le_to_rfc4175_422be10_sources,
141+
c_args : app_c_args,
142+
link_args: app_ld_args,
143+
# asan should be always the first dep
144+
dependencies: [asan_dep, mtl, libpthread, ws2_32_dep]
145+
)
146+
147+
133148
executable('PerfRfc4175422be10ToLe', perf_rfc4175_422be10_to_le_sources,
134149
c_args : app_c_args,
135150
link_args: app_ld_args,

app/perf/meson.build

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
perf_rfc4175_422be10_to_p10le_sources = files('rfc4175_422be10_to_p10le.c', '../sample/sample_util.c')
55
perf_p10le_to_rfc4175_422be10_sources = files('p10le_to_rfc4175_422be10.c', '../sample/sample_util.c')
6+
perf_rfc4175_422be10_to_p16le_sources = files('rfc4175_422be10_to_p16le.c', '../sample/sample_util.c')
7+
perf_p16le_to_rfc4175_422be10_sources = files('p16le_to_rfc4175_422be10.c', '../sample/sample_util.c')
68
perf_rfc4175_422be10_to_le_sources = files('rfc4175_422be10_to_le.c', '../sample/sample_util.c')
79
perf_rfc4175_422le10_to_be_sources = files('rfc4175_422le10_to_be.c', '../sample/sample_util.c')
810
perf_rfc4175_422be10_to_le8_sources = files('rfc4175_422be10_to_le8.c', '../sample/sample_util.c')

app/perf/p16le_to_rfc4175_422be10.c

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
/* SPDX-License-Identifier: BSD-3-Clause
2+
* Copyright(c) 2022 Intel Corporation
3+
*/
4+
5+
#include <errno.h>
6+
#include <pthread.h>
7+
#include <stdbool.h>
8+
#include <stdio.h>
9+
#include <stdlib.h>
10+
#include <string.h>
11+
#include <unistd.h>
12+
13+
#include "../sample/sample_util.h"
14+
15+
/**
 * Fill planar YUV 4:2:2 16-bit buffers with a deterministic counting pattern.
 *
 * One pixel group (pg) covers two pixels: one b sample, one r sample and two
 * y samples. For pixel group index pg the values 4*pg+0 .. 4*pg+3 are shifted
 * left by 6 bits and stored, truncated to 16 bits, in the order b, y, r, y.
 *
 * @param y destination for the y samples; 2 * (w * h / 2) entries are written.
 * @param b destination for the b samples; w * h / 2 entries are written.
 * @param r destination for the r samples; w * h / 2 entries are written.
 * @param w frame width in pixels.
 * @param h frame height in pixels.
 */
static void fill_422_planar_le16(uint16_t* y, uint16_t* b, uint16_t* r, int w, int h) {
  int pg_size = w * h / 2;

  for (int pg = 0; pg < pg_size; pg++) {
    /*
     * Do the math unsigned: for large frames (e.g. 7680x4320) the signed
     * expression (pg * 4) << 6 exceeds INT_MAX, which is undefined behaviour.
     * Only the low 16 bits are stored, so the pattern itself is unchanged.
     */
    uint32_t base = (uint32_t)pg * 4u;

    *b++ = (uint16_t)((base + 0u) << 6);
    *y++ = (uint16_t)((base + 1u) << 6);
    *r++ = (uint16_t)((base + 2u) << 6);
    *y++ = (uint16_t)((base + 3u) << 6);
  }
}
25+
26+
/**
 * Time the yuv422 planar 16-bit LE to RFC4175 422 10-bit BE (pg2) conversion.
 *
 * Allocates fb_cnt destination frames with malloc() and fb_cnt source frames
 * with mtl_hp_malloc(), fills every source frame with a test pattern, then
 * converts `frames` frames (cycling through the fb_cnt buffers) with the
 * scalar implementation and, when the CPU reports at least AVX512, again with
 * the AVX512 implementation. Timings are reported through info().
 *
 * @param st     MTL instance handle, used for the hugepage allocation.
 * @param w      frame width in pixels.
 * @param h      frame height in pixels.
 * @param frames number of conversions to time per implementation.
 * @param fb_cnt number of frame buffers to cycle through; must be positive.
 * @return 0 on success, -EINVAL if fb_cnt is not positive, -ENOMEM if a
 *         buffer cannot be allocated.
 */
static int perf_cvt_planar_le16_to_422_10_pg2(mtl_handle st, int w, int h, int frames,
                                              int fb_cnt) {
  /* fb_cnt is used both as a modulus and as an allocation multiplier below */
  if (fb_cnt <= 0) {
    err("%s, invalid fb_cnt %d\n", __func__, fb_cnt);
    return -EINVAL;
  }

  size_t fb_pg2_size = (size_t)w * h * 5 / 2;
  size_t planar_size = (size_t)w * h * 2 * sizeof(uint16_t);
  float planar_size_m = (float)planar_size / 1024 / 1024;

  struct st20_rfc4175_422_10_pg2_be* pg_be = malloc(fb_pg2_size * fb_cnt);
  if (!pg_be) {
    err("%s, pg_be malloc fail for %dx%d\n", __func__, w, h);
    return -ENOMEM;
  }

  uint16_t* p10_u16 = (uint16_t*)mtl_hp_malloc(st, planar_size * fb_cnt, MTL_PORT_P);
  if (!p10_u16) {
    err("%s, planar buffer malloc fail for %dx%d\n", __func__, w, h);
    free(pg_be);
    return -ENOMEM;
  }

  /* Within one frame the planes are laid out as y, then b, then r */
  uint16_t* p10_u16_b = p10_u16 + w * h;
  uint16_t* p10_u16_r = p10_u16 + w * h * 3 / 2;
  /* Per-frame strides, in elements of the respective pointer type */
  size_t planar_stride = planar_size / sizeof(*p10_u16);
  size_t pg_stride = fb_pg2_size / sizeof(*pg_be);
  enum mtl_simd_level cpu_level = mtl_get_simd_level();

  struct st20_rfc4175_422_10_pg2_be* pg_be_out;
  uint16_t* p10_u16_in;
  uint16_t* p10_u16_b_in;
  uint16_t* p10_u16_r_in;

  for (int i = 0; i < fb_cnt; i++) {
    size_t offset = (size_t)i * planar_stride;

    fill_422_planar_le16(p10_u16 + offset, p10_u16_b + offset, p10_u16_r + offset, w, h);
  }

  clock_t start, end;
  float duration;

  start = clock();
  for (int i = 0; i < frames; i++) {
    size_t fb = (size_t)(i % fb_cnt);

    pg_be_out = pg_be + fb * pg_stride;
    p10_u16_in = p10_u16 + fb * planar_stride;
    p10_u16_b_in = p10_u16_b + fb * planar_stride;
    p10_u16_r_in = p10_u16_r + fb * planar_stride;
    st20_yuv422p16le_to_rfc4175_422be10_simd(p10_u16_in, p10_u16_b_in, p10_u16_r_in,
                                             pg_be_out, w, h, MTL_SIMD_LEVEL_NONE);
  }
  end = clock();
  duration = (float)(end - start) / CLOCKS_PER_SEC;
  info("scalar, time: %f secs with %d frames(%dx%d,%fm@%d buffers)\n", duration, frames,
       w, h, planar_size_m, fb_cnt);

  if (cpu_level >= MTL_SIMD_LEVEL_AVX512) {
    start = clock();
    for (int i = 0; i < frames; i++) {
      size_t fb = (size_t)(i % fb_cnt);

      pg_be_out = pg_be + fb * pg_stride;
      p10_u16_in = p10_u16 + fb * planar_stride;
      p10_u16_b_in = p10_u16_b + fb * planar_stride;
      p10_u16_r_in = p10_u16_r + fb * planar_stride;
      st20_yuv422p16le_to_rfc4175_422be10_simd(p10_u16_in, p10_u16_b_in, p10_u16_r_in,
                                               pg_be_out, w, h, MTL_SIMD_LEVEL_AVX512);
    }
    end = clock();
    float duration_simd = (float)(end - start) / CLOCKS_PER_SEC;
    info("avx512, time: %f secs with %d frames(%dx%d@%d buffers)\n", duration_simd,
         frames, w, h, fb_cnt);
    info("avx512, %fx performance to scalar\n", duration / duration_simd);
  }

  /*
   * TODO: DMA-assisted and AVX512_VBMI2 measurements are not run here. The
   * previously commented-out versions of those loops called the p10le
   * converters rather than p16le ones, so they were dropped instead of kept
   * as dead code.
   */

  free(pg_be);
  mtl_hp_free(st, p10_u16);

  return 0;
}
154+
155+
static void* perf_thread(void* arg) {
156+
struct st_sample_context* ctx = arg;
157+
mtl_handle dev_handle = ctx->st;
158+
int frames = ctx->perf_frames;
159+
int fb_cnt = ctx->perf_fb_cnt;
160+
161+
unsigned int lcore = 0;
162+
int ret = mtl_get_lcore(dev_handle, &lcore);
163+
if (ret < 0) {
164+
return NULL;
165+
}
166+
mtl_bind_to_lcore(dev_handle, pthread_self(), lcore);
167+
info("%s, run in lcore %u\n", __func__, lcore);
168+
169+
perf_cvt_planar_le16_to_422_10_pg2(dev_handle, 640, 480, frames, fb_cnt);
170+
perf_cvt_planar_le16_to_422_10_pg2(dev_handle, 1280, 720, frames, fb_cnt);
171+
perf_cvt_planar_le16_to_422_10_pg2(dev_handle, 1920, 1080, frames, fb_cnt);
172+
perf_cvt_planar_le16_to_422_10_pg2(dev_handle, 1920 * 2, 1080 * 2, frames, fb_cnt);
173+
perf_cvt_planar_le16_to_422_10_pg2(dev_handle, 1920 * 4, 1080 * 4, frames, fb_cnt);
174+
175+
mtl_put_lcore(dev_handle, lcore);
176+
177+
return NULL;
178+
}
179+
180+
int main(int argc, char** argv) {
181+
struct st_sample_context ctx;
182+
int ret;
183+
184+
memset(&ctx, 0, sizeof(ctx));
185+
ret = tx_sample_parse_args(&ctx, argc, argv);
186+
if (ret < 0) return ret;
187+
188+
ctx.st = mtl_init(&ctx.param);
189+
if (!ctx.st) {
190+
err("%s: mtl_init fail\n", __func__);
191+
return -EIO;
192+
}
193+
194+
pthread_t thread;
195+
ret = pthread_create(&thread, NULL, perf_thread, &ctx);
196+
if (ret) goto exit;
197+
pthread_join(thread, NULL);
198+
199+
exit:
200+
/* release sample(st) dev */
201+
if (ctx.st) {
202+
mtl_uninit(ctx.st);
203+
ctx.st = NULL;
204+
}
205+
return ret;
206+
}

app/perf/perf_test.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ perf_func() {
2323

2424
perf_func PerfRfc4175422be10ToP10Le
2525
perf_func PerfP10LeToRfc4175422be10
26+
perf_func PerfRfc4175422be10ToP16Le
27+
perf_func PerfP16LeToRfc4175422be10
2628
perf_func PerfRfc4175422be10ToLe
2729
perf_func PerfRfc4175422le10ToBe
2830
perf_func PerfRfc4175422be10ToLe8

0 commit comments

Comments
 (0)