|
| 1 | +/* SPDX-License-Identifier: BSD-3-Clause |
| 2 | + * Copyright(c) 2022 Intel Corporation |
| 3 | + */ |
| 4 | + |
| 5 | +#include <errno.h> |
| 6 | +#include <pthread.h> |
| 7 | +#include <stdbool.h> |
| 8 | +#include <stdio.h> |
| 9 | +#include <stdlib.h> |
| 10 | +#include <string.h> |
| 11 | +#include <unistd.h> |
| 12 | + |
| 13 | +#include "../sample/sample_util.h" |
| 14 | + |
/**
 * Fill three sample planes with a deterministic ramp pattern.
 *
 * The frame is processed as w * h / 2 pixel groups. For group index pg the
 * function writes, in this order: one sample to b, one to y, one to r and a
 * second one to y. The value written is ((4 * pg + k) << 6) truncated to
 * 16 bits, with k = 0 (b), 1 (first y), 2 (r) and 3 (second y).
 *
 * @param y  destination plane, receives 2 * (w * h / 2) samples
 * @param b  destination plane, receives w * h / 2 samples
 * @param r  destination plane, receives w * h / 2 samples
 * @param w  frame width
 * @param h  frame height
 */
static void fill_422_planar_le16(uint16_t* y, uint16_t* b, uint16_t* r, int w, int h) {
  int pg_size = w * h / 2;

  for (int pg = 0; pg < pg_size; pg++) {
    /* Do the arithmetic unsigned: a signed `(k + pg * 4) << 6` overflows int
     * (undefined behaviour) once pg * 256 exceeds INT_MAX, which happens for a
     * 7680x4320 frame. Only the low 16 bits are stored, so wrapping modulo 2^32
     * leaves every stored sample unchanged. */
    uint32_t base = (uint32_t)pg * 4u;

    *b++ = (uint16_t)((base + 0u) << 6);
    *y++ = (uint16_t)((base + 1u) << 6);
    *r++ = (uint16_t)((base + 2u) << 6);
    *y++ = (uint16_t)((base + 3u) << 6);
  }
}
| 25 | + |
| 26 | +static int perf_cvt_planar_le16_to_422_10_pg2(mtl_handle st, int w, int h, int frames, |
| 27 | + int fb_cnt) { |
| 28 | + size_t fb_pg2_size = (size_t)w * h * 5 / 2; |
| 29 | + // mtl_udma_handle dma = mtl_udma_create(st, 128, MTL_PORT_P); |
| 30 | + struct st20_rfc4175_422_10_pg2_be* pg_be = |
| 31 | + (struct st20_rfc4175_422_10_pg2_be*)malloc(fb_pg2_size * fb_cnt); |
| 32 | + size_t planar_size = (size_t)w * h * 2 * sizeof(uint16_t); |
| 33 | + float planar_size_m = (float)planar_size / 1024 / 1024; |
| 34 | + uint16_t* p10_u16 = (uint16_t*)mtl_hp_malloc(st, planar_size * fb_cnt, MTL_PORT_P); |
| 35 | + uint16_t* p10_u16_b = p10_u16 + w * h; |
| 36 | + uint16_t* p10_u16_r = p10_u16 + w * h * 3 / 2; |
| 37 | + // mtl_iova_t p10_u16_y_iova = mtl_hp_virt2iova(st, p10_u16); |
| 38 | + // mtl_iova_t p10_u16_b_iova = p10_u16_y_iova + planar_size / 2; |
| 39 | + // mtl_iova_t p10_u16_r_iova = p10_u16_b_iova + planar_size / 4; |
| 40 | + // mtl_iova_t p10_u16_y_in_iova, p10_u16_b_in_iova, p10_u16_r_in_iova; |
| 41 | + enum mtl_simd_level cpu_level = mtl_get_simd_level(); |
| 42 | + |
| 43 | + struct st20_rfc4175_422_10_pg2_be* pg_be_out; |
| 44 | + uint16_t* p10_u16_in; |
| 45 | + uint16_t* p10_u16_b_in; |
| 46 | + uint16_t* p10_u16_r_in; |
| 47 | + |
| 48 | + for (int i = 0; i < fb_cnt; i++) { |
| 49 | + p10_u16_in = p10_u16 + (i % fb_cnt) * (planar_size / sizeof(*p10_u16)); |
| 50 | + p10_u16_b_in = p10_u16_b + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_b)); |
| 51 | + p10_u16_r_in = p10_u16_r + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_r)); |
| 52 | + fill_422_planar_le16(p10_u16_in, p10_u16_b_in, p10_u16_r_in, w, h); |
| 53 | + } |
| 54 | + |
| 55 | + clock_t start, end; |
| 56 | + float duration; |
| 57 | + |
| 58 | + start = clock(); |
| 59 | + for (int i = 0; i < frames; i++) { |
| 60 | + pg_be_out = pg_be + (i % fb_cnt) * (fb_pg2_size / sizeof(*pg_be)); |
| 61 | + p10_u16_in = p10_u16 + (i % fb_cnt) * (planar_size / sizeof(*p10_u16)); |
| 62 | + p10_u16_b_in = p10_u16_b + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_b)); |
| 63 | + p10_u16_r_in = p10_u16_r + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_r)); |
| 64 | + st20_yuv422p16le_to_rfc4175_422be10_simd(p10_u16_in, p10_u16_b_in, p10_u16_r_in, |
| 65 | + pg_be_out, w, h, MTL_SIMD_LEVEL_NONE); |
| 66 | + } |
| 67 | + end = clock(); |
| 68 | + duration = (float)(end - start) / CLOCKS_PER_SEC; |
| 69 | + info("scalar, time: %f secs with %d frames(%dx%d,%fm@%d buffers)\n", duration, frames, |
| 70 | + w, h, planar_size_m, fb_cnt); |
| 71 | + |
| 72 | + if (cpu_level >= MTL_SIMD_LEVEL_AVX512) { |
| 73 | + start = clock(); |
| 74 | + for (int i = 0; i < frames; i++) { |
| 75 | + pg_be_out = pg_be + (i % fb_cnt) * (fb_pg2_size / sizeof(*pg_be)); |
| 76 | + p10_u16_in = p10_u16 + (i % fb_cnt) * (planar_size / sizeof(*p10_u16)); |
| 77 | + p10_u16_b_in = p10_u16_b + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_b)); |
| 78 | + p10_u16_r_in = p10_u16_r + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_r)); |
| 79 | + st20_yuv422p16le_to_rfc4175_422be10_simd(p10_u16_in, p10_u16_b_in, p10_u16_r_in, |
| 80 | + pg_be_out, w, h, MTL_SIMD_LEVEL_AVX512); |
| 81 | + } |
| 82 | + end = clock(); |
| 83 | + float duration_simd = (float)(end - start) / CLOCKS_PER_SEC; |
| 84 | + info("avx512, time: %f secs with %d frames(%dx%d@%d buffers)\n", duration_simd, |
| 85 | + frames, w, h, fb_cnt); |
| 86 | + info("avx512, %fx performance to scalar\n", duration / duration_simd); |
| 87 | + /* if (dma) { |
| 88 | + start = clock(); |
| 89 | + for (int i = 0; i < frames; i++) { |
| 90 | + pg_be_out = pg_be + (i % fb_cnt) * (fb_pg2_size / sizeof(*pg_be)); |
| 91 | + p10_u16_in = p10_u16 + (i % fb_cnt) * (planar_size / sizeof(*p10_u16)); |
| 92 | + p10_u16_b_in = p10_u16_b + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_b)); |
| 93 | + p10_u16_r_in = p10_u16_r + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_r)); |
| 94 | + p10_u16_y_in_iova = p10_u16_y_iova + (i % fb_cnt) * (planar_size); |
| 95 | + p10_u16_b_in_iova = p10_u16_b_iova + (i % fb_cnt) * (planar_size); |
| 96 | + p10_u16_r_in_iova = p10_u16_r_iova + (i % fb_cnt) * (planar_size); |
| 97 | + st20_yuv422p10le_to_rfc4175_422be10_simd_dma( |
| 98 | + dma, p10_u16_in, p10_u16_y_in_iova, p10_u16_b_in, p10_u16_b_in_iova, |
| 99 | + p10_u16_r_in, p10_u16_r_in_iova, pg_be_out, w, h, MTL_SIMD_LEVEL_AVX512); |
| 100 | + } |
| 101 | + end = clock(); |
| 102 | + float duration_simd = (float)(end - start) / CLOCKS_PER_SEC; |
| 103 | + info("avx512+dma, time: %f secs with %d frames(%dx%d@%d buffers)\n", |
| 104 | + duration_simd, frames, w, h, fb_cnt); info("avx512+dma, %fx performance to |
| 105 | + scalar\n", duration / duration_simd); |
| 106 | + }*/ |
| 107 | + } |
| 108 | + /* |
| 109 | + if (cpu_level >= MTL_SIMD_LEVEL_AVX512_VBMI2) { |
| 110 | + start = clock(); |
| 111 | + for (int i = 0; i < frames; i++) { |
| 112 | + pg_be_out = pg_be + (i % fb_cnt) * (fb_pg2_size / sizeof(*pg_be)); |
| 113 | + p10_u16_in = p10_u16 + (i % fb_cnt) * (planar_size / sizeof(*p10_u16)); |
| 114 | + p10_u16_b_in = p10_u16_b + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_b)); |
| 115 | + p10_u16_r_in = p10_u16_r + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_r)); |
| 116 | + st20_yuv422p10le_to_rfc4175_422be10_simd(p10_u16_in, p10_u16_b_in, p10_u16_r_in, |
| 117 | + pg_be_out, w, h, |
| 118 | + MTL_SIMD_LEVEL_AVX512_VBMI2); |
| 119 | + } |
| 120 | + end = clock(); |
| 121 | + float duration_vbmi = (float)(end - start) / CLOCKS_PER_SEC; |
| 122 | + info("avx512_vbmi, time: %f secs with %d frames(%dx%d@%d buffers)\n", duration_vbmi, |
| 123 | + frames, w, h, fb_cnt); |
| 124 | + info("avx512_vbmi, %fx performance to scalar\n", duration / duration_vbmi); |
| 125 | + if (dma) { |
| 126 | + start = clock(); |
| 127 | + for (int i = 0; i < frames; i++) { |
| 128 | + pg_be_out = pg_be + (i % fb_cnt) * (fb_pg2_size / sizeof(*pg_be)); |
| 129 | + p10_u16_in = p10_u16 + (i % fb_cnt) * (planar_size / sizeof(*p10_u16)); |
| 130 | + p10_u16_b_in = p10_u16_b + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_b)); |
| 131 | + p10_u16_r_in = p10_u16_r + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_r)); |
| 132 | + p10_u16_y_in_iova = p10_u16_y_iova + (i % fb_cnt) * (planar_size); |
| 133 | + p10_u16_b_in_iova = p10_u16_b_iova + (i % fb_cnt) * (planar_size); |
| 134 | + p10_u16_r_in_iova = p10_u16_r_iova + (i % fb_cnt) * (planar_size); |
| 135 | + st20_yuv422p10le_to_rfc4175_422be10_simd_dma( |
| 136 | + dma, p10_u16_in, p10_u16_y_in_iova, p10_u16_b_in, p10_u16_b_in_iova, |
| 137 | + p10_u16_r_in, p10_u16_r_in_iova, pg_be_out, w, h, |
| 138 | + MTL_SIMD_LEVEL_AVX512_VBMI2); |
| 139 | + } |
| 140 | + end = clock(); |
| 141 | + float duration_simd = (float)(end - start) / CLOCKS_PER_SEC; |
| 142 | + info("avx512_vbmi+dma, time: %f secs with %d frames(%dx%d@%d buffers)\n", |
| 143 | + duration_simd, frames, w, h, fb_cnt); |
| 144 | + info("avx512_vbmi+dma, %fx performance to scalar\n", duration / duration_simd); |
| 145 | + } |
| 146 | + } */ |
| 147 | + |
| 148 | + free(pg_be); |
| 149 | + mtl_hp_free(st, p10_u16); |
| 150 | + // if (dma) mtl_udma_free(dma); |
| 151 | + |
| 152 | + return 0; |
| 153 | +} |
| 154 | + |
| 155 | +static void* perf_thread(void* arg) { |
| 156 | + struct st_sample_context* ctx = arg; |
| 157 | + mtl_handle dev_handle = ctx->st; |
| 158 | + int frames = ctx->perf_frames; |
| 159 | + int fb_cnt = ctx->perf_fb_cnt; |
| 160 | + |
| 161 | + unsigned int lcore = 0; |
| 162 | + int ret = mtl_get_lcore(dev_handle, &lcore); |
| 163 | + if (ret < 0) { |
| 164 | + return NULL; |
| 165 | + } |
| 166 | + mtl_bind_to_lcore(dev_handle, pthread_self(), lcore); |
| 167 | + info("%s, run in lcore %u\n", __func__, lcore); |
| 168 | + |
| 169 | + perf_cvt_planar_le16_to_422_10_pg2(dev_handle, 640, 480, frames, fb_cnt); |
| 170 | + perf_cvt_planar_le16_to_422_10_pg2(dev_handle, 1280, 720, frames, fb_cnt); |
| 171 | + perf_cvt_planar_le16_to_422_10_pg2(dev_handle, 1920, 1080, frames, fb_cnt); |
| 172 | + perf_cvt_planar_le16_to_422_10_pg2(dev_handle, 1920 * 2, 1080 * 2, frames, fb_cnt); |
| 173 | + perf_cvt_planar_le16_to_422_10_pg2(dev_handle, 1920 * 4, 1080 * 4, frames, fb_cnt); |
| 174 | + |
| 175 | + mtl_put_lcore(dev_handle, lcore); |
| 176 | + |
| 177 | + return NULL; |
| 178 | +} |
| 179 | + |
| 180 | +int main(int argc, char** argv) { |
| 181 | + struct st_sample_context ctx; |
| 182 | + int ret; |
| 183 | + |
| 184 | + memset(&ctx, 0, sizeof(ctx)); |
| 185 | + ret = tx_sample_parse_args(&ctx, argc, argv); |
| 186 | + if (ret < 0) return ret; |
| 187 | + |
| 188 | + ctx.st = mtl_init(&ctx.param); |
| 189 | + if (!ctx.st) { |
| 190 | + err("%s: mtl_init fail\n", __func__); |
| 191 | + return -EIO; |
| 192 | + } |
| 193 | + |
| 194 | + pthread_t thread; |
| 195 | + ret = pthread_create(&thread, NULL, perf_thread, &ctx); |
| 196 | + if (ret) goto exit; |
| 197 | + pthread_join(thread, NULL); |
| 198 | + |
| 199 | +exit: |
| 200 | + /* release sample(st) dev */ |
| 201 | + if (ctx.st) { |
| 202 | + mtl_uninit(ctx.st); |
| 203 | + ctx.st = NULL; |
| 204 | + } |
| 205 | + return ret; |
| 206 | +} |
0 commit comments