OpenVisualCloud
diff --git a/‎app/meson.build
Lines changed: 15 additions & 0 deletions b/‎app/meson.build
Lines changed: 15 additions & 0 deletions
diff --git a/‎app/perf/meson.build
Lines changed: 2 additions & 0 deletions b/‎app/perf/meson.build
Lines changed: 2 additions & 0 deletions
diff --git a/‎app/perf/p16le_to_rfc4175_422be10.c
Lines changed: 206 additions & 0 deletions b/‎app/perf/p16le_to_rfc4175_422be10.c
Lines changed: 206 additions & 0 deletions
diff --git a/‎app/perf/perf_test.sh
Lines changed: 2 additions & 0 deletions b/‎app/perf/perf_test.sh
Lines changed: 2 additions & 0 deletions
@@ -130,6 +130,21 @@ executable('PerfP10LeToRfc4175422be10', perf_p10le_to_rfc4175_422be10_sources,
   dependencies: [asan_dep, mtl, libpthread, ws2_32_dep]
 )
 
+executable('PerfRfc4175422be10ToP16Le', perf_rfc4175_422be10_to_p16le_sources,
+  c_args : app_c_args,
+  link_args: app_ld_args,
+  # asan should be always the first dep
+  dependencies: [asan_dep, mtl, libpthread, ws2_32_dep]
+)
+
+executable('PerfP16LeToRfc4175422be10', perf_p16le_to_rfc4175_422be10_sources,
+  c_args : app_c_args,
+  link_args: app_ld_args,
+  # asan should be always the first dep
+  dependencies: [asan_dep, mtl, libpthread, ws2_32_dep]
+)
+
+
 executable('PerfRfc4175422be10ToLe', perf_rfc4175_422be10_to_le_sources,
   c_args : app_c_args,
   link_args: app_ld_args,
 
@@ -3,6 +3,8 @@
 
 perf_rfc4175_422be10_to_p10le_sources = files('rfc4175_422be10_to_p10le.c', '../sample/sample_util.c')
 perf_p10le_to_rfc4175_422be10_sources = files('p10le_to_rfc4175_422be10.c', '../sample/sample_util.c')
+perf_rfc4175_422be10_to_p16le_sources = files('rfc4175_422be10_to_p16le.c', '../sample/sample_util.c')
+perf_p16le_to_rfc4175_422be10_sources = files('p16le_to_rfc4175_422be10.c', '../sample/sample_util.c')
 perf_rfc4175_422be10_to_le_sources = files('rfc4175_422be10_to_le.c', '../sample/sample_util.c')
 perf_rfc4175_422le10_to_be_sources = files('rfc4175_422le10_to_be.c', '../sample/sample_util.c')
 perf_rfc4175_422be10_to_le8_sources = files('rfc4175_422be10_to_le8.c', '../sample/sample_util.c')
 
@@ -0,0 +1,206 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#include <errno.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "../sample/sample_util.h"
+
+static void fill_422_planar_le16(uint16_t* y, uint16_t* b, uint16_t* r, int w, int h) {
+  int pg_size = w * h / 2;
+
+  for (int pg = 0; pg < pg_size; pg++) {
+    *b++ = (0 + pg * 4) << 6;
+    *y++ = (1 + pg * 4) << 6;
+    *r++ = (2 + pg * 4) << 6;
+    *y++ = (3 + pg * 4) << 6;
+  }
+}
+
+static int perf_cvt_planar_le16_to_422_10_pg2(mtl_handle st, int w, int h, int frames,
+                                              int fb_cnt) {
+  size_t fb_pg2_size = (size_t)w * h * 5 / 2;
+  //  mtl_udma_handle dma = mtl_udma_create(st, 128, MTL_PORT_P);
+  struct st20_rfc4175_422_10_pg2_be* pg_be =
+      (struct st20_rfc4175_422_10_pg2_be*)malloc(fb_pg2_size * fb_cnt);
+  size_t planar_size = (size_t)w * h * 2 * sizeof(uint16_t);
+  float planar_size_m = (float)planar_size / 1024 / 1024;
+  uint16_t* p10_u16 = (uint16_t*)mtl_hp_malloc(st, planar_size * fb_cnt, MTL_PORT_P);
+  uint16_t* p10_u16_b = p10_u16 + w * h;
+  uint16_t* p10_u16_r = p10_u16 + w * h * 3 / 2;
+  //  mtl_iova_t p10_u16_y_iova = mtl_hp_virt2iova(st, p10_u16);
+  //  mtl_iova_t p10_u16_b_iova = p10_u16_y_iova + planar_size / 2;
+  //  mtl_iova_t p10_u16_r_iova = p10_u16_b_iova + planar_size / 4;
+  //  mtl_iova_t p10_u16_y_in_iova, p10_u16_b_in_iova, p10_u16_r_in_iova;
+  enum mtl_simd_level cpu_level = mtl_get_simd_level();
+
+  struct st20_rfc4175_422_10_pg2_be* pg_be_out;
+  uint16_t* p10_u16_in;
+  uint16_t* p10_u16_b_in;
+  uint16_t* p10_u16_r_in;
+
+  for (int i = 0; i < fb_cnt; i++) {
+    p10_u16_in = p10_u16 + (i % fb_cnt) * (planar_size / sizeof(*p10_u16));
+    p10_u16_b_in = p10_u16_b + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_b));
+    p10_u16_r_in = p10_u16_r + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_r));
+    fill_422_planar_le16(p10_u16_in, p10_u16_b_in, p10_u16_r_in, w, h);
+  }
+
+  clock_t start, end;
+  float duration;
+
+  start = clock();
+  for (int i = 0; i < frames; i++) {
+    pg_be_out = pg_be + (i % fb_cnt) * (fb_pg2_size / sizeof(*pg_be));
+    p10_u16_in = p10_u16 + (i % fb_cnt) * (planar_size / sizeof(*p10_u16));
+    p10_u16_b_in = p10_u16_b + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_b));
+    p10_u16_r_in = p10_u16_r + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_r));
+    st20_yuv422p16le_to_rfc4175_422be10_simd(p10_u16_in, p10_u16_b_in, p10_u16_r_in,
+                                             pg_be_out, w, h, MTL_SIMD_LEVEL_NONE);
+  }
+  end = clock();
+  duration = (float)(end - start) / CLOCKS_PER_SEC;
+  info("scalar, time: %f secs with %d frames(%dx%d,%fm@%d buffers)\n", duration, frames,
+       w, h, planar_size_m, fb_cnt);
+
+  if (cpu_level >= MTL_SIMD_LEVEL_AVX512) {
+    start = clock();
+    for (int i = 0; i < frames; i++) {
+      pg_be_out = pg_be + (i % fb_cnt) * (fb_pg2_size / sizeof(*pg_be));
+      p10_u16_in = p10_u16 + (i % fb_cnt) * (planar_size / sizeof(*p10_u16));
+      p10_u16_b_in = p10_u16_b + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_b));
+      p10_u16_r_in = p10_u16_r + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_r));
+      st20_yuv422p16le_to_rfc4175_422be10_simd(p10_u16_in, p10_u16_b_in, p10_u16_r_in,
+                                               pg_be_out, w, h, MTL_SIMD_LEVEL_AVX512);
+    }
+    end = clock();
+    float duration_simd = (float)(end - start) / CLOCKS_PER_SEC;
+    info("avx512, time: %f secs with %d frames(%dx%d@%d buffers)\n", duration_simd,
+         frames, w, h, fb_cnt);
+    info("avx512, %fx performance to scalar\n", duration / duration_simd);
+    /*    if (dma) {
+          start = clock();
+          for (int i = 0; i < frames; i++) {
+            pg_be_out = pg_be + (i % fb_cnt) * (fb_pg2_size / sizeof(*pg_be));
+            p10_u16_in = p10_u16 + (i % fb_cnt) * (planar_size / sizeof(*p10_u16));
+            p10_u16_b_in = p10_u16_b + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_b));
+            p10_u16_r_in = p10_u16_r + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_r));
+            p10_u16_y_in_iova = p10_u16_y_iova + (i % fb_cnt) * (planar_size);
+            p10_u16_b_in_iova = p10_u16_b_iova + (i % fb_cnt) * (planar_size);
+            p10_u16_r_in_iova = p10_u16_r_iova + (i % fb_cnt) * (planar_size);
+            st20_yuv422p10le_to_rfc4175_422be10_simd_dma(
+                dma, p10_u16_in, p10_u16_y_in_iova, p10_u16_b_in, p10_u16_b_in_iova,
+                p10_u16_r_in, p10_u16_r_in_iova, pg_be_out, w, h, MTL_SIMD_LEVEL_AVX512);
+          }
+          end = clock();
+          float duration_simd = (float)(end - start) / CLOCKS_PER_SEC;
+          info("avx512+dma, time: %f secs with %d frames(%dx%d@%d buffers)\n",
+       duration_simd, frames, w, h, fb_cnt); info("avx512+dma, %fx performance to
+       scalar\n", duration / duration_simd);
+        }*/
+  }
+  /*
+    if (cpu_level >= MTL_SIMD_LEVEL_AVX512_VBMI2) {
+      start = clock();
+      for (int i = 0; i < frames; i++) {
+        pg_be_out = pg_be + (i % fb_cnt) * (fb_pg2_size / sizeof(*pg_be));
+        p10_u16_in = p10_u16 + (i % fb_cnt) * (planar_size / sizeof(*p10_u16));
+        p10_u16_b_in = p10_u16_b + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_b));
+        p10_u16_r_in = p10_u16_r + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_r));
+        st20_yuv422p10le_to_rfc4175_422be10_simd(p10_u16_in, p10_u16_b_in, p10_u16_r_in,
+                                                 pg_be_out, w, h,
+                                                 MTL_SIMD_LEVEL_AVX512_VBMI2);
+      }
+      end = clock();
+      float duration_vbmi = (float)(end - start) / CLOCKS_PER_SEC;
+      info("avx512_vbmi, time: %f secs with %d frames(%dx%d@%d buffers)\n", duration_vbmi,
+           frames, w, h, fb_cnt);
+      info("avx512_vbmi, %fx performance to scalar\n", duration / duration_vbmi);
+      if (dma) {
+        start = clock();
+        for (int i = 0; i < frames; i++) {
+          pg_be_out = pg_be + (i % fb_cnt) * (fb_pg2_size / sizeof(*pg_be));
+          p10_u16_in = p10_u16 + (i % fb_cnt) * (planar_size / sizeof(*p10_u16));
+          p10_u16_b_in = p10_u16_b + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_b));
+          p10_u16_r_in = p10_u16_r + (i % fb_cnt) * (planar_size / sizeof(*p10_u16_r));
+          p10_u16_y_in_iova = p10_u16_y_iova + (i % fb_cnt) * (planar_size);
+          p10_u16_b_in_iova = p10_u16_b_iova + (i % fb_cnt) * (planar_size);
+          p10_u16_r_in_iova = p10_u16_r_iova + (i % fb_cnt) * (planar_size);
+          st20_yuv422p10le_to_rfc4175_422be10_simd_dma(
+              dma, p10_u16_in, p10_u16_y_in_iova, p10_u16_b_in, p10_u16_b_in_iova,
+              p10_u16_r_in, p10_u16_r_in_iova, pg_be_out, w, h,
+              MTL_SIMD_LEVEL_AVX512_VBMI2);
+        }
+        end = clock();
+        float duration_simd = (float)(end - start) / CLOCKS_PER_SEC;
+        info("avx512_vbmi+dma, time: %f secs with %d frames(%dx%d@%d buffers)\n",
+             duration_simd, frames, w, h, fb_cnt);
+        info("avx512_vbmi+dma, %fx performance to scalar\n", duration / duration_simd);
+      }
+    } */
+
+  free(pg_be);
+  mtl_hp_free(st, p10_u16);
+  //  if (dma) mtl_udma_free(dma);
+
+  return 0;
+}
+
+static void* perf_thread(void* arg) {
+  struct st_sample_context* ctx = arg;
+  mtl_handle dev_handle = ctx->st;
+  int frames = ctx->perf_frames;
+  int fb_cnt = ctx->perf_fb_cnt;
+
+  unsigned int lcore = 0;
+  int ret = mtl_get_lcore(dev_handle, &lcore);
+  if (ret < 0) {
+    return NULL;
+  }
+  mtl_bind_to_lcore(dev_handle, pthread_self(), lcore);
+  info("%s, run in lcore %u\n", __func__, lcore);
+
+  perf_cvt_planar_le16_to_422_10_pg2(dev_handle, 640, 480, frames, fb_cnt);
+  perf_cvt_planar_le16_to_422_10_pg2(dev_handle, 1280, 720, frames, fb_cnt);
+  perf_cvt_planar_le16_to_422_10_pg2(dev_handle, 1920, 1080, frames, fb_cnt);
+  perf_cvt_planar_le16_to_422_10_pg2(dev_handle, 1920 * 2, 1080 * 2, frames, fb_cnt);
+  perf_cvt_planar_le16_to_422_10_pg2(dev_handle, 1920 * 4, 1080 * 4, frames, fb_cnt);
+
+  mtl_put_lcore(dev_handle, lcore);
+
+  return NULL;
+}
+
+int main(int argc, char** argv) {
+  struct st_sample_context ctx;
+  int ret;
+
+  memset(&ctx, 0, sizeof(ctx));
+  ret = tx_sample_parse_args(&ctx, argc, argv);
+  if (ret < 0) return ret;
+
+  ctx.st = mtl_init(&ctx.param);
+  if (!ctx.st) {
+    err("%s: mtl_init fail\n", __func__);
+    return -EIO;
+  }
+
+  pthread_t thread;
+  ret = pthread_create(&thread, NULL, perf_thread, &ctx);
+  if (ret) goto exit;
+  pthread_join(thread, NULL);
+
+exit:
+  /* release sample(st) dev */
+  if (ctx.st) {
+    mtl_uninit(ctx.st);
+    ctx.st = NULL;
+  }
+  return ret;
+}
@@ -23,6 +23,8 @@ perf_func() {
 
 perf_func PerfRfc4175422be10ToP10Le
 perf_func PerfP10LeToRfc4175422be10
+perf_func PerfRfc4175422be10ToP16Le
+perf_func PerfP16LeToRfc4175422be10
 perf_func PerfRfc4175422be10ToLe
 perf_func PerfRfc4175422le10ToBe
 perf_func PerfRfc4175422be10ToLe8