Merge pull request #4 from kpouget/timeout

kpouget · web-flow · commit 8dcefbdc78a5 · 2025-07-07T12:21:33.000+02:00
remoting: improve the loading timeout
diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h
@@ -83,29 +83,36 @@ struct timer_data {
 extern struct timer_data graph_compute_timer;
 extern struct timer_data get_tensor_timer;
 extern struct timer_data set_tensor_timer;
+extern struct timer_data wait_host_reply_timer;
+extern struct timer_data get_tensor_from_ptr_timer;
+extern struct timer_data set_tensor_from_ptr_timer;
 
 static inline void start_timer(struct timer_data *timer) {
   struct timespec ts;
   clock_gettime(CLOCK_REALTIME, &ts);  // Use CLOCK_MONOTONIC for elapsed time
   timer->start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
 }
 
-static inline void stop_timer(struct timer_data *timer) {
+// Stops `timer`: adds the ns elapsed since start_timer() to total, bumps count, returns that duration in ns
+static inline long long stop_timer(struct timer_data *timer) {
   struct timespec ts;
   clock_gettime(CLOCK_REALTIME, &ts);  // Use CLOCK_MONOTONIC for elapsed time
   long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
 
-  timer->total += (timer_end - timer->start);
+  long long duration = (timer_end - timer->start);  // ns between the start_timer() stamp and now
+  timer->total += duration;
   timer->count += 1;
+
+  return duration;
 }
 
 static inline void show_timer(struct timer_data *timer) {
   double ms = timer->total/1000000;
   double itl = ms/timer->count;
   double speed = 1/itl * 1000;
 
-  INFO("%14s [%9.0f] ms for %4ld invocations | ITL %2.2f ms | throughput = %4.2f t/s",
-       timer->name, ms, timer->count, itl, speed);
+  INFO("%15s [%9.0f] ms for %4ld invocations | ITL %2.2f ms | throughput = %4.2f t/s (%4.2f ms/call)",
+       timer->name, ms, timer->count, itl, speed, ms/timer->count);  // ms/call is the same quotient as itl
 }
 
 static const char *apir_backend_initialize_error(int code) {
diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp
@@ -121,6 +121,12 @@ static void showTime() {
   show_timer(&graph_compute_timer);
   show_timer(&get_tensor_timer);
   show_timer(&set_tensor_timer);
+  show_timer(&wait_host_reply_timer);
+
+  if (get_tensor_from_ptr_timer.count) {
+    show_timer(&get_tensor_from_ptr_timer);
+    show_timer(&set_tensor_from_ptr_timer);
+  }
 }
 
 ggml_backend_reg_t ggml_backend_remoting_frontend_reg() {
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h
@@ -34,6 +34,8 @@ void breakpoint();
 #ifndef NDEBUG
 inline void
 INFO(const char *format, ...) {
+  fprintf(stderr, "INFO: ");
+
   va_list argptr;
   va_start(argptr, format);
   vfprintf(stderr, format, argptr);
@@ -56,6 +58,17 @@ WARNING(const char *format, ...) {
   va_end(argptr);
 }
 
+// Prints "ERROR: ", then the printf-style message, then a newline, all on stderr.
+inline void
+ERROR(const char *format, ...) {
+  va_list args;
+  fputs("ERROR: ", stderr);
+  va_start(args, format);
+  vfprintf(stderr, format, args);
+  va_end(args);
+  fputc('\n', stderr);
+}
+
 inline void
 FATAL(const char *format, ...) {
   fprintf(stderr, "FATAL: ");
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp
@@ -26,6 +26,8 @@ virtgpu_ioctl_get_caps(struct virtgpu *gpu,
 static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param);
 static void virtgpu_init_renderer_info(struct virtgpu *gpu);
 
+struct timer_data wait_host_reply_timer = {0, 0, 0, "wait_host_reply"};  // time remote_call() spends waiting for the host reply
+
 static inline void
 virtgpu_init_shmem_blob_mem(struct virtgpu *gpu)
 {
@@ -98,8 +100,10 @@ create_virtgpu() {
   if (!encoder) {
     FATAL("%s: failed to prepare the remote call encoder :/", __func__);
   }
-  const uint64_t MAX_WAIT_US = 500000; // 5s (some conversions are wrong down the stack)
-  decoder = remote_call(gpu, encoder, MAX_WAIT_US);
+
+  const uint64_t MAX_WAIT_MS = 3000;
+  decoder = remote_call(gpu, encoder, MAX_WAIT_MS);
+
   if (!decoder) {
     FATAL("%s: failed to initialize the API remoting libraries. :/", __func__);
     return NULL;
@@ -447,7 +451,7 @@ struct vn_cs_decoder *
 remote_call(
   struct virtgpu *gpu,
   struct vn_cs_encoder *encoder,
-  uint64_t max_us
+  float max_wait_ms
   )
 {
   /*
@@ -483,18 +487,51 @@ remote_call(
   if (ret != 0) {
     FATAL("%s: the virtgpu EXECBUFFER ioctl failed (%d) :/ \n", ret);
   }
+
   /*
    * Wait for the response notification
    */
 
-  uint32_t total_us = 0;
+  start_timer(&wait_host_reply_timer);
+
+  struct timespec ts_start, ts_end;
+  clock_gettime(CLOCK_MONOTONIC, &ts_start);
+  long long start_time = (long long)ts_start.tv_sec * 1000000000LL + ts_start.tv_nsec;
+
+  bool timeout = false;
   while (std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire) == 0) {
     int64_t base_sleep_us = 15;
 
     os_time_sleep(base_sleep_us);
-    total_us += base_sleep_us;
-    if (max_us && total_us > max_us) {
-      WARNING("%s: timed out waiting for the API remoting answer...", __func__);
+
+    if (max_wait_ms) {  // a zero max_wait_ms disables the timeout check
+      clock_gettime(CLOCK_MONOTONIC, &ts_end);
+      long long end_time = (long long)ts_end.tv_sec * 1000000000LL + ts_end.tv_nsec;
+      float duration_ms = (float)((end_time - start_time) / 1e6);  // floating-point division keeps the sub-ms part
+
+      if (duration_ms > max_wait_ms) {
+        timeout = true;
+        break;
+      }
+    }
+  }
+
+  long long duration_ns = stop_timer(&wait_host_reply_timer);
+
+  if (max_wait_ms) {
+    double duration_ms = (double) duration_ns / 1e6;  // 1 millisecond = 1e6 nanoseconds
+    double duration_s  = (double) duration_ns / 1e9;  // 1 second = 1e9 nanoseconds
+
+    if (duration_s > 1) {
+      INFO("%s: waited %.2fs for the host reply...", __func__, duration_s);
+    } else if (duration_ms > 1) {
+      INFO("%s: waited %.2fms for the host reply...", __func__, duration_ms);
+    } else {
+      INFO("%s: waited %lldns for the host reply...", __func__, duration_ns);
+    }
+
+    if (timeout) {
+      ERROR("timeout waiting for the host answer...");
       return NULL;
     }
   }
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h
@@ -113,5 +113,5 @@ struct vn_cs_encoder *remote_call_prepare(
   struct virtgpu *gpu,
   int32_t cmd_type,
   int32_t cmd_flags);
-struct vn_cs_decoder *remote_call(struct virtgpu *gpu, struct vn_cs_encoder *enc, uint64_t max_ns);
+struct vn_cs_decoder *remote_call(struct virtgpu *gpu, struct vn_cs_encoder *enc, float max_wait_ms);  // max_wait_ms == 0 disables the timeout; returns NULL on timeout
 int32_t remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec);