YOLOv4 -> YOLOv7-tiny_Head

PINTO0309 · PINTO0309 · commit 0438411e8d97 · 2022-08-16T11:16:22.000+09:00
diff --git a/demo_video.py b/demo_video.py
@@ -2,10 +2,212 @@
 import cv2
 import time
 import math
+import copy
 import argparse
 import onnxruntime
 import numpy as np
 from math import cos, sin
+from typing import Tuple, Optional, List
+
+
+class YOLOv7ONNX(object):
+    def __init__(
+        self,
+        model_path: Optional[str] = 'yolov7_tiny_head_0.768_post_480x640.onnx',
+        class_score_th: Optional[float] = 0.20,
+        providers: Optional[List] = [
+            (
+                'TensorrtExecutionProvider', {
+                    'trt_engine_cache_enable': True,
+                    'trt_engine_cache_path': '.',
+                    'trt_fp16_enable': True,
+                }
+            ),
+            'CUDAExecutionProvider',
+            'CPUExecutionProvider',
+        ],
+    ):
+        """YOLOv7ONNX
+        Parameters
+        ----------
+        model_path: Optional[str]
+            ONNX file path for YOLOv7
+        class_score_th: Optional[float]
+        class_score_th: Optional[float]
+            Score threshold. Default: 0.30
+        providers: Optional[List]
+            Name of onnx execution providers
+            Default:
+            [
+                (
+                    'TensorrtExecutionProvider', {
+                        'trt_engine_cache_enable': True,
+                        'trt_engine_cache_path': '.',
+                        'trt_fp16_enable': True,
+                    }
+                ),
+                'CUDAExecutionProvider',
+                'CPUExecutionProvider',
+            ]
+        """
+        # Threshold
+        self.class_score_th = class_score_th
+
+        # Model loading
+        session_option = onnxruntime.SessionOptions()
+        session_option.log_severity_level = 3
+        self.onnx_session = onnxruntime.InferenceSession(
+            model_path,
+            sess_options=session_option,
+            providers=providers,
+        )
+        self.providers = self.onnx_session.get_providers()
+
+        self.input_shapes = [
+            input.shape for input in self.onnx_session.get_inputs()
+        ]
+        self.input_names = [
+            input.name for input in self.onnx_session.get_inputs()
+        ]
+        self.output_names = [
+            output.name for output in self.onnx_session.get_outputs()
+        ]
+
+
+    def __call__(
+        self,
+        image: np.ndarray,
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """YOLOv7ONNX
+        Parameters
+        ----------
+        image: np.ndarray
+            Entire image
+        Returns
+        -------
+        face_boxes: np.ndarray
+            Predicted face boxes: [facecount, y1, x1, y2, x2]
+        face_scores: np.ndarray
+            Predicted face box scores: [facecount, score]
+        """
+        temp_image = copy.deepcopy(image)
+
+        # PreProcess
+        resized_image = self.__preprocess(
+            temp_image,
+        )
+
+        # Inference
+        inferece_image = np.asarray([resized_image], dtype=np.float32)
+        scores, boxes = self.onnx_session.run(
+            self.output_names,
+            {input_name: inferece_image for input_name in self.input_names},
+        )
+
+        # PostProcess
+        face_boxes, face_scores = self.__postprocess(
+            image=temp_image,
+            scores=scores,
+            boxes=boxes,
+        )
+
+        return face_boxes, face_scores
+
+
+    def __preprocess(
+        self,
+        image: np.ndarray,
+        swap: Optional[Tuple[int,int,int]] = (2, 0, 1),
+    ) -> np.ndarray:
+        """__preprocess
+        Parameters
+        ----------
+        image: np.ndarray
+            Entire image
+        swap: tuple
+            HWC to CHW: (2,0,1)
+            CHW to HWC: (1,2,0)
+            HWC to HWC: (0,1,2)
+            CHW to CHW: (0,1,2)
+        Returns
+        -------
+        resized_image: np.ndarray
+            Resized and normalized image.
+        """
+        # Normalization + BGR->RGB
+        resized_image = cv2.resize(
+            image,
+            (
+                int(self.input_shapes[0][3]),
+                int(self.input_shapes[0][2]),
+            )
+        )
+        resized_image = np.divide(resized_image, 255.0)
+        resized_image = resized_image[..., ::-1]
+        resized_image = resized_image.transpose(swap)
+        resized_image = np.ascontiguousarray(
+            resized_image,
+            dtype=np.float32,
+        )
+        return resized_image
+
+
+    def __postprocess(
+        self,
+        image: np.ndarray,
+        scores: np.ndarray,
+        boxes: np.ndarray,
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """__postprocess
+        Parameters
+        ----------
+        image: np.ndarray
+            Entire image.
+        scores: np.ndarray
+            float32[N, 1]
+        boxes: np.ndarray
+            int64[N, 6]
+        Returns
+        -------
+        faceboxes: np.ndarray
+            Predicted face boxes: [facecount, y1, x1, y2, x2]
+        facescores: np.ndarray
+            Predicted face box confs: [facecount, score]
+        """
+        image_height = image.shape[0]
+        image_width = image.shape[1]
+
+        """
+        Head Detector is
+            N -> Number of boxes detected
+            batchno -> always 0: BatchNo.0
+            classid -> always 0: "Head"
+        scores: float32[N,1],
+        batchno_classid_y1x1y2x2: int64[N,6],
+        """
+        scores = scores
+        keep_idxs = scores[:, 0] > self.class_score_th
+        scores_keep = scores[keep_idxs, :]
+        boxes_keep = boxes[keep_idxs, :]
+        faceboxes = []
+        facescores = []
+
+        if len(boxes_keep) > 0:
+            for box, score in zip(boxes_keep, scores_keep):
+                x_min = max(int(box[3]), 0)
+                y_min = max(int(box[2]), 0)
+                x_max = min(int(box[5]), image_width)
+                y_max = min(int(box[4]), image_height)
+
+                faceboxes.append(
+                    [x_min, y_min, x_max, y_max]
+                )
+                facescores.append(
+                    score
+                )
+
+        return np.asarray(faceboxes), np.asarray(facescores)
+
 
 
 def draw_axis(img, yaw, pitch, roll, tdx=None, tdy=None, size=100):
@@ -39,24 +241,10 @@ def draw_axis(img, yaw, pitch, roll, tdx=None, tdy=None, size=100):
 
 
 def main(args):
-    # YOLOv4-Head
-    yolov4_head = onnxruntime.InferenceSession(
-        path_or_bytes=f'yolov4_headdetection_480x640_post.onnx',
-        providers=[
-            (
-                'TensorrtExecutionProvider', {
-                    'trt_engine_cache_enable': True,
-                    'trt_engine_cache_path': '.',
-                    'trt_fp16_enable': True,
-                }
-            ),
-            'CUDAExecutionProvider',
-            'CPUExecutionProvider',
-        ]
+    # YOLOv7_tiny_Head
+    yolov7_head = YOLOv7ONNX(
+        class_score_th=0.20,
     )
-    yolov4_head_input_name = yolov4_head.get_inputs()[0].name
-    yolov4_head_H = yolov4_head.get_inputs()[0].shape[2]
-    yolov4_head_W = yolov4_head.get_inputs()[0].shape[3]
 
     # DMHead
     model_file_path = ''
@@ -98,44 +286,34 @@ def main(args):
     cv2.namedWindow(WINDOWS_NAME, cv2.WINDOW_NORMAL)
     cv2.resizeWindow(WINDOWS_NAME, cap_width, cap_height)
 
-    while True:
-        start = time.time()
+    cap_fps = cap.get(cv2.CAP_PROP_FPS)
+    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
+    video_writer = cv2.VideoWriter(
+        filename='output.mp4',
+        fourcc=fourcc,
+        fps=cap_fps,
+        frameSize=(w, h),
+    )
 
+    while True:
         ret, frame = cap.read()
         if not ret:
-            continue
-
-        # ============================================================= YOLOv4
-        # Resize
-        resized_frame = cv2.resize(frame, (yolov4_head_W, yolov4_head_H))
-        # BGR to RGB
-        rgb = resized_frame[..., ::-1]
-        # HWC -> CHW
-        chw = rgb.transpose(2, 0, 1)
-        # normalize to [0, 1] interval
-        chw = np.asarray(chw / 255., dtype=np.float32)
-        # hwc --> nhwc
-        nchw = chw[np.newaxis, ...]
-        # Inference YOLOv4
-        heads = yolov4_head.run(
-            None,
-            input_feed = {yolov4_head_input_name: nchw}
-        )[0]
-
-        canvas = resized_frame.copy()
+            break
+
+        start = time.time()
+
+        # ============================================================= YOLOv7_tiny_Head
+        heads, head_scores = yolov7_head(frame)
+
+        canvas = copy.deepcopy(frame)
         # ============================================================= DMHead
         croped_resized_frame = None
-        scores = heads[:,4]
-        keep_idxs = scores > 0.6
-        heads = heads[keep_idxs, :]
 
         if len(heads) > 0:
             dmhead_inputs = []
             dmhead_position = []
-            heads[:, 0] = heads[:, 0] * cap_width
-            heads[:, 1] = heads[:, 1] * cap_height
-            heads[:, 2] = heads[:, 2] * cap_width
-            heads[:, 3] = heads[:, 3] * cap_height
 
             for head in heads:
                 x_min = int(head[0])
@@ -145,11 +323,11 @@ def main(args):
 
                 # enlarge the bbox to include more background margin
                 y_min = max(0, y_min - abs(y_min - y_max) / 10)
-                y_max = min(resized_frame.shape[0], y_max + abs(y_min - y_max) / 10)
+                y_max = min(frame.shape[0], y_max + abs(y_min - y_max) / 10)
                 x_min = max(0, x_min - abs(x_min - x_max) / 5)
-                x_max = min(resized_frame.shape[1], x_max + abs(x_min - x_max) / 5)
-                x_max = min(x_max, resized_frame.shape[1])
-                croped_frame = resized_frame[int(y_min):int(y_max), int(x_min):int(x_max)]
+                x_max = min(frame.shape[1], x_max + abs(x_min - x_max) / 5)
+                x_max = min(x_max, frame.shape[1])
+                croped_frame = frame[int(y_min):int(y_max), int(x_min):int(x_max)]
 
                 # h,w -> 224,224
                 croped_resized_frame = cv2.resize(croped_frame, (dmhead_W, dmhead_H))
@@ -227,35 +405,43 @@ def main(args):
                     1
                 )
 
-            time_txt = f'{(time.time()-start)*1000:.2f} ms'
-            cv2.putText(
-                canvas,
-                time_txt,
-                (20, 35),
-                cv2.FONT_HERSHEY_SIMPLEX,
-                1,
-                (255, 255, 255),
-                2,
-                cv2.LINE_AA,
-            )
-            cv2.putText(
-                canvas,
-                time_txt,
-                (20, 35),
-                cv2.FONT_HERSHEY_SIMPLEX,
-                1,
-                (0, 0, 255),
-                1,
-                cv2.LINE_AA,
-            )
+        time_txt = f'{(time.time()-start)*1000:.2f} ms (inference+post-process)'
+        cv2.putText(
+            canvas,
+            time_txt,
+            (20, 35),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            0.8,
+            (255, 255, 255),
+            2,
+            cv2.LINE_AA,
+        )
+        cv2.putText(
+            canvas,
+            time_txt,
+            (20, 35),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            0.8,
+            (0, 255, 0),
+            1,
+            cv2.LINE_AA,
+        )
 
         key = cv2.waitKey(1)
         if key == 27:  # ESC
             break
 
         cv2.imshow(WINDOWS_NAME, canvas)
+        video_writer.write(canvas)
+
     cv2.destroyAllWindows()
 
+    if video_writer:
+        video_writer.release()
+
+    if cap:
+        cap.release()
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -278,7 +464,9 @@ def main(args):
             'mask',
             'nomask',
         ],
-        help='Select either a model that provides high accuracy when wearing a mask or a model that provides high accuracy when not wearing a mask.',
+        help='\
+            Select either a model that provides high accuracy when wearing \
+            a mask or a model that provides high accuracy when not wearing a mask.',
     )
     args = parser.parse_args()
     main(args)