Skip to content

Commit 9606fb1

Browse files
committed
update benchmark tools
1 parent 832d253 commit 9606fb1

File tree

4 files changed

+40
-7
lines changed

4 files changed

+40
-7
lines changed

benchmarks/README.md

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,10 @@ python -m pip install -r requirements.txt
4141
--metric-percentiles 80,95,99,99.9,99.95,99.99:性能结果中展示的性能指标分位值
4242
--num-prompts 1:总计发送多少条请求
4343
--max-concurrency 1:压测并发数
44-
--save-result:开启结果保存,结果文件会存入json
44+
--save-result:开启结果保存,结果文件会存入json,默认False不保存
45+
--debug:开启debug模式,逐条打印payload和output内容,默认False
46+
--shuffle:是否打乱数据集,默认False不打乱
47+
--seed:打乱数据集时的随机种子,默认0
4548
```
4649

4750
##### /v1/chat/completions接口压测单条数据调试

benchmarks/backend_request_func.py

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,7 @@ class RequestFuncInput:
5050
multi_modal_content: Optional[dict] = None
5151
ignore_eos: bool = False
5252
language: Optional[str] = None
53+
debug: bool = False
5354

5455

5556
@dataclass
@@ -98,7 +99,8 @@ async def async_request_eb_openai_chat_completions(
9899
if request_func_input.ignore_eos:
99100
payload["ignore_eos"] = request_func_input.ignore_eos
100101

101-
print(f"payload:{json.dumps(payload, ensure_ascii=False)}")
102+
if request_func_input.debug:
103+
print(f"payload:{json.dumps(payload, ensure_ascii=False)}")
102104

103105
headers = {
104106
"Content-Type": "application/json",
@@ -179,7 +181,8 @@ async def async_request_eb_openai_chat_completions(
179181
f.write(str(output) + "\n")
180182
if pbar:
181183
pbar.update(1)
182-
print("#####final_output:", output)
184+
if request_func_input.debug:
185+
print("#####final_output:", output)
183186
return output
184187

185188

@@ -209,7 +212,8 @@ async def async_request_eb_openai_completions(
209212
if request_func_input.ignore_eos:
210213
payload["ignore_eos"] = request_func_input.ignore_eos
211214

212-
print("payload:", json.dumps(payload, ensure_ascii=False))
215+
if request_func_input.debug:
216+
print("payload:", json.dumps(payload, ensure_ascii=False))
213217

214218
headers = {
215219
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
@@ -288,7 +292,8 @@ async def async_request_eb_openai_completions(
288292
exc_info = sys.exc_info()
289293
output.error = "".join(traceback.format_exception(*exc_info))
290294

291-
print(f"final_output:{output}")
295+
if request_func_input.debug:
296+
print(f"final_output:{output}")
292297

293298
if pbar:
294299
pbar.update(1)

benchmarks/benchmark_dataset.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,7 @@ def __init__(
5757
self,
5858
dataset_path: Optional[str] = None,
5959
random_seed: int = DEFAULT_SEED,
60+
shuffle: bool = False,
6061
hyperparameter_path: Optional[str] = None,
6162
) -> None:
6263
"""
@@ -72,6 +73,7 @@ def __init__(
7273
# default seed.
7374
self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
7475
self.data = None
76+
self.shuffle = shuffle
7577
self.hyperparameter_path = hyperparameter_path
7678
self.hyperparameters = {}
7779

@@ -211,6 +213,10 @@ def load_data(self) -> None:
211213
with open(self.dataset_path, encoding="utf-8") as f:
212214
self.data = [json.loads(i.strip()) for i in f.readlines()]
213215

216+
if self.shuffle:
217+
random.seed(self.random_seed)
218+
random.shuffle(self.data)
219+
214220
def sample(
215221
self,
216222
num_requests: int,
@@ -270,6 +276,10 @@ def load_data(self) -> None:
270276
with open(self.dataset_path, encoding="utf-8") as f:
271277
self.data = [json.loads(i.strip()) for i in f.readlines()]
272278

279+
if self.shuffle:
280+
random.seed(self.random_seed)
281+
random.shuffle(self.data)
282+
273283
def sample(
274284
self,
275285
num_requests: int,

benchmarks/benchmark_serving.py

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -317,6 +317,7 @@ async def benchmark(
317317
selected_percentile_metrics: list[str],
318318
selected_percentiles: list[float],
319319
ignore_eos: bool,
320+
debug: bool,
320321
goodput_config_dict: dict[str, float],
321322
max_concurrency: Optional[int],
322323
lora_modules: Optional[Iterable[str]],
@@ -348,6 +349,7 @@ async def benchmark(
348349
output_len=test_output_len,
349350
logprobs=logprobs,
350351
ignore_eos=ignore_eos,
352+
debug=debug,
351353
extra_body=extra_body,
352354
)
353355

@@ -435,6 +437,7 @@ async def limited_request_func(request_func_input, pbar):
435437
api_url=api_url,
436438
output_len=output_len,
437439
logprobs=logprobs,
440+
debug=debug,
438441
ignore_eos=ignore_eos,
439442
extra_body=extra_body,
440443
)
@@ -819,11 +822,12 @@ def main(args: argparse.Namespace):
819822

820823
# For datasets that follow a similar structure, use a mapping.
821824
dataset_mapping = {
822-
"EB": lambda: EBDataset(random_seed=args.seed, dataset_path=args.dataset_path).sample(
825+
"EB": lambda: EBDataset(random_seed=args.seed, dataset_path=args.dataset_path, shuffle=args.shuffle).sample(
823826
num_requests=args.num_prompts,
824827
output_len=args.sharegpt_output_len,
825828
),
826-
"EBChat": lambda: EBChatDataset(random_seed=args.seed, dataset_path=args.dataset_path).sample(
829+
"EBChat": lambda: EBChatDataset(random_seed=args.seed, dataset_path=args.dataset_path,
830+
shuffle=args.shuffle).sample(
827831
num_requests=args.num_prompts,
828832
output_len=args.sharegpt_output_len,
829833
),
@@ -883,6 +887,7 @@ def main(args: argparse.Namespace):
883887
selected_percentile_metrics=args.percentile_metrics.split(","),
884888
selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
885889
ignore_eos=args.ignore_eos,
890+
debug=args.debug,
886891
goodput_config_dict=goodput_config_dict,
887892
max_concurrency=args.max_concurrency,
888893
lora_modules=args.lora_modules,
@@ -1071,6 +1076,11 @@ def main(args: argparse.Namespace):
10711076
"results in a more uniform arrival of requests.",
10721077
)
10731078
parser.add_argument("--seed", type=int, default=0)
1079+
parser.add_argument(
1080+
"--shuffle",
1081+
action="store_true",
1082+
help="shuffle dataset",
1083+
)
10741084
parser.add_argument(
10751085
"--trust-remote-code",
10761086
action="store_true",
@@ -1091,6 +1101,11 @@ def main(args: argparse.Namespace):
10911101
action="store_true",
10921102
help="Specify to save benchmark results to a json file",
10931103
)
1104+
parser.add_argument(
1105+
"--debug",
1106+
action="store_true",
1107+
help="print debug information (output)",
1108+
)
10941109
parser.add_argument(
10951110
"--save-detailed",
10961111
action="store_true",

0 commit comments

Comments
 (0)