[Performance] Dynamic Batch Tokenizer #9382

Open: wants to merge 3 commits into base: main

671 changes: 671 additions & 0 deletions benchmark/api/bench_common.py

Large diffs are not rendered by default.
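
bench_common.py is too large to be rendered in the diff view, so the helpers it provides are only visible through how the two scripts below call them: BenchmarkConfig, generate_text_with_token_count, run_generic_benchmark, and run_benchmark_main. As a rough orientation, here is a minimal sketch of two of those helpers, exact-length text generation and request pacing for the POISSON and constant distributions. The function bodies (and the helper name next_request_delay) are assumptions inferred from the call sites, not the actual bench_common.py implementation.

# Sketch only: inferred from the call sites in bench_embeddings.py and
# bench_score.py; not the actual bench_common.py code.
import random

from transformers import AutoTokenizer


def generate_text_with_token_count(model_path: str, num_tokens: int, special_token: str) -> str:
    """Build a string that encodes to exactly num_tokens tokens by replicating
    a special token that is known to encode to a single token id."""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    per_copy = len(tokenizer.encode(special_token, add_special_tokens=False))
    assert per_copy == 1, f"{special_token!r} must encode to exactly one token"
    return special_token * num_tokens


def next_request_delay(rps: float, distribution: str) -> float:
    """Seconds to wait before issuing the next request: POISSON draws
    exponential inter-arrival times with mean 1/rps, otherwise requests
    are spaced evenly at 1/rps."""
    if distribution == "POISSON":
        return random.expovariate(rps)
    return 1.0 / rps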

122 changes: 122 additions & 0 deletions benchmark/api/bench_embeddings.py
@@ -0,0 +1,122 @@
"""
SGLang Embeddings Benchmark Script

This script benchmarks SGLang's /v1/embeddings API performance using HTTP requests.

Features:
- HTTP-only implementation
- Uses /v1/embeddings API endpoint directly
- Configurable RPS, duration, and batch sizes
- Progress tracking and detailed metrics
- Poisson and constant request distributions

Usage:
- Update configuration variables at the top of the file
- Ensure SGLang server is running on the configured HTTP_URL
- Run: python bench_embeddings.py
"""

import asyncio
import logging

from bench_common import (
BenchmarkConfig,
generate_text_with_token_count,
run_benchmark_main,
run_generic_benchmark,
)

# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

###############################################################################
# CONFIG
###############################################################################
# Create benchmark configuration
config = BenchmarkConfig()
config.rps_values = [500]
config.duration_secs_values = [60]
config.num_unique_requests = 100
config.distribution = "POISSON"
config.profile = False

# HTTP Configuration
HTTP_URL = "http://localhost:30000/v1/embeddings"

# Embeddings API Config
EMBEDDINGS_MODEL_PATH = "/shared/public/sharing/suramach/Qwen3-0.6B"  # local checkpoint; replace with your model path or HF id
BATCH_SIZE = [1]  # Batch sizes to sweep: number of input items per request

# Configurable input token length
EMBEDDINGS_INPUT_TOKENS = 500 # Default token length

# Generate input text with the specified token length
EMBEDDINGS_INPUT_TEXT = generate_text_with_token_count(
EMBEDDINGS_MODEL_PATH, EMBEDDINGS_INPUT_TOKENS, config.special_replicated_token
)


###############################################################################
# REQUEST GENERATION (in parallel)
###############################################################################
def build_embeddings_request(index: int, item_count: int) -> tuple:
"""Build a single embeddings request."""
try:
# For embeddings, input can be a string or list of strings
if item_count == 1:
input_data = EMBEDDINGS_INPUT_TEXT
else:
input_data = [EMBEDDINGS_INPUT_TEXT for _ in range(item_count)]
req = {
"input": input_data,
"model": EMBEDDINGS_MODEL_PATH,
}
return (index, req)
except Exception as e:
logger.error(f"Error building request {index}: {e}")
return (index, None)


def validate_embeddings_response(response_data: dict) -> bool:
"""Validate embeddings API response."""
return "data" in response_data


###############################################################################
# MAIN
###############################################################################
async def run_benchmark(rps, duration_secs, item_count):
"""Run a single embeddings benchmark with the given RPS value."""
return await run_generic_benchmark(
rps=rps,
duration_secs=duration_secs,
item_count=item_count,
config=config,
http_url=HTTP_URL,
build_request_func=build_embeddings_request,
response_validator=validate_embeddings_response,
api_name="EMBEDDINGS",
request_description="embeddings requests",
)


async def main():
additional_info = {
"Input text length": f"{EMBEDDINGS_INPUT_TOKENS} tokens",
"Input text preview": (
EMBEDDINGS_INPUT_TEXT[:100] + "..."
if len(EMBEDDINGS_INPUT_TEXT) > 100
else EMBEDDINGS_INPUT_TEXT
),
}

await run_benchmark_main(
config, run_benchmark, "EMBEDDINGS", HTTP_URL, BATCH_SIZE, additional_info
)


if __name__ == "__main__":
asyncio.run(main())
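
For a quick sanity check before a full embeddings run, the same request shape the script builds can be sent by hand. A minimal sketch with the requests library follows; the server URL and model path are taken from the config above, while the data[0]["embedding"] access assumes SGLang's OpenAI-compatible embeddings response schema, so verify it against your server version.

# Manual smoke test for the /v1/embeddings endpoint exercised by this script.
# Uses a plain string input; the benchmark itself generates text with an exact
# token count via generate_text_with_token_count.
import requests

payload = {
    "input": "The quick brown fox jumps over the lazy dog.",
    "model": "/shared/public/sharing/suramach/Qwen3-0.6B",
}
resp = requests.post("http://localhost:30000/v1/embeddings", json=payload, timeout=30)
resp.raise_for_status()
body = resp.json()
print("items returned:", len(body["data"]))
print("embedding dim:", len(body["data"][0]["embedding"]))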
157 changes: 157 additions & 0 deletions benchmark/api/bench_score.py
@@ -0,0 +1,157 @@
"""
SGLang Scoring Benchmark Script

This script benchmarks SGLang's scoring API performance using HTTP requests.

Current Features:
- HTTP-only implementation (open source compatible)
- Uses /v1/score API endpoint directly
- Single item scoring with batching support
- Configurable RPS, duration, and batch sizes
- Progress tracking and detailed metrics
- Poisson and constant request distributions

Usage:
- Update configuration variables at the top of the file
- Ensure SGLang server is running on the configured HTTP_URL
- Run: python bench_score.py
- Each request contains the configured number of items (see ITEM_COUNT_VALUES) for batch scoring

"""

import asyncio
import os

from bench_common import (
BenchmarkConfig,
generate_text_with_token_count,
run_benchmark_main,
run_generic_benchmark,
)
from transformers import AutoTokenizer

###############################################################################
# CONFIG
###############################################################################
# Create benchmark configuration
config = BenchmarkConfig()
config.rps_values = [70]
config.duration_secs_values = [60]
config.num_unique_requests = 100
config.distribution = "POISSON"
config.profile = False

# HTTP Configuration
HTTP_URL = "http://localhost:30000/v1/score" # Use score API directly

# Score API Config
# ITEM_COUNT_VALUES determines number of items per score request (batch size)
SCORE_QUERY_TOKENS = 120
SCORE_ITEM_TOKENS = 180
SCORE_MODEL_PATH = "Qwen/Qwen3-0.6B"
SCORE_LABEL_TOKEN_IDS = [9454, 2753] # Yes/No token IDs
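# If the model changes, the Yes/No label token ids can be re-derived from its
# tokenizer (illustrative sketch; ids differ between tokenizers, and a
# leading-space variant such as " Yes" may encode differently):
#   tok = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)
#   yes_id = tok.encode("Yes", add_special_tokens=False)
#   no_id = tok.encode("No", add_special_tokens=False)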
ITEM_COUNT_VALUES = [10] # Number of items per request
# Directory for profiler output
SGLANG_TORCH_PROFILER_DIR = "/shared/user/sglang-oss-trace/remove-decode"
if config.profile:
os.environ["SGLANG_TORCH_PROFILER_DIR"] = SGLANG_TORCH_PROFILER_DIR

# Special token to replicate for precise token counting; assign it to the shared
# config so the request builders below pick up this exact value
SPECIAL_REPLICATED_TOKEN = "<|im_start|>"
config.special_replicated_token = SPECIAL_REPLICATED_TOKEN


###############################################################################
# REQUEST GENERATION (in parallel)
###############################################################################
def create_score_request_builder():
"""Create a score request builder function with shared tokenizer."""
# Load tokenizer once here to verify special token and get precise counts
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)

# Verify that our special token produces exactly 1 token
special_token_count = len(
tokenizer.encode(config.special_replicated_token, add_special_tokens=False)
)
print(
f"Special token '{config.special_replicated_token}' produces "
f"{special_token_count} token(s)"
)

def generate_text_with_token_count_local(num_toks):
"""Generate text with precise token count using replicated token."""
return generate_text_with_token_count(
SCORE_MODEL_PATH, num_toks, config.special_replicated_token
)

def build_score_request(index: int, item_count: int) -> tuple:
"""Build a single score request."""
try:
# Generate query and items for score API
query = generate_text_with_token_count_local(SCORE_QUERY_TOKENS)
items = [
generate_text_with_token_count_local(SCORE_ITEM_TOKENS)
for _ in range(item_count)
]

# Return as dict for score API format
score_data = {
"query": query,
"items": items,
"label_token_ids": SCORE_LABEL_TOKEN_IDS,
"model": SCORE_MODEL_PATH,
}
return (index, score_data)

except Exception as e:
print(f"Error building request {index}: {e}")
return (index, None)

return build_score_request


def validate_score_response(response_data: dict) -> bool:
"""Validate score API response."""
return "scores" in response_data or "logprobs" in response_data


###############################################################################
# MAIN
###############################################################################
async def run_benchmark(rps, duration_secs, item_count):
"""Run a single benchmark with the given RPS value."""
# Create the request builder function with shared tokenizer
build_request_func = create_score_request_builder()

return await run_generic_benchmark(
rps=rps,
duration_secs=duration_secs,
item_count=item_count,
config=config,
http_url=HTTP_URL,
build_request_func=build_request_func,
response_validator=validate_score_response,
api_name="SINGLE_ITEM_SCORING",
request_description="score requests",
)


async def main():
"""Main function that runs benchmarks for all RPS values."""
additional_info = {
"Query tokens per request": SCORE_QUERY_TOKENS,
"Item tokens per item": SCORE_ITEM_TOKENS,
}

await run_benchmark_main(
config,
run_benchmark,
"SINGLE_ITEM_SCORING",
HTTP_URL,
ITEM_COUNT_VALUES,
additional_info,
)


if __name__ == "__main__":
asyncio.run(main())
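
As with the embeddings script, a single scoring request can be sent manually to confirm the server, model path, and label token ids before a long run. The sketch below mirrors the payload built by build_score_request; reading a top-level "scores" field is an assumption consistent with validate_score_response, which accepts either "scores" or "logprobs".

# Manual smoke test for the /v1/score endpoint exercised by this script.
# Short literal strings stand in for the exact-length text the benchmark builds.
import requests

payload = {
    "query": "Is the capital of France Paris?",
    "items": ["Paris is the capital of France.", "Berlin is the capital of France."],
    "label_token_ids": [9454, 2753],  # Yes/No token ids from the config above
    "model": "Qwen/Qwen3-0.6B",
}
resp = requests.post("http://localhost:30000/v1/score", json=payload, timeout=30)
resp.raise_for_status()
body = resp.json()
print(body.get("scores", body))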