
[Feature] Models api #3073

Open: wants to merge 38 commits into base: develop. The diff below shows changes from 9 of the 38 commits.

Commits (38):
1c2e05a  add v1/models interface related (Yzc216, Jul 29, 2025)
f421568  add model parameters (Yzc216, Jul 29, 2025)
5446d4a  default model verification (Yzc216, Jul 29, 2025)
a57341c  unit test (Yzc216, Jul 29, 2025)
81ce789  check model err_msg (Yzc216, Jul 29, 2025)
2cba65d  unit test (Yzc216, Jul 30, 2025)
a1fe0fc  Merge branch 'develop' into modelAPI (Yzc216, Jul 30, 2025)
0cf44da  type annotation (Yzc216, Jul 30, 2025)
a214565  model parameter in response (Yzc216, Jul 30, 2025)
bba5c0e  modify document description (Yzc216, Jul 30, 2025)
ca08ee1  modify document description (Yzc216, Jul 30, 2025)
5fb3957  Merge branch 'develop' into modelAPI (Yzc216, Jul 30, 2025)
be13064  Merge remote-tracking branch 'upstream/develop' into modelAPI (Yzc216, Aug 6, 2025)
b775512  Merge branch 'develop' into modelAPI (Yzc216, Aug 6, 2025)
c1039f4  Merge branch 'develop' into modelAPI (Yzc216, Aug 7, 2025)
08ef40f  unit test (Yzc216, Aug 7, 2025)
7a5b686  Merge branch 'develop' into modelAPI (Yzc216, Aug 7, 2025)
8653948  Merge branch 'develop' into modelAPI (Yzc216, Aug 7, 2025)
a1090d6  Merge branch 'develop' into modelAPI (Yzc216, Aug 7, 2025)
eaea619  Merge remote-tracking branch 'upstream/develop' into modelAPI (Yzc216, Aug 11, 2025)
5ad7d0f  verification (Yzc216, Aug 11, 2025)
99515ef  Merge branch 'develop' into modelAPI (Yzc216, Aug 11, 2025)
748abc4  Merge branch 'develop' into modelAPI (Yzc216, Aug 11, 2025)
ce47277  verification update (Yzc216, Aug 11, 2025)
e2940c0  model_name (Yzc216, Aug 11, 2025)
e5f4890  Merge branch 'develop' into modelAPI (Yzc216, Aug 11, 2025)
24984ee  Merge branch 'develop' into modelAPI (Yzc216, Aug 11, 2025)
bcf252e  Merge branch 'develop' into modelAPI (Yzc216, Aug 11, 2025)
2205ec3  Merge branch 'develop' into modelAPI (Yzc216, Aug 12, 2025)
2cc63c1  Merge branch 'develop' into modelAPI (Yzc216, Aug 12, 2025)
4359200  Merge branch 'develop' into modelAPI (Yzc216, Aug 14, 2025)
ac1de94  Merge branch 'develop' into modelAPI (Yzc216, Aug 15, 2025)
25e2250  Merge branch 'develop' into modelAPI (LiqinruiG, Aug 19, 2025)
8a282da  Merge branch 'develop' into modelAPI (LiqinruiG, Aug 19, 2025)
7c51a55  Merge branch 'develop' into modelAPI (LiqinruiG, Aug 19, 2025)
ca68e24  pre-commit (LiqinruiG, Aug 19, 2025)
6472ef7  update test case (LiqinruiG, Aug 19, 2025)
ab550e7  resolve conflict (LiqinruiG, Aug 20, 2025)
10 changes: 10 additions & 0 deletions fastdeploy/engine/args_utils.py
@@ -46,6 +46,10 @@ class EngineArgs:
"""
The name or path of the model to be used.
"""
served_model_name: Optional[str] = None
"""
The name of the model being served.
"""
revision: Optional[str] = "master"
"""
The revision for downloading models.
@@ -344,6 +348,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
default=EngineArgs.model,
help="Model name or path to be used.",
)
model_group.add_argument(
"--served-model-name",
type=nullable_str,
default=EngineArgs.served_model_name,
help="Served model name",
)
model_group.add_argument(
"--revision",
type=nullable_str,
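
For context, a minimal sketch of how the new flag behaves when parsed through `EngineArgs` — the import path for `FlexibleArgumentParser` is an assumption here, not confirmed by this diff:

```python
# Sketch: exercising the new --served-model-name flag (assumes
# FlexibleArgumentParser lives in fastdeploy.utils; adjust if it does not).
from fastdeploy.engine.args_utils import EngineArgs
from fastdeploy.utils import FlexibleArgumentParser

parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
args = parser.parse_args(
    ["--model", "baidu/ERNIE-4.5-0.3B-PT", "--served-model-name", "ernie-lite"]
)
print(args.served_model_name)  # -> "ernie-lite"; stays None when the flag is omitted
```

When the flag is omitted, the server falls back to `args.model` as the public name (see the `lifespan` change in api_server.py below).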
47 changes: 45 additions & 2 deletions fastdeploy/entrypoints/openai/api_server.py
@@ -36,9 +36,11 @@
CompletionResponse,
ControlSchedulerRequest,
ErrorResponse,
ModelList,
)
from fastdeploy.entrypoints.openai.serving_chat import OpenAIServingChat
from fastdeploy.entrypoints.openai.serving_completion import OpenAIServingCompletion
from fastdeploy.entrypoints.openai.serving_models import ModelPath, OpenAIServingModels
from fastdeploy.metrics.metrics import (
EXCLUDE_LABELS,
cleanup_prometheus_files,
@@ -104,6 +106,13 @@ async def lifespan(app: FastAPI):
else:
pid = os.getpid()
api_server_logger.info(f"{pid}")

if args.served_model_name is not None:
served_model_names = args.served_model_name
else:
served_model_names = args.model
model_paths = [ModelPath(name=served_model_names, model_path=args.model)]

engine_client = EngineClient(
args.tokenizer,
args.max_model_len,
@@ -116,8 +125,25 @@
args.data_parallel_size,
)
app.state.dynamic_load_weight = args.dynamic_load_weight
chat_handler = OpenAIServingChat(engine_client, pid, args.ips)
completion_handler = OpenAIServingCompletion(engine_client, pid, args.ips)
model_handler = OpenAIServingModels(
engine_client,
model_paths,
args.max_model_len,
args.ips,
)
app.state.model_handler = model_handler
chat_handler = OpenAIServingChat(
engine_client,
app.state.model_handler,
pid,
args.ips,
)
completion_handler = OpenAIServingCompletion(
engine_client,
app.state.model_handler,
pid,
args.ips,
)
engine_client.create_zmq_client(model=pid, mode=zmq.PUSH)
engine_client.pid = pid
app.state.engine_client = engine_client
@@ -232,6 +258,23 @@ async def create_completion(request: CompletionRequest):
return StreamingResponse(content=generator, media_type="text/event-stream")


@app.get("/v1/models")
async def list_models() -> Response:
"""
List all available models.
"""
if app.state.dynamic_load_weight:
status, msg = app.state.engine_client.is_workers_alive()
if not status:
return JSONResponse(content={"error": "Worker Service Not Healthy"}, status_code=304)

models = await app.state.model_handler.list_models()
if isinstance(models, ErrorResponse):
return JSONResponse(content=models.model_dump(), status_code=models.code)
elif isinstance(models, ModelList):
return JSONResponse(content=models.model_dump())


@app.get("/update_model_weight")
def update_model_weight(request: Request) -> Response:
"""
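
A quick client-side sketch of the new endpoint; the host and port are illustrative, and `requests` is used only for the example:

```python
import requests  # third-party HTTP client, used here purely for illustration

resp = requests.get("http://localhost:8000/v1/models")
resp.raise_for_status()
for model in resp.json()["data"]:
    # Each entry follows the ModelInfo schema added in protocol.py below.
    print(model["id"], model["max_model_len"], model["root"])
```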
32 changes: 32 additions & 0 deletions fastdeploy/entrypoints/openai/protocol.py
@@ -18,6 +18,7 @@

import json
import time
import uuid
from typing import Any, List, Literal, Optional, Union

from pydantic import BaseModel, Field, model_validator
@@ -55,6 +56,37 @@ class UsageInfo(BaseModel):
prompt_tokens_details: Optional[PromptTokenUsageInfo] = None


class ModelPermission(BaseModel):
id: str = Field(default_factory=lambda: f"modelperm-{uuid.uuid4().hex}")
object: str = "model_permission"
created: int = Field(default_factory=lambda: int(time.time()))
allow_create_engine: bool = False
allow_sampling: bool = True
allow_logprobs: bool = True
allow_search_indices: bool = False
allow_view: bool = True
allow_fine_tuning: bool = False
organization: str = "*"
group: Optional[str] = None
is_blocking: bool = False


class ModelInfo(BaseModel):
id: str
object: str = "model"
created: int = Field(default_factory=lambda: int(time.time()))
owned_by: str = "FastDeploy"
root: Optional[str] = None
parent: Optional[str] = None
max_model_len: Optional[int] = None
permission: list[ModelPermission] = Field(default_factory=list)


class ModelList(BaseModel):
object: str = "list"
data: list[ModelInfo] = Field(default_factory=list)


class FunctionCall(BaseModel):
"""
Function call.
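
To make the wire format concrete, a hedged sketch of what these Pydantic models serialize to (all values illustrative):

```python
from fastdeploy.entrypoints.openai.protocol import (
    ModelInfo,
    ModelList,
    ModelPermission,
)

models = ModelList(
    data=[
        ModelInfo(
            id="baidu/ERNIE-4.5-0.3B-PT",
            max_model_len=2048,
            root="/path/to/model",  # hypothetical local path
            permission=[ModelPermission()],
        )
    ]
)
# Yields roughly: {"object": "list", "data": [{"id": ..., "object": "model",
#   "created": <unix ts>, "owned_by": "FastDeploy", "max_model_len": 2048, ...}]}
print(models.model_dump_json())
```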
19 changes: 17 additions & 2 deletions fastdeploy/entrypoints/openai/serving_chat.py
@@ -18,13 +18,14 @@
import time
import traceback
import uuid
from typing import List, Optional
from typing import List, Optional, Union

import aiozmq
import msgpack
import numpy as np
from aiozmq import zmq

from fastdeploy.entrypoints.engine_client import EngineClient
from fastdeploy.entrypoints.openai.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
@@ -39,6 +40,7 @@
PromptTokenUsageInfo,
UsageInfo,
)
from fastdeploy.entrypoints.openai.serving_models import OpenAIServingModels
from fastdeploy.metrics.work_metrics import work_process_metrics
from fastdeploy.utils import api_server_logger, get_host_ip
from fastdeploy.worker.output import LogprobsLists
@@ -49,8 +51,15 @@ class OpenAIServingChat:
OpenAI-style chat completions serving
"""

def __init__(self, engine_client, pid, ips):
def __init__(
self,
engine_client: EngineClient,
models: OpenAIServingModels,
pid: int,
ips: Union[List[str], str],
):
self.engine_client = engine_client
self.models = models
self.pid = pid
self.master_ip = ips
self.host_ip = get_host_ip()
@@ -76,6 +85,12 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
err_msg = f"Only master node can accept completion request, please send request to master node: {self.pod_ips[0]}"
api_server_logger.error(err_msg)
return ErrorResponse(message=err_msg, code=400)
if request.model == "default":
request.model = self.models.model_name()
if not self.models.is_supported_model(request.model):
err_msg = f"Unsupported model: {request.model}, support {', '.join([x.name for x in self.models.model_paths])} or default"
api_server_logger.error(err_msg)
return ErrorResponse(message=err_msg, code=400)
if request.user is not None:
request_id = f"chatcmpl-{request.user}-{uuid.uuid4()}"
else:
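
Client-side, the `default` alias added above means a caller does not need to know the configured served name. A hedged sketch, with endpoint URL assumed:

```python
import requests

payload = {
    "model": "default",  # rewritten server-side to models.model_name()
    "messages": [{"role": "user", "content": "Hello"}],
}
resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload)
print(resp.json())
```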
19 changes: 17 additions & 2 deletions fastdeploy/entrypoints/openai/serving_completion.py
@@ -17,14 +17,15 @@
import asyncio
import time
import uuid
from typing import List
from typing import List, Union

import aiozmq
import msgpack
import numpy as np
from aiozmq import zmq

from fastdeploy.engine.request import RequestOutput
from fastdeploy.entrypoints.engine_client import EngineClient
from fastdeploy.entrypoints.openai.protocol import (
CompletionRequest,
CompletionResponse,
@@ -34,12 +35,20 @@
ErrorResponse,
UsageInfo,
)
from fastdeploy.entrypoints.openai.serving_models import OpenAIServingModels
from fastdeploy.utils import api_server_logger, get_host_ip


class OpenAIServingCompletion:
def __init__(self, engine_client, pid, ips):
def __init__(
self,
engine_client: EngineClient,
models: OpenAIServingModels,
pid: int,
ips: Union[List[str], str],
):
self.engine_client = engine_client
self.models = models
self.pid = pid
self.master_ip = ips
self.host_ip = get_host_ip()
@@ -64,6 +73,12 @@ async def create_completion(self, request: CompletionRequest):
err_msg = f"Only master node can accept completion request, please send request to master node: {self.pod_ips[0]}"
api_server_logger.error(err_msg)
return ErrorResponse(message=err_msg, code=400)
if request.model == "default":
request.model = self.models.model_name()
if not self.models.is_supported_model(request.model):
err_msg = f"Unsupported model: {request.model}, support {', '.join([x.name for x in self.models.model_paths])} or default"
api_server_logger.error(err_msg)
return ErrorResponse(message=err_msg, code=400)
created_time = int(time.time())
if request.user is not None:
request_id = f"cmpl-{request.user}-{uuid.uuid4()}"
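
The same alias resolution applies to legacy completions. A sketch, with the URL and request fields illustrative:

```python
import requests

payload = {"model": "default", "prompt": "Once upon a time", "max_tokens": 16}
resp = requests.post("http://localhost:8000/v1/completions", json=payload)
print(resp.json())
```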
96 changes: 96 additions & 0 deletions fastdeploy/entrypoints/openai/serving_models.py
@@ -0,0 +1,96 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from dataclasses import dataclass
from typing import List, Union

from fastdeploy.entrypoints.engine_client import EngineClient
from fastdeploy.entrypoints.openai.protocol import (
ErrorResponse,
ModelInfo,
ModelList,
ModelPermission,
)
from fastdeploy.utils import api_server_logger, get_host_ip


@dataclass
class ModelPath:
name: str
model_path: str


class OpenAIServingModels:
"""
OpenAI-style models serving
"""

def __init__(
self,
engine_client: EngineClient,
model_paths: list[ModelPath],
max_model_len: int,
ips: Union[List[str], str],
):
self.engine_client = engine_client
self.model_paths = model_paths
self.max_model_len = max_model_len
self.master_ip = ips
self.host_ip = get_host_ip()
if self.master_ip is not None:
if isinstance(self.master_ip, list):
self.master_ip = self.master_ip[0]
else:
self.master_ip = self.master_ip.split(",")[0]

def _check_master(self):
if self.master_ip is None:
return True
if self.host_ip == self.master_ip:
return True
return False

def is_supported_model(self, model_name) -> bool:
"""
Check whether the specified model is supported.
"""
if model_name == "default":
return True
return any(model.name == model_name for model in self.model_paths)

def model_name(self) -> str:
"""
Returns the current model name.
"""
return self.model_paths[0].name

async def list_models(self) -> Union[ErrorResponse, ModelList]:
"""
Show available models.
"""
if not self._check_master():
err_msg = (
f"Only master node can accept models request, please send request to master node: {self.pod_ips[0]}"
)
api_server_logger.error(err_msg)
return ErrorResponse(message=err_msg, code=400)
model_infos = [
ModelInfo(
id=model.name, max_model_len=self.max_model_len, root=model.model_path, permission=[ModelPermission()]
)
for model in self.model_paths
]
return ModelList(data=model_infos)
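
A behavior sketch for the helpers above; the `None` arguments are placeholders for brevity, not how the server wires it up:

```python
from fastdeploy.entrypoints.openai.serving_models import ModelPath, OpenAIServingModels

paths = [ModelPath(name="ernie-lite", model_path="/models/ernie")]  # hypothetical
serving = OpenAIServingModels(engine_client=None, model_paths=paths,
                              max_model_len=2048, ips=None)

assert serving.is_supported_model("default")     # alias is always accepted
assert serving.is_supported_model("ernie-lite")  # exact served name matches
assert not serving.is_supported_model("gpt-4")   # anything else is rejected
assert serving.model_name() == "ernie-lite"      # first configured entry wins
```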
44 changes: 44 additions & 0 deletions test/entrypoints/openai/test_serving_models.py
@@ -0,0 +1,44 @@
from unittest.mock import MagicMock

import pytest

from fastdeploy.entrypoints.engine_client import EngineClient
from fastdeploy.entrypoints.openai.protocol import ModelInfo, ModelList
from fastdeploy.entrypoints.openai.serving_models import ModelPath, OpenAIServingModels
from fastdeploy.utils import get_host_ip

MODEL_NAME = "baidu/ERNIE-4.5-0.3B-PT"
MODEL_PATHS = [ModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
MAX_MODEL_LEN = 2048


async def _async_serving_models_init() -> OpenAIServingModels:
mock_engine_client = MagicMock(spec=EngineClient)

serving_models = OpenAIServingModels(
engine_client=mock_engine_client,
model_paths=MODEL_PATHS,
max_model_len=MAX_MODEL_LEN,
ips=get_host_ip(),
)

return serving_models


@pytest.mark.asyncio
async def test_serving_model_name():
serving_models = await _async_serving_models_init()
assert serving_models.model_name() == MODEL_NAME


@pytest.mark.asyncio
async def test_list_models():
serving_models = await _async_serving_models_init()
result = await serving_models.list_models()
assert isinstance(result, ModelList)
assert isinstance(result.data[0], ModelInfo)
assert result.object == "list"
assert len(result.data) == 1
assert result.data[0].id == MODEL_NAME
assert result.data[0].max_model_len == MAX_MODEL_LEN
assert result.data[0].root == MODEL_PATHS[0].model_path