diff --git a/docs/parameters.md b/docs/parameters.md
index 81f582335c..fb5d630395 100644
--- a/docs/parameters.md
+++ b/docs/parameters.md
@@ -46,6 +46,8 @@ When using FastDeploy to deploy models (including offline inference and service
 | ```dynamic_load_weight``` | `int` | Whether to enable dynamic weight loading, default: 0 |
 | ```enable_expert_parallel``` | `bool` | Whether to enable expert parallel |
 | ```enable_logprob``` | `bool` | Whether to enable return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message.If logrpob is not used, this parameter can be omitted when starting |
+| ```served_model_name``` | `str` | The model name used in the API. If not specified, the model name will be the same as the --model argument. |
+| ```revision``` | `str` | The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. |
 | ```chat_template``` | `str` | Specify the template used for model concatenation, It supports both string input and file path input. The default value is None. If not specified, the model's default template will be used. |
 
 ## 1. Relationship between KVCache allocation, ```num_gpu_blocks_override``` and ```block_size```?
diff --git a/docs/zh/parameters.md b/docs/zh/parameters.md
index e68d342f3c..b1fc39137e 100644
--- a/docs/zh/parameters.md
+++ b/docs/zh/parameters.md
@@ -44,6 +44,8 @@
 | ```dynamic_load_weight``` | `int` | 是否动态加载权重,默认0 |
 | ```enable_expert_parallel``` | `bool` | 是否启用专家并行 |
 | ```enable_logprob``` | `bool` | 是否启用输出token返回logprob。如果未使用 logrpob,则在启动时可以省略此参数。 |
+| ```served_model_name``` | `str` | API 中使用的模型名称,如果未指定,模型名称将与--model参数相同 |
+| ```revision``` | `str` | 自动下载模型时,用于指定模型的Git版本,分支名或tag |
 | ```chat_template``` | `str` | 指定模型拼接使用的模板,支持字符串与文件路径,默认为None,如未指定,则使用模型默认模板 |
 
 ## 1. KVCache分配与```num_gpu_blocks_override```、```block_size```的关系?
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index af7b3ffb08..f80f45d98f 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -50,6 +50,10 @@ class EngineArgs:
     """
     The name or path of the model to be used.
     """
+    served_model_name: Optional[str] = None
+    """
+    The name of the model being served.
+    """
     revision: Optional[str] = "master"
     """
     The revision for downloading models.
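Reviewer note: the fallback described in both documentation rows (use the `--model` value when `--served-model-name` is omitted) is implemented later in this patch inside the api_server lifespan. A minimal sketch of that resolution logic, assuming the remaining `EngineArgs` fields keep their defaults and using an illustrative model name:

```python
from fastdeploy.engine.args_utils import EngineArgs

# Illustrative values only; any model name or local path works here.
args = EngineArgs(model="baidu/ERNIE-4.5-0.3B-PT", served_model_name=None)

# Mirrors the lifespan logic added below: fall back to --model when
# --served-model-name is not provided.
served_name = args.served_model_name if args.served_model_name is not None else args.model
print(served_name)  # -> "baidu/ERNIE-4.5-0.3B-PT"
```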
@@ -379,6 +383,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=EngineArgs.model,
             help="Model name or path to be used.",
         )
+        model_group.add_argument(
+            "--served-model-name",
+            type=nullable_str,
+            default=EngineArgs.served_model_name,
+            help="The model name used in the API. If not specified, the --model value is used.",
+        )
         model_group.add_argument(
             "--revision",
             type=nullable_str,
diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py
index ca0b45e7f9..cc06b5f8ca 100644
--- a/fastdeploy/entrypoints/openai/api_server.py
+++ b/fastdeploy/entrypoints/openai/api_server.py
@@ -40,9 +40,11 @@
     CompletionResponse,
     ControlSchedulerRequest,
     ErrorResponse,
+    ModelList,
 )
 from fastdeploy.entrypoints.openai.serving_chat import OpenAIServingChat
 from fastdeploy.entrypoints.openai.serving_completion import OpenAIServingCompletion
+from fastdeploy.entrypoints.openai.serving_models import ModelPath, OpenAIServingModels
 from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager
 from fastdeploy.metrics.metrics import (
     EXCLUDE_LABELS,
@@ -128,6 +130,15 @@ async def lifespan(app: FastAPI):
     else:
         pid = os.getpid()
     api_server_logger.info(f"{pid}")
+
+    if args.served_model_name is not None:
+        served_model_names = args.served_model_name
+        verification = True
+    else:
+        served_model_names = args.model
+        verification = False
+    model_paths = [ModelPath(name=served_model_names, model_path=args.model, verification=verification)]
+
     engine_client = EngineClient(
         args.model,
         args.tokenizer,
@@ -144,8 +155,22 @@ async def lifespan(app: FastAPI):
         args.tool_call_parser,
     )
     app.state.dynamic_load_weight = args.dynamic_load_weight
-    chat_handler = OpenAIServingChat(engine_client, pid, args.ips, args.max_waiting_time, chat_template)
-    completion_handler = OpenAIServingCompletion(engine_client, pid, args.ips, args.max_waiting_time)
+    model_handler = OpenAIServingModels(
+        model_paths,
+        args.max_model_len,
+        args.ips,
+    )
+    app.state.model_handler = model_handler
+    chat_handler = OpenAIServingChat(
+        engine_client, app.state.model_handler, pid, args.ips, args.max_waiting_time, chat_template
+    )
+    completion_handler = OpenAIServingCompletion(
+        engine_client,
+        app.state.model_handler,
+        pid,
+        args.ips,
+        args.max_waiting_time,
+    )
     engine_client.create_zmq_client(model=pid, mode=zmq.PUSH)
     engine_client.pid = pid
     app.state.engine_client = engine_client
@@ -308,6 +333,23 @@ async def create_completion(request: CompletionRequest):
         return JSONResponse(status_code=e.status_code, content={"detail": e.detail})
 
 
+@app.get("/v1/models")
+async def list_models() -> Response:
+    """
+    List all available models.
+    """
+    if app.state.dynamic_load_weight:
+        status, msg = app.state.engine_client.is_workers_alive()
+        if not status:
+            return JSONResponse(content={"error": "Worker Service Not Healthy"}, status_code=304)
+
+    models = await app.state.model_handler.list_models()
+    if isinstance(models, ErrorResponse):
+        return JSONResponse(content=models.model_dump(), status_code=models.code)
+    elif isinstance(models, ModelList):
+        return JSONResponse(content=models.model_dump())
+
+
 @app.get("/update_model_weight")
 def update_model_weight(request: Request) -> Response:
     """
diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index 508c27f067..53c1e6a4fa 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -18,6 +18,7 @@
 import json
 import time
+import uuid
 from typing import Any, Dict, List, Literal, Optional, Union
 
 from pydantic import BaseModel, Field, model_validator
 
@@ -55,6 +56,37 @@ class UsageInfo(BaseModel):
     prompt_tokens_details: Optional[PromptTokenUsageInfo] = None
 
+
+class ModelPermission(BaseModel):
+    id: str = Field(default_factory=lambda: f"modelperm-{str(uuid.uuid4().hex)}")
+    object: str = "model_permission"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    allow_create_engine: bool = False
+    allow_sampling: bool = True
+    allow_logprobs: bool = True
+    allow_search_indices: bool = False
+    allow_view: bool = True
+    allow_fine_tuning: bool = False
+    organization: str = "*"
+    group: Optional[str] = None
+    is_blocking: bool = False
+
+
+class ModelInfo(BaseModel):
+    id: str
+    object: str = "model"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    owned_by: str = "FastDeploy"
+    root: Optional[str] = None
+    parent: Optional[str] = None
+    max_model_len: Optional[int] = None
+    permission: list[ModelPermission] = Field(default_factory=list)
+
+
+class ModelList(BaseModel):
+    object: str = "list"
+    data: list[ModelInfo] = Field(default_factory=list)
+
 
 class FunctionCall(BaseModel):
     """
     Function call.
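To make the new response schema concrete, the sketch below (not part of the patch) builds the payload shape that `/v1/models` serializes via `model_dump()`; the id, root path, and max_model_len values are placeholders:

```python
from fastdeploy.entrypoints.openai.protocol import ModelInfo, ModelList, ModelPermission

# Placeholder name/path; in the server these come from the configured ModelPath entries.
info = ModelInfo(
    id="baidu/ERNIE-4.5-0.3B-PT",
    root="/path/to/weights",
    max_model_len=2048,
    permission=[ModelPermission()],
)
payload = ModelList(data=[info]).model_dump()
# -> {"object": "list", "data": [{"id": "baidu/ERNIE-4.5-0.3B-PT", "object": "model", ...}]}
```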
diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index ba277a3874..77ad18dfcc 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -49,8 +49,9 @@ class OpenAIServingChat:
     OpenAI-style chat completions serving
     """
 
-    def __init__(self, engine_client, pid, ips, max_waiting_time, chat_template):
+    def __init__(self, engine_client, models, pid, ips, max_waiting_time, chat_template):
         self.engine_client = engine_client
+        self.models = models
         self.pid = pid
         self.master_ip = ips
         self.max_waiting_time = max_waiting_time
@@ -78,6 +79,13 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
             err_msg = f"Only master node can accept completion request, please send request to master node: {self.pod_ips[0]}"
             api_server_logger.error(err_msg)
             return ErrorResponse(message=err_msg, code=400)
+
+        is_supported, request.model = self.models.is_supported_model(request.model)
+        if not is_supported:
+            err_msg = f"Unsupported model: {request.model}; supported models: {', '.join([x.name for x in self.models.model_paths])} (or 'default')"
+            api_server_logger.error(err_msg)
+            return ErrorResponse(message=err_msg, code=400)
+
         try:
             if self.max_waiting_time < 0:
                 await self.engine_client.semaphore.acquire()
diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py
index fdcf106ba6..ea3303317d 100644
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -41,8 +41,9 @@
 
 
 class OpenAIServingCompletion:
-    def __init__(self, engine_client, pid, ips, max_waiting_time):
+    def __init__(self, engine_client, models, pid, ips, max_waiting_time):
         self.engine_client = engine_client
+        self.models = models
         self.pid = pid
         self.master_ip = ips
         self.host_ip = get_host_ip()
@@ -68,6 +69,11 @@ async def create_completion(self, request: CompletionRequest):
             err_msg = f"Only master node can accept completion request, please send request to master node: {self.pod_ips[0]}"
             api_server_logger.error(err_msg)
             return ErrorResponse(message=err_msg, code=400)
+        is_supported, request.model = self.models.is_supported_model(request.model)
+        if not is_supported:
+            err_msg = f"Unsupported model: {request.model}; supported models: {', '.join([x.name for x in self.models.model_paths])} (or 'default')"
+            api_server_logger.error(err_msg)
+            return ErrorResponse(message=err_msg, code=400)
         created_time = int(time.time())
         if request.user is not None:
             request_id = f"cmpl-{request.user}-{uuid.uuid4()}"
diff --git a/fastdeploy/entrypoints/openai/serving_models.py b/fastdeploy/entrypoints/openai/serving_models.py
new file mode 100644
index 0000000000..4b1f5cb099
--- /dev/null
+++ b/fastdeploy/entrypoints/openai/serving_models.py
@@ -0,0 +1,96 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" + +from dataclasses import dataclass +from typing import List, Union + +from fastdeploy.entrypoints.openai.protocol import ( + ErrorResponse, + ModelInfo, + ModelList, + ModelPermission, +) +from fastdeploy.utils import api_server_logger, get_host_ip + + +@dataclass +class ModelPath: + name: str + model_path: str + verification: bool = False + + +class OpenAIServingModels: + """ + OpenAI-style models serving + """ + + def __init__( + self, + model_paths: list[ModelPath], + max_model_len: int, + ips: Union[List[str], str], + ): + self.model_paths = model_paths + self.max_model_len = max_model_len + self.master_ip = ips + self.host_ip = get_host_ip() + if self.master_ip is not None: + if isinstance(self.master_ip, list): + self.master_ip = self.master_ip[0] + else: + self.master_ip = self.master_ip.split(",")[0] + + def _check_master(self): + if self.master_ip is None: + return True + if self.host_ip == self.master_ip: + return True + return False + + def is_supported_model(self, model_name) -> tuple[bool, str]: + """ + Check whether the specified model is supported. + """ + if self.model_paths[0].verification is False: + return True, self.model_name() + if model_name == "default": + return True, self.model_name() + return any(model.name == model_name for model in self.model_paths), model_name + + def model_name(self) -> str: + """ + Returns the current model name. + """ + return self.model_paths[0].name + + async def list_models(self) -> ModelList: + """ + Show available models. + """ + if not self._check_master(): + err_msg = ( + f"Only master node can accept models request, please send request to master node: {self.pod_ips[0]}" + ) + api_server_logger.error(err_msg) + return ErrorResponse(message=err_msg, code=400) + model_infos = [ + ModelInfo( + id=model.name, max_model_len=self.max_model_len, root=model.model_path, permission=[ModelPermission()] + ) + for model in self.model_paths + ] + return ModelList(data=model_infos) diff --git a/tests/ce/server/test_evil_cases.py b/tests/ce/server/test_evil_cases.py index 874b520b83..aba46cd09d 100644 --- a/tests/ce/server/test_evil_cases.py +++ b/tests/ce/server/test_evil_cases.py @@ -321,7 +321,7 @@ def test_model_invalid(): payload = build_request_payload(TEMPLATE, data) resp = send_request(URL, payload).json() assert resp.get("object") == "chat.completion", "不存在的 model 应触发校验异常" - assert "non-existent-model" in resp.get("model"), "未返回预期的 model 信息" + # assert "non-existent-model" in resp.get("model"), "未返回预期的 model 信息" assert len(resp.get("choices")[0].get("message").get("content")) > 0, "模型名为不存在的 model,未正常生成回复" @@ -341,7 +341,7 @@ def test_model_with_special_characters(): payload = build_request_payload(TEMPLATE, data) resp = send_request(URL, payload).json() assert resp.get("object") == "chat.completion", "不存在的 model 应触发校验异常" - assert "!@#" in resp.get("model"), "未返回预期的 model 信息" + # assert "!@#" in resp.get("model"), "未返回预期的 model 信息" assert ( len(resp.get("choices")[0].get("message").get("content")) > 0 ), "模型名为model 参数为非法格式,未正常生成回复" diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py new file mode 100644 index 0000000000..cb8100f00e --- /dev/null +++ b/tests/entrypoints/openai/test_serving_models.py @@ -0,0 +1,51 @@ +import asyncio +import unittest + +from fastdeploy.entrypoints.openai.protocol import ModelInfo, ModelList +from fastdeploy.entrypoints.openai.serving_models import ModelPath, OpenAIServingModels +from fastdeploy.utils import get_host_ip + +MODEL_NAME = 
"baidu/ERNIE-4.5-0.3B-PT" +MODEL_PATHS = [ModelPath(name=MODEL_NAME, model_path=MODEL_NAME)] +MAX_MODEL_LEN = 2048 + + +async def _async_serving_models_init() -> OpenAIServingModels: + """异步初始化 OpenAIServingModels 实例""" + return OpenAIServingModels( + model_paths=MODEL_PATHS, + max_model_len=MAX_MODEL_LEN, + ips=get_host_ip(), + ) + + +class TestOpenAIServingModels(unittest.TestCase): + """测试 OpenAIServingModels 的 unittest 版本""" + + def test_serving_model_name(self): + """测试模型名称获取""" + # 通过 asyncio.run() 执行异步初始化 + serving_models = asyncio.run(_async_serving_models_init()) + self.assertEqual(serving_models.model_name(), MODEL_NAME) + + def test_list_models(self): + """测试模型列表功能""" + serving_models = asyncio.run(_async_serving_models_init()) + + # 通过 asyncio.run() 执行异步方法 + result = asyncio.run(serving_models.list_models()) + + # 验证返回类型和内容 + self.assertIsInstance(result, ModelList) + self.assertEqual(len(result.data), 1) + + model_info = result.data[0] + self.assertIsInstance(model_info, ModelInfo) + self.assertEqual(model_info.id, MODEL_NAME) + self.assertEqual(model_info.max_model_len, MAX_MODEL_LEN) + self.assertEqual(model_info.root, MODEL_PATHS[0].model_path) + self.assertEqual(result.object, "list") + + +if __name__ == "__main__": + unittest.main()