Skip to content

Commit a436d4b

Browse files
committed
feat: add model higgs-audio
1 parent 92e8fef commit a436d4b

File tree

9 files changed

+160
-0
lines changed

9 files changed

+160
-0
lines changed

src/emd/models/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
vlms,
1818
comfyui,
1919
asr,
20+
audio,
2021
embeddings,
2122
reranks,
2223
custom,

src/emd/models/audio/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from . import higgs_audio

src/emd/models/audio/higgs_audio.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from .. import Model
2+
from ..engines import vllm_higgs_audio_engine091
3+
from ..model_series import HIGGS_AUDIO_SERIES
4+
from ..instances import (
5+
g5d48xlarge_instance,
6+
local_instance
7+
)
8+
from ..services import (
9+
sagemaker_service,
10+
sagemaker_async_service,
11+
ecs_service,
12+
local_service
13+
)
14+
from ..frameworks import fastapi_framework
15+
from emd.models.utils.constants import ModelType
16+
17+
# Register the Higgs Audio v2 Generation 3B Base model with the EMD registry.
Model.register(
    dict(
        model_id="bosonai-higgs-audio-v2-generation-3B-base",
        model_type=ModelType.AUDIO,
        description="Higgs Audio v2 Generation 3B Base is a powerful multimodal audio generation model that supports voice cloning, smart voice generation, and multi-speaker synthesis. Built on vLLM engine with OpenAI-compatible API for text-to-speech and audio generation tasks.",
        application_scenario="voice cloning, text-to-speech, audio generation, multi-speaker synthesis, smart voice generation",
        supported_engines=[vllm_higgs_audio_engine091],
        supported_instances=[
            # g5.48xlarge: 8x A10G GPUs — matches the engine's
            # --tensor-parallel-size 8 launch args.
            g5d48xlarge_instance, local_instance
        ],
        supported_services=[
            sagemaker_service, local_service
        ],
        supported_frameworks=[
            fastapi_framework
        ],
        allow_china_region=True,
        huggingface_model_id="bosonai/higgs-audio-v2-generation-3B-base",
        require_huggingface_token=False,
        need_prepare_model=False,
        # Fix: HIGGS_AUDIO_SERIES was imported for this model but never
        # attached to the registration, leaving the model without a series.
        model_series=HIGGS_AUDIO_SERIES,
    )
)

src/emd/models/engines.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -555,6 +555,17 @@ class KtransformersEngine(OpenAICompitableEngine):
555555
"description": "VLLM v0.9.1 engine for dots.ocr multilingual document parsing model with flash-attn support and eager execution for custom models"
556556
})
557557

558+
# VLLM Engine v0.9.1 for Higgs Audio.
# Clones the serialized config of the base vLLM 0.6.4 engine and overrides
# only the fields that differ for the Higgs Audio serving image; later keys
# in the dict splat take precedence over the model_dump() values.
vllm_higgs_audio_engine091 = VllmEngine(**{
    **vllm_engine064.model_dump(),
    "engine_dockerfile_config": {"VERSION":"v0.9.1"},
    # Custom Dockerfile that builds on the pre-built Bosonai vLLM image.
    "dockerfile_name": "Dockerfile_higgs_audio",
    # Backend class that drives the image's native API-server entrypoint.
    "engine_cls": "vllm.higgs_audio_backend.HiggsAudioBackend",
    # Reduces CUDA memory fragmentation for large audio workloads.
    "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
    # Larger shared memory segment for multi-GPU tensor-parallel NCCL.
    "default_cli_args": " --shm-size=30gb",
    "description": "VLLM v0.9.1 engine for Higgs Audio v2 Generation 3B Base multimodal audio generation model using native Docker entrypoint"
})
568+
558569
custom_engine = Engine(**{
559570
"engine_type":EngineType.CUSTOM,
560571
})

src/emd/models/model_series.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,3 +163,9 @@
163163
description="dots.ocr is a powerful, multilingual document parser that unifies layout detection and content recognition within a single vision-language model while maintaining good reading order. Despite its compact 1.7B-parameter LLM foundation, it achieves state-of-the-art(SOTA) performance on text, tables, and reading order tasks with multilingual support for over 100 languages.",
164164
reference_link="https://github.com/rednote-hilab/dots.ocr"
165165
)
166+
167+
# Series entry for the Bosonai Higgs Audio family of audio-generation models;
# referenced by the higgs_audio model registrations in src/emd/models/audio/.
HIGGS_AUDIO_SERIES = ModelSeries(
    model_series_name=ModelSeriesType.HIGGS_AUDIO,
    description="Higgs Audio v2 Generation is a powerful multimodal audio generation model that supports voice cloning, smart voice generation, and multi-speaker synthesis. Built on advanced neural architectures, it provides high-quality text-to-speech capabilities with support for various audio generation tasks including voice cloning and multi-speaker scenarios.",
    reference_link="https://huggingface.co/bosonai/higgs-audio-v2-generation-3B-base"
)

src/emd/models/utils/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ class ModelType(ConstantBase):
142142
VLM = "vlm"
143143
EMBEDDING = "embedding"
144144
VIDEO = "video"
145+
AUDIO = "audio"
145146

146147
class ServiceCode(ConstantBase):
147148
SAGEMAKER = "sagemaker"
@@ -236,3 +237,4 @@ class ModelSeriesType(ConstantBase):
236237
DEEPSEEK_v3 = "deepseek v3"
237238
BAICHUAN = "baichuan"
238239
DOTS_OCR = "dots_ocr"
240+
HIGGS_AUDIO = "higgs_audio"
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Serving image for Higgs Audio v2 via the Bosonai vLLM API server.
# The pre-built base image ships its code under /vllm-workspace; we
# relocate it to the SageMaker code directory /opt/ml/code.
FROM public.ecr.aws/aws-gcr-solutions/dmaa/higgs-audio-vllm:latest AS base

FROM base

WORKDIR /opt/ml/code

# Fix: a plain COPY reads from the *build context*, not the image, so the
# original `COPY /vllm-workspace/ ...` could not copy "from the base image
# itself" as intended. `COPY --from=base` copies from the named stage.
COPY --from=base /vllm-workspace/ /opt/ml/code/

EXPOSE 8080

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.bosonai.api_server"]

src/pipeline/backend/vllm/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# VLLM Backend Module
#
# Re-exports the concrete vLLM-based backends so callers can import them
# directly from the package, e.g. ``from backend.vllm import VLLMBackend``.
from .vllm_backend import VLLMBackend
from .higgs_audio_backend import HiggsAudioBackend

# Explicit public API of this package.
__all__ = ['VLLMBackend', 'HiggsAudioBackend']
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
import httpx
2+
import sys
3+
import os
4+
from emd.models.utils.constants import ModelType
5+
import inspect
6+
from backend.backend import OpenAICompitableProxyBackendBase
7+
from emd.utils.logger_utils import get_logger
8+
9+
logger = get_logger(__name__)
10+
11+
class HiggsAudioBackend(OpenAICompitableProxyBackendBase):
    """
    Higgs Audio Backend that uses the Docker image's native entrypoint
    instead of the standard vLLM serve command.

    This backend is specifically designed for the Higgs Audio v2 Generation 3B Base model
    which provides its own API server via the pre-built Docker image with entrypoint:
    ["python3", "-m", "vllm.entrypoints.bosonai.api_server"]
    """

    def before_start(self, model_dir=None):
        """Hook run before the proxy server starts; no preparation is needed.

        Fix: the original logged a garbled debug string ("before_start"
        repeated five times) via a placeholder-less f-string.
        """
        logger.info("HiggsAudioBackend.before_start(model_dir=%s)", model_dir)

    def create_proxy_server_start_command(self, model_path):
        """Return the shell command that launches the Bosonai vLLM API server.

        NOTE(review): the served model, tokenizer and parallelism settings are
        hard-coded rather than derived from ``model_path`` — presumably
        intentional for the fixed pre-built image; confirm with the deploy flow.
        """
        return 'python3 -m vllm.entrypoints.bosonai.api_server --served-model-name higgs-audio-v2-generation-3B-base --model bosonai/higgs-audio-v2-generation-3B-base --audio-tokenizer-type bosonai/higgs-audio-v2-tokenizer --limit-mm-per-prompt audio=50 --max-model-len 8192 --tensor-parallel-size 8 --pipeline-parallel-size 1 --port 8000 --gpu-memory-utilization 0.65 --disable-mm-preprocessor-cache'

    def openai_create_helper(self, fn: callable, request: dict):
        """Call *fn* with *request*, routing any keys that are not parameters
        of *fn* through ``extra_body`` (the OpenAI SDK's pass-through channel).

        Note: mutates *request* in place (pops unknown keys, sets 'extra_body').
        """
        sig = inspect.signature(fn)
        extra_body = request.get("extra_body", {})
        extra_params = {k: request.pop(k) for k in list(request.keys()) if k not in sig.parameters}
        extra_body.update(extra_params)
        request['extra_body'] = extra_body
        return fn(**request)

    def invoke(self, request):
        """
        Invoke the Higgs Audio model with OpenAI-compatible API.
        Supports audio modalities for voice cloning, smart voice generation, and multi-speaker synthesis.
        """
        # Transform input to Higgs Audio format
        request = self._transform_request(request)
        logger.info(f"Higgs Audio request: {request}")

        # Fix: the original branched on self.model_type == ModelType.AUDIO with
        # two byte-identical branches; every model type uses chat completions.
        response = self.openai_create_helper(self.client.chat.completions.create, request)

        logger.info(f"Higgs Audio response: {response}, request: {request}")

        if request.get("stream", False):
            return self._transform_streaming_response(response)
        return self._transform_response(response)

    async def ainvoke(self, request):
        """
        Async invoke the Higgs Audio model with OpenAI-compatible API.
        """
        # Transform input to Higgs Audio format
        request = self._transform_request(request)
        logger.info(f"Higgs Audio async request: {request}")

        # Fix: same dead if/else collapse as in invoke() — both branches
        # called the identical async chat-completions endpoint.
        response = await self.openai_create_helper(self.async_client.chat.completions.create, request)

        logger.info(f"Higgs Audio async response: {response}, request: {request}")

        if request.get("stream", False):
            return await self._atransform_streaming_response(response)
        return await self._atransform_response(response)

0 commit comments

Comments
 (0)