
Commit 3cb1367

FEAT: support MLX engine (#1765)
1 parent 8fff9e7 commit 3cb1367

File tree

12 files changed: +661 -13 lines changed


.github/workflows/python.yaml

Lines changed: 8 additions & 0 deletions
@@ -82,6 +82,7 @@ jobs:
           - { os: windows-latest, python-version: 3.10 }
         include:
           - { os: self-hosted, module: gpu, python-version: 3.9}
+          - { os: macos-latest, module: metal, python-version: "3.10" }

     steps:
       - name: Check out code
@@ -109,6 +110,9 @@ jobs:
            sudo rm -rf "/usr/local/share/boost"
            sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          fi
+         if [ "$MODULE" == "metal" ]; then
+           pip install mlx-lm
+         fi
          pip install "llama-cpp-python==0.2.77"
          pip install transformers
          pip install attrdict
@@ -162,6 +166,10 @@ jobs:
            ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
              -W ignore::PendingDeprecationWarning \
              --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_chattts.py
+         elif [ "$MODULE" == "metal" ]; then
+           pytest --timeout=1500 \
+             -W ignore::PendingDeprecationWarning \
+             --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/llm/mlx/tests/test_mlx.py
          else
            pytest --timeout=1500 \
              -W ignore::PendingDeprecationWarning \

doc/source/models/builtin/llm/qwen2-instruct.rst

Lines changed: 68 additions & 4 deletions
@@ -206,7 +206,71 @@ chosen quantization method from the options listed above::

    xinference launch --model-engine ${engine} --model-name qwen2-instruct --size-in-billions 72 --model-format awq --quantization ${quantization}


-Model Spec 13 (ggufv2, 0_5 Billion)
+Model Spec 13 (mlx, 0_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 0_5
+- **Quantizations:** 4-bit
+- **Engines**: MLX
+- **Model ID:** Qwen/Qwen2-0.5B-Instruct-MLX
+- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen2-0.5B-Instruct-MLX>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen2-0.5B-Instruct-MLX>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name qwen2-instruct --size-in-billions 0_5 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 14 (mlx, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 1_5
+- **Quantizations:** 4-bit
+- **Engines**: MLX
+- **Model ID:** Qwen/Qwen2-1.5B-Instruct-MLX
+- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen2-1.5B-Instruct-MLX>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen2-1.5B-Instruct-MLX>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name qwen2-instruct --size-in-billions 1_5 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 15 (mlx, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 7
+- **Quantizations:** 4-bit
+- **Engines**: MLX
+- **Model ID:** Qwen/Qwen2-7B-Instruct-MLX
+- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen2-7B-Instruct-MLX>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen2-7B-Instruct-MLX>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name qwen2-instruct --size-in-billions 7 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 16 (mlx, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 72
+- **Quantizations:** 4-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2-72B-4bit
+- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/Qwen2-72B-4bit>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name qwen2-instruct --size-in-billions 72 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 17 (ggufv2, 0_5 Billion)
 ++++++++++++++++++++++++++++++++++++++++

 - **Model Format:** ggufv2
@@ -222,7 +286,7 @@ chosen quantization method from the options listed above::

    xinference launch --model-engine ${engine} --model-name qwen2-instruct --size-in-billions 0_5 --model-format ggufv2 --quantization ${quantization}


-Model Spec 14 (ggufv2, 1_5 Billion)
+Model Spec 18 (ggufv2, 1_5 Billion)
 ++++++++++++++++++++++++++++++++++++++++

 - **Model Format:** ggufv2
@@ -238,7 +302,7 @@ chosen quantization method from the options listed above::

    xinference launch --model-engine ${engine} --model-name qwen2-instruct --size-in-billions 1_5 --model-format ggufv2 --quantization ${quantization}


-Model Spec 15 (ggufv2, 7 Billion)
+Model Spec 19 (ggufv2, 7 Billion)
 ++++++++++++++++++++++++++++++++++++++++

 - **Model Format:** ggufv2
@@ -254,7 +318,7 @@ chosen quantization method from the options listed above::

    xinference launch --model-engine ${engine} --model-name qwen2-instruct --size-in-billions 7 --model-format ggufv2 --quantization ${quantization}


-Model Spec 16 (ggufv2, 72 Billion)
+Model Spec 20 (ggufv2, 72 Billion)
 ++++++++++++++++++++++++++++++++++++++++

 - **Model Format:** ggufv2
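For readers who prefer the Python client over the ``xinference launch`` command used in the specs above, a minimal sketch follows. It is not part of this diff: it assumes a running Xinference server at the default local address and that ``mlx-lm`` is installed, and the keyword arguments simply mirror the CLI flags shown above::

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")       # assumed local server address
    model_uid = client.launch_model(
        model_name="qwen2-instruct",
        model_engine="MLX",                        # engine added by this commit
        model_format="mlx",
        model_size_in_billions="0_5",
        quantization="4-bit",
    )
    model = client.get_model(model_uid)
    print(model.chat("Briefly introduce yourself."))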

setup.cfg

Lines changed: 3 additions & 0 deletions
@@ -103,6 +103,7 @@ all =
     optimum
     outlines==0.0.34 # sglang errored for outlines > 0.0.34
     sglang[all] ; sys_platform=='linux'
+    mlx-lm ; sys_platform=='darwin' and platform_machine=='arm64'
     attrdict # For deepseek VL
     timm>=0.9.16 # For deepseek VL
     torchvision # For deepseek VL
@@ -143,6 +144,8 @@ vllm =
     vllm>=0.2.6
 sglang =
     sglang[all]
+mlx =
+    mlx-lm
 embedding =
     sentence-transformers>=2.7.0
 rerank =
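With the new ``mlx`` extra in place, the dependency should be installable on Apple-silicon machines via standard setuptools extras syntax, e.g. ``pip install "xinference[mlx]"`` (not shown in this diff), while the ``all`` extra only pulls in ``mlx-lm`` when the ``darwin``/``arm64`` platform markers match.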

xinference/model/llm/__init__.py

Lines changed: 5 additions & 0 deletions
@@ -34,6 +34,7 @@
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
     LLAMA_CLASSES,
     LLM_ENGINES,
+    MLX_CLASSES,
     SGLANG_CLASSES,
     SUPPORTED_ENGINES,
     TRANSFORMERS_CLASSES,
@@ -42,6 +43,7 @@
     GgmlLLMSpecV1,
     LLMFamilyV1,
     LLMSpecV1,
+    MLXLLMSpecV1,
     PromptStyleV1,
     PytorchLLMSpecV1,
     get_cache_status,
@@ -112,6 +114,7 @@ def generate_engine_config_by_model_family(model_family):
 def _install():
     from .ggml.chatglm import ChatglmCppChatModel
     from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
+    from .mlx.core import MLXChatModel, MLXModel
     from .pytorch.baichuan import BaichuanPytorchChatModel
     from .pytorch.chatglm import ChatglmPytorchChatModel
     from .pytorch.cogvlm2 import CogVLM2Model
@@ -147,6 +150,7 @@ def _install():
     )
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel])
+    MLX_CLASSES.extend([MLXModel, MLXChatModel])
     TRANSFORMERS_CLASSES.extend(
         [
             BaichuanPytorchChatModel,
@@ -176,6 +180,7 @@ def _install():
     SUPPORTED_ENGINES["SGLang"] = SGLANG_CLASSES
     SUPPORTED_ENGINES["Transformers"] = TRANSFORMERS_CLASSES
     SUPPORTED_ENGINES["llama.cpp"] = LLAMA_CLASSES
+    SUPPORTED_ENGINES["MLX"] = MLX_CLASSES

     json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
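The registration above follows the same pattern as the existing engines: ``_install()`` imports the concrete classes lazily and then maps the engine name to them. A stripped-down, self-contained sketch of that pattern (class names taken from the diff; everything else is simplified and hypothetical)::

    from typing import Dict, List, Type

    # Simplified stand-ins for the real registries defined in llm_family.py
    MLX_CLASSES: List[Type] = []
    SUPPORTED_ENGINES: Dict[str, List[Type]] = {}

    class MLXModel: ...        # real implementations live in xinference/model/llm/mlx/core.py
    class MLXChatModel: ...

    def _install() -> None:
        # the real code imports these lazily: from .mlx.core import MLXChatModel, MLXModel
        MLX_CLASSES.extend([MLXModel, MLXChatModel])
        SUPPORTED_ENGINES["MLX"] = MLX_CLASSES   # "MLX" now resolves to the new engine classes

    _install()
    print([cls.__name__ for cls in SUPPORTED_ENGINES["MLX"]])  # ['MLXModel', 'MLXChatModel']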

xinference/model/llm/llm_family.json

Lines changed: 32 additions & 0 deletions
@@ -2549,6 +2549,38 @@
             ],
             "model_id": "Qwen/Qwen2-72B-Instruct-AWQ"
         },
+        {
+            "model_format": "mlx",
+            "model_size_in_billions": "0_5",
+            "quantizations": [
+                "4-bit"
+            ],
+            "model_id": "Qwen/Qwen2-0.5B-Instruct-MLX"
+        },
+        {
+            "model_format": "mlx",
+            "model_size_in_billions": "1_5",
+            "quantizations": [
+                "4-bit"
+            ],
+            "model_id": "Qwen/Qwen2-1.5B-Instruct-MLX"
+        },
+        {
+            "model_format": "mlx",
+            "model_size_in_billions": 7,
+            "quantizations": [
+                "4-bit"
+            ],
+            "model_id": "Qwen/Qwen2-7B-Instruct-MLX"
+        },
+        {
+            "model_format": "mlx",
+            "model_size_in_billions": 72,
+            "quantizations": [
+                "4-bit"
+            ],
+            "model_id": "mlx-community/Qwen2-72B-Instruct-4bit"
+        },
         {
             "model_format": "ggufv2",
             "model_size_in_billions": "0_5",

xinference/model/llm/llm_family.py

Lines changed: 32 additions & 8 deletions
@@ -107,6 +107,28 @@ def validate_model_size_with_radix(cls, v: object) -> object:
         return v


+class MLXLLMSpecV1(BaseModel):
+    model_format: Literal["mlx"]
+    # Must in order that `str` first, then `int`
+    model_size_in_billions: Union[str, int]
+    quantizations: List[str]
+    model_id: Optional[str]
+    model_hub: str = "huggingface"
+    model_uri: Optional[str]
+    model_revision: Optional[str]
+
+    @validator("model_size_in_billions", pre=False)
+    def validate_model_size_with_radix(cls, v: object) -> object:
+        if isinstance(v, str):
+            if (
+                "_" in v
+            ):  # for example, "1_8" just returns "1_8", otherwise int("1_8") returns 18
+                return v
+            else:
+                return int(v)
+        return v
+
+
 class PromptStyleV1(BaseModel):
     style_name: str
     system_prompt: str = ""
@@ -226,7 +248,7 @@ def parse_raw(


 LLMSpecV1 = Annotated[
-    Union[GgmlLLMSpecV1, PytorchLLMSpecV1],
+    Union[GgmlLLMSpecV1, PytorchLLMSpecV1, MLXLLMSpecV1],
     Field(discriminator="model_format"),
 ]
@@ -249,6 +271,8 @@ def parse_raw(

 VLLM_CLASSES: List[Type[LLM]] = []

+MLX_CLASSES: List[Type[LLM]] = []
+
 LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
 SUPPORTED_ENGINES: Dict[str, List[Type[LLM]]] = {}

@@ -549,7 +573,7 @@ def _get_meta_path(
             return os.path.join(cache_dir, "__valid_download")
         else:
             return os.path.join(cache_dir, f"__valid_download_{model_hub}")
-    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
         assert quantization is not None
         if model_hub == "huggingface":
             return os.path.join(cache_dir, f"__valid_download_{quantization}")
@@ -588,7 +612,7 @@ def _skip_download(
                 logger.warning(f"Cache {cache_dir} exists, but it was from {hub}")
                 return True
         return False
-    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
         assert quantization is not None
         return os.path.exists(
             _get_meta_path(cache_dir, model_format, model_hub, quantization)
@@ -683,7 +707,7 @@ def cache_from_csghub(
     ):
         return cache_dir

-    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,
@@ -751,7 +775,7 @@ def cache_from_modelscope(
     ):
         return cache_dir

-    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,
@@ -820,8 +844,8 @@ def cache_from_huggingface(
     if not IS_NEW_HUGGINGFACE_HUB:
         use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}

-    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
-        assert isinstance(llm_spec, PytorchLLMSpecV1)
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
+        assert isinstance(llm_spec, (PytorchLLMSpecV1, MLXLLMSpecV1))
         download_dir = retry_download(
             huggingface_hub.snapshot_download,
             llm_family.model_name,
@@ -910,7 +934,7 @@ def get_cache_status(
         ]
         return any(revisions)
     # just check meta file for ggml and gptq model
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
+    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
         ret = []
         for q in llm_spec.quantizations:
             assert q is not None
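A quick illustration of how the new spec's size validator behaves; this is a sketch only, assuming the package imports cleanly in your environment, with field values copied from the JSON entries above::

    from xinference.model.llm.llm_family import MLXLLMSpecV1

    small = MLXLLMSpecV1(
        model_format="mlx",
        model_size_in_billions="0_5",          # underscore form stays a string...
        quantizations=["4-bit"],
        model_id="Qwen/Qwen2-0.5B-Instruct-MLX",
    )
    large = MLXLLMSpecV1(
        model_format="mlx",
        model_size_in_billions="72",           # ...while a plain numeric string is coerced
        quantizations=["4-bit"],
        model_id="mlx-community/Qwen2-72B-Instruct-4bit",
    )
    print(repr(small.model_size_in_billions))  # '0_5'  (int("0_5") would silently read as 5)
    print(repr(large.model_size_in_billions))  # 72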

xinference/model/llm/llm_family_modelscope.json

Lines changed: 27 additions & 0 deletions
@@ -2921,6 +2921,33 @@
             "model_id": "qwen/Qwen2-72B-Instruct-AWQ",
             "model_hub": "modelscope"
         },
+        {
+            "model_format": "mlx",
+            "model_size_in_billions": "0_5",
+            "quantizations": [
+                "4-bit"
+            ],
+            "model_id": "qwen/Qwen2-0.5B-Instruct-MLX",
+            "model_hub": "modelscope"
+        },
+        {
+            "model_format": "mlx",
+            "model_size_in_billions": "1_5",
+            "quantizations": [
+                "4-bit"
+            ],
+            "model_id": "qwen/Qwen2-1.5B-Instruct-MLX",
+            "model_hub": "modelscope"
+        },
+        {
+            "model_format": "mlx",
+            "model_size_in_billions": 7,
+            "quantizations": [
+                "4-bit"
+            ],
+            "model_id": "qwen/Qwen2-7B-Instruct-MLX",
+            "model_hub": "modelscope"
+        },
         {
             "model_format": "ggufv2",
             "model_size_in_billions": "0_5",

xinference/model/llm/mlx/__init__.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
