From d660858fb832e4858ca7d5a378c873e19d69eeeb Mon Sep 17 00:00:00 2001 From: Yuhao Tsui Date: Thu, 17 Oct 2024 11:41:34 +0800 Subject: [PATCH 1/4] Adding support for awq/gptq quantized inference to VisionModel --- xinference/model/llm/vllm/core.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 5ecd01f81a..53001c1590 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -717,11 +717,26 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin): def match( cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if llm_spec.model_format != "pytorch": + if not cls._has_cuda_device(): + return False + if not cls._is_linux(): + return False + if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]: return False if llm_spec.model_format == "pytorch": if quantization != "none" and not (quantization is None): return False + if llm_spec.model_format == "awq": + # Currently, only 4-bit weight quantization is supported for AWQ.
+ if "4" not in quantization: + return False + if llm_spec.model_format == "gptq": + if VLLM_INSTALLED and vllm.__version__ >= "0.3.3": + if not any(q in quantization for q in ("3", "4", "8")): + return False + else: + if "4" not in quantization: + return False if isinstance(llm_family, CustomLLMFamilyV1): if llm_family.model_family not in VLLM_SUPPORTED_VISION_MODEL_LIST: return False From b099d8d30e2bdf6990099a3edfa4e954a27d4515 Mon Sep 17 00:00:00 2001 From: Yuhao Tsui Date: Thu, 21 Nov 2024 14:33:18 +0800 Subject: [PATCH 2/4] Fix: Embedding Dimension err --- xinference/model/embedding/model_spec.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/embedding/model_spec.json b/xinference/model/embedding/model_spec.json index dc8d851b85..14d4ced519 100644 --- a/xinference/model/embedding/model_spec.json +++ b/xinference/model/embedding/model_spec.json @@ -233,7 +233,7 @@ }, { "model_name": "gte-Qwen2", - "dimensions": 4096, + "dimensions": 3584, "max_tokens": 32000, "language": ["zh", "en"], "model_id": "Alibaba-NLP/gte-Qwen2-7B-instruct", From 4aa1c13d08616d94b02a16356ed9c4823a48a93c Mon Sep 17 00:00:00 2001 From: Yuhao Tsui Date: Thu, 21 Nov 2024 14:34:02 +0800 Subject: [PATCH 3/4] Fix: Embedding Dimension err --- xinference/model/embedding/model_spec_modelscope.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/embedding/model_spec_modelscope.json b/xinference/model/embedding/model_spec_modelscope.json index 7f4715b865..dee409be0a 100644 --- a/xinference/model/embedding/model_spec_modelscope.json +++ b/xinference/model/embedding/model_spec_modelscope.json @@ -235,7 +235,7 @@ }, { "model_name": "gte-Qwen2", - "dimensions": 4096, + "dimensions": 3584, "max_tokens": 32000, "language": ["zh", "en"], "model_id": "iic/gte_Qwen2-7B-instruct", From 59bf7b674a7c7cbee370755475c7f696f4342ae0 Mon Sep 17 00:00:00 2001 From: Yuhao Tsui Date: Thu, 21 Nov 2024 14:35:28 +0800 Subject: [PATCH 4/4] Fix: 
Embedding Dimension err --- doc/source/models/builtin/embedding/gte-qwen2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/models/builtin/embedding/gte-qwen2.rst b/doc/source/models/builtin/embedding/gte-qwen2.rst index 85eeeac39a..cec2de89b6 100644 --- a/doc/source/models/builtin/embedding/gte-qwen2.rst +++ b/doc/source/models/builtin/embedding/gte-qwen2.rst @@ -11,7 +11,7 @@ gte-Qwen2 Specifications ^^^^^^^^^^^^^^ -- **Dimensions:** 4096 +- **Dimensions:** 3584 - **Max Tokens:** 32000 - **Model ID:** Alibaba-NLP/gte-Qwen2-7B-instruct - **Model Hubs**: `Hugging Face `__, `ModelScope `__