From d660858fb832e4858ca7d5a378c873e19d69eeeb Mon Sep 17 00:00:00 2001 From: Yuhao Tsui Date: Thu, 17 Oct 2024 11:41:34 +0800 Subject: [PATCH 1/4] Adding support for awq/gptq quantized inference to VisionModel --- xinference/model/llm/vllm/core.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 5ecd01f81a..53001c1590 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -717,11 +717,26 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin): def match( cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if llm_spec.model_format != "pytorch": + if not cls._has_cuda_device(): + return False + if not cls._is_linux(): + return False + if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]: return False if llm_spec.model_format == "pytorch": if quantization != "none" and not (quantization is None): return False + if llm_spec.model_format == "awq": + # Currently, only 4-bit weight quantization is supported for AWQ.
+ if "4" not in quantization: + return False + if llm_spec.model_format == "gptq": + if VLLM_INSTALLED and vllm.__version__ >= "0.3.3": + if not any(q in quantization for q in ("3", "4", "8")): + return False + else: + if "4" not in quantization: + return False if isinstance(llm_family, CustomLLMFamilyV1): if llm_family.model_family not in VLLM_SUPPORTED_VISION_MODEL_LIST: return False From b099d8d30e2bdf6990099a3edfa4e954a27d4515 Mon Sep 17 00:00:00 2001 From: Yuhao Tsui Date: Thu, 21 Nov 2024 14:33:18 +0800 Subject: [PATCH 2/4] Fix: Embedding Dimension err --- xinference/model/embedding/model_spec.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/embedding/model_spec.json b/xinference/model/embedding/model_spec.json index dc8d851b85..14d4ced519 100644 --- a/xinference/model/embedding/model_spec.json +++ b/xinference/model/embedding/model_spec.json @@ -233,7 +233,7 @@ }, { "model_name": "gte-Qwen2", - "dimensions": 4096, + "dimensions": 3584, "max_tokens": 32000, "language": ["zh", "en"], "model_id": "Alibaba-NLP/gte-Qwen2-7B-instruct", From 4aa1c13d08616d94b02a16356ed9c4823a48a93c Mon Sep 17 00:00:00 2001 From: Yuhao Tsui Date: Thu, 21 Nov 2024 14:34:02 +0800 Subject: [PATCH 3/4] Fix: Embedding Dimension err --- xinference/model/embedding/model_spec_modelscope.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/embedding/model_spec_modelscope.json b/xinference/model/embedding/model_spec_modelscope.json index 7f4715b865..dee409be0a 100644 --- a/xinference/model/embedding/model_spec_modelscope.json +++ b/xinference/model/embedding/model_spec_modelscope.json @@ -235,7 +235,7 @@ }, { "model_name": "gte-Qwen2", - "dimensions": 4096, + "dimensions": 3584, "max_tokens": 32000, "language": ["zh", "en"], "model_id": "iic/gte_Qwen2-7B-instruct", From 59bf7b674a7c7cbee370755475c7f696f4342ae0 Mon Sep 17 00:00:00 2001 From: Yuhao Tsui Date: Thu, 21 Nov 2024 14:35:28 +0800 Subject: [PATCH 4/4] Fix: 
Embedding Dimension err --- doc/source/models/builtin/embedding/gte-qwen2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/models/builtin/embedding/gte-qwen2.rst b/doc/source/models/builtin/embedding/gte-qwen2.rst index 85eeeac39a..cec2de89b6 100644 --- a/doc/source/models/builtin/embedding/gte-qwen2.rst +++ b/doc/source/models/builtin/embedding/gte-qwen2.rst @@ -11,7 +11,7 @@ gte-Qwen2 Specifications ^^^^^^^^^^^^^^ -- **Dimensions:** 4096 +- **Dimensions:** 3584 - **Max Tokens:** 32000 - **Model ID:** Alibaba-NLP/gte-Qwen2-7B-instruct - **Model Hubs**: `Hugging Face `__, `ModelScope `__