Support for Request accessed GenAI Models (onyx-dot-app#270)
yuhongsun96 authored Aug 7, 2023
1 parent 0e667d3 commit 3bfc724
Showing 19 changed files with 613 additions and 351 deletions.
6 changes: 0 additions & 6 deletions backend/danswer/configs/app_configs.py
@@ -138,12 +138,6 @@
CHUNK_MAX_CHAR_OVERLAP = 50


#####
# Other API Keys
#####
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")


#####
# Encoder Model Endpoint Configs (Currently unused, running the models in memory)
#####
25 changes: 24 additions & 1 deletion backend/danswer/configs/constants.py
@@ -12,7 +12,7 @@
ALLOWED_USERS = "allowed_users"
ALLOWED_GROUPS = "allowed_groups"
METADATA = "metadata"
OPENAI_API_KEY_STORAGE_KEY = "openai_api_key"
GEN_AI_API_KEY_STORAGE_KEY = "genai_api_key"
HTML_SEPARATOR = "\n"
PUBLIC_DOC_PAT = "PUBLIC"

@@ -30,3 +30,26 @@ class DocumentSource(str, Enum):
PRODUCTBOARD = "productboard"
FILE = "file"
NOTION = "notion"


class DanswerGenAIModel(str, Enum):
"""This represents the internal Danswer GenAI model which determines the class that is used
to generate responses to the user query. Different models/services require different internal
handling, this allows for modularity of implementation within Danswer"""

OPENAI = "openai-completion"
OPENAI_CHAT = "openai-chat-completion"
GPT4ALL = "gpt4all-completion"
GPT4ALL_CHAT = "gpt4all-chat-completion"
HUGGINGFACE = "huggingface-inference-completion"
HUGGINGFACE_CHAT = "huggingface-inference-chat-completion"
REQUEST = "request-completion"


class ModelHostType(str, Enum):
"""For GenAI models interfaced via requests, different services have different
expectations for what fields are included in the request"""

# https://huggingface.co/docs/api-inference/detailed_parameters#text-generation-task
HUGGINGFACE = "huggingface" # HuggingFace test-generation Inference API
# TODO support for Azure, AWS, GCP GenAI model hosting
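
To make the REQUEST flow concrete, here is a minimal sketch of a call against a HuggingFace text-generation endpoint. The function names are illustrative rather than Danswer's actual request handling; the payload shape follows the text-generation task docs linked above.

import requests

def build_hf_payload(prompt: str, max_new_tokens: int = 512) -> dict:
    # Payload shape per the HuggingFace text-generation task docs
    return {"inputs": prompt, "parameters": {"max_new_tokens": max_new_tokens}}

def query_hosted_model(endpoint: str, prompt: str, api_key: str | None = None) -> str:
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
    response = requests.post(
        endpoint, headers=headers, json=build_hf_payload(prompt), timeout=30
    )
    response.raise_for_status()
    # The text-generation task returns a list of {"generated_text": ...} objects
    return response.json()[0]["generated_text"]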
65 changes: 40 additions & 25 deletions backend/danswer/configs/model_configs.py
@@ -1,4 +1,8 @@
import os
from enum import Enum

from danswer.configs.constants import DanswerGenAIModel
from danswer.configs.constants import ModelHostType

# Important considerations when choosing models
# Max tokens count needs to be high considering use case (at least 512)
@@ -30,35 +34,46 @@
# Purely an optimization, memory limitation consideration
BATCH_SIZE_ENCODE_CHUNKS = 8

# QA Model API Configs
# refer to https://platform.openai.com/docs/models/model-endpoint-compatibility for OpenAI models
# Valid list:
# - openai-completion
# - openai-chat-completion
# - gpt4all-completion -> Due to M1 Macs not having compatible gpt4all version, please install dependency yourself
# - gpt4all-chat-completion-> Due to M1 Macs not having compatible gpt4all version, please install dependency yourself
# To use gpt4all, run: pip install --upgrade gpt4all==1.0.5
# These support HuggingFace Inference API, Inference Endpoints and servers running the text-generation-inference backend
# - huggingface-inference-completion
# - huggingface-inference-chat-completion

#####
# Generative AI Model Configs
#####
# Other models should work as well; check the library/API compatibility.
# But these are the models that have been verified to work with the existing prompts.
# Using a different model may require some prompt tuning. See qa_prompts.py
VERIFIED_MODELS = {
DanswerGenAIModel.OPENAI: ["text-davinci-003"],
DanswerGenAIModel.OPENAI_CHAT: ["gpt-3.5-turbo", "gpt-4"],
DanswerGenAIModel.GPT4ALL: ["ggml-model-gpt4all-falcon-q4_0.bin"],
DanswerGenAIModel.GPT4ALL_CHAT: ["ggml-model-gpt4all-falcon-q4_0.bin"],
# The "chat" model below is actually "instruction finetuned" and does not support conversational
DanswerGenAIModel.HUGGINGFACE.value: ["meta-llama/Llama-2-70b-chat-hf"],
DanswerGenAIModel.HUGGINGFACE_CHAT.value: ["meta-llama/Llama-2-70b-hf"],
}

# Sets the internal Danswer model class to use
INTERNAL_MODEL_VERSION = os.environ.get(
"INTERNAL_MODEL_VERSION", "openai-chat-completion"
"INTERNAL_MODEL_VERSION", DanswerGenAIModel.OPENAI_CHAT.value
)
# For GPT4ALL, use "ggml-model-gpt4all-falcon-q4_0.bin" for the below for a tested model
GEN_AI_MODEL_VERSION = os.environ.get("GEN_AI_MODEL_VERSION", "gpt-3.5-turbo")
GEN_AI_MAX_OUTPUT_TOKENS = int(os.environ.get("GEN_AI_MAX_OUTPUT_TOKENS", "512"))
# Use HuggingFace API Token for Huggingface inference client
GEN_AI_HUGGINGFACE_API_TOKEN = os.environ.get("GEN_AI_HUGGINGFACE_API_TOKEN", None)
# Use the conversational API with the huggingface-inference-chat-completion internal model
# Note - this only works with models that support conversational interfaces
GEN_AI_HUGGINGFACE_USE_CONVERSATIONAL = (
os.environ.get("GEN_AI_HUGGINGFACE_USE_CONVERSATIONAL", "").lower() == "true"
)
# Disable streaming responses. Set this to true to "polyfill" streaming for models that don't support streaming
GEN_AI_HUGGINGFACE_DISABLE_STREAM = (
os.environ.get("GEN_AI_HUGGINGFACE_DISABLE_STREAM", "").lower() == "true"

# API key for the Generative AI model, if one is required for access; otherwise leave blank
GEN_AI_API_KEY = os.environ.get("GEN_AI_API_KEY", "")

# If using GPT4All or OpenAI, specify the model version
GEN_AI_MODEL_VERSION = os.environ.get(
"GEN_AI_MODEL_VERSION",
VERIFIED_MODELS.get(DanswerGenAIModel(INTERNAL_MODEL_VERSION), [""])[0],
)

# If the Generative AI Model is hosted to accept requests (DanswerGenAIModel.REQUEST),
# set the two below to specify
# - where to hit the endpoint
# - how the request should be formed
GEN_AI_ENDPOINT = os.environ.get("GEN_AI_ENDPOINT", "")
GEN_AI_HOST_TYPE = os.environ.get("GEN_AI_HOST_TYPE", ModelHostType.HUGGINGFACE.value)

# Set this to be enough for an answer + quotes
GEN_AI_MAX_OUTPUT_TOKENS = int(os.environ.get("GEN_AI_MAX_OUTPUT_TOKENS", "512"))

# Danswer custom Deep Learning Models
INTENT_MODEL_VERSION = "danswer/intent-model"
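
For illustration, a hypothetical setup for pointing Danswer at a request-hosted model. The endpoint URL and model name are examples only, and the variables must be set before this config module is imported, since it reads the environment at import time.

import os

# Hypothetical example values; any endpoint the REQUEST model class can reach works
os.environ.setdefault("INTERNAL_MODEL_VERSION", "request-completion")
os.environ.setdefault(
    "GEN_AI_ENDPOINT",
    "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf",
)
os.environ.setdefault("GEN_AI_HOST_TYPE", "huggingface")
os.environ.setdefault("GEN_AI_API_KEY", "hf_...")  # only if the endpoint requires auth

from danswer.configs import model_configs  # noqa: E402  (reads the env at import time)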
97 changes: 68 additions & 29 deletions backend/danswer/direct_qa/__init__.py
@@ -1,25 +1,30 @@
from typing import Any

import pkg_resources
from openai.error import AuthenticationError
from openai.error import Timeout

from danswer.configs.app_configs import QA_TIMEOUT
from danswer.configs.model_configs import (
GEN_AI_HUGGINGFACE_API_TOKEN,
INTERNAL_MODEL_VERSION,
)
from danswer.configs.constants import DanswerGenAIModel
from danswer.configs.constants import ModelHostType
from danswer.configs.model_configs import GEN_AI_API_KEY
from danswer.configs.model_configs import GEN_AI_ENDPOINT
from danswer.configs.model_configs import GEN_AI_HOST_TYPE
from danswer.configs.model_configs import INTERNAL_MODEL_VERSION
from danswer.direct_qa.exceptions import UnknownModelError
from danswer.direct_qa.gpt_4_all import GPT4AllChatCompletionQA
from danswer.direct_qa.gpt_4_all import GPT4AllCompletionQA
from danswer.direct_qa.huggingface import HuggingFaceChatCompletionQA
from danswer.direct_qa.huggingface import HuggingFaceCompletionQA
from danswer.direct_qa.interfaces import QAModel
from danswer.direct_qa.huggingface_inference import (
HuggingFaceInferenceChatCompletionQA,
HuggingFaceInferenceCompletionQA,
)
from danswer.direct_qa.open_ai import OpenAIChatCompletionQA
from danswer.direct_qa.open_ai import OpenAICompletionQA
from danswer.direct_qa.qa_prompts import WeakModelFreeformProcessor
from danswer.direct_qa.qa_utils import get_gen_ai_api_key
from danswer.direct_qa.request_model import RequestCompletionQA
from danswer.dynamic_configs.interface import ConfigNotFoundError
from danswer.utils.logger import setup_logger

# Imports commented out temporarily due to incompatibility of gpt4all with M1 Mac hardware currently
# from danswer.direct_qa.gpt_4_all import GPT4AllChatCompletionQA
# from danswer.direct_qa.gpt_4_all import GPT4AllCompletionQA
logger = setup_logger()


def check_model_api_key_is_valid(model_api_key: str) -> bool:
@@ -35,32 +40,66 @@ def check_model_api_key_is_valid(model_api_key: str) -> bool:
return True
except AuthenticationError:
return False
except Timeout:
pass
except Exception as e:
logger.warning(f"GenAI API key failed for the following reason: {e}")

return False


def get_default_backend_qa_model(
internal_model: str = INTERNAL_MODEL_VERSION,
api_key: str | None = None,
endpoint: str | None = GEN_AI_ENDPOINT,
model_host_type: str | None = GEN_AI_HOST_TYPE,
api_key: str | None = GEN_AI_API_KEY,
timeout: int = QA_TIMEOUT,
**kwargs: Any
**kwargs: Any,
) -> QAModel:
if internal_model == "openai-completion":
if not api_key:
try:
api_key = get_gen_ai_api_key()
except ConfigNotFoundError:
pass

if internal_model in [
DanswerGenAIModel.GPT4ALL.value,
DanswerGenAIModel.GPT4ALL_CHAT.value,
]:
# gpt4all is not compatible with M1 Mac hardware as of Aug 2023
pkg_resources.get_distribution("gpt4all")

if internal_model == DanswerGenAIModel.OPENAI.value:
return OpenAICompletionQA(timeout=timeout, api_key=api_key, **kwargs)
elif internal_model == "openai-chat-completion":
elif internal_model == DanswerGenAIModel.OPENAI_CHAT.value:
return OpenAIChatCompletionQA(timeout=timeout, api_key=api_key, **kwargs)
elif internal_model == "huggingface-inference-completion":
api_key = api_key if api_key is not None else GEN_AI_HUGGINGFACE_API_TOKEN
return HuggingFaceInferenceCompletionQA(api_key=api_key, **kwargs)
elif internal_model == "huggingface-inference-chat-completion":
api_key = api_key if api_key is not None else GEN_AI_HUGGINGFACE_API_TOKEN
return HuggingFaceInferenceChatCompletionQA(api_key=api_key, **kwargs)
# Note GPT4All is not supported for M1 Mac machines currently, removing until support is added
# elif internal_model == "gpt4all-completion":
# return GPT4AllCompletionQA(**kwargs)
# elif internal_model == "gpt4all-chat-completion":
# return GPT4AllChatCompletionQA(**kwargs)
elif internal_model == DanswerGenAIModel.GPT4ALL.value:
return GPT4AllCompletionQA(**kwargs)
elif internal_model == DanswerGenAIModel.GPT4ALL_CHAT.value:
return GPT4AllChatCompletionQA(**kwargs)
elif internal_model == DanswerGenAIModel.HUGGINGFACE.value:
return HuggingFaceCompletionQA(api_key=api_key, **kwargs)
elif internal_model == DanswerGenAIModel.HUGGINGFACE_CHAT.value:
return HuggingFaceChatCompletionQA(api_key=api_key, **kwargs)
elif internal_model == DanswerGenAIModel.REQUEST.value:
if endpoint is None or model_host_type is None:
raise ValueError(
"Request based GenAI model requires an endpoint and host type"
)
if model_host_type == ModelHostType.HUGGINGFACE.value:
# Assuming the user is hosting the smallest-size LLMs, with weaker capabilities and token limits
# With the 7B Llama2 Chat model, there is a max limit of 1512 tokens
# This is the sum of input and output tokens, so cannot take in full Danswer context
return RequestCompletionQA(
endpoint=endpoint,
model_host_type=model_host_type,
api_key=api_key,
prompt_processor=WeakModelFreeformProcessor(),
timeout=timeout,
)
return RequestCompletionQA(
endpoint=endpoint,
model_host_type=model_host_type,
api_key=api_key,
timeout=timeout,
)
else:
raise UnknownModelError(internal_model)
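
A sketch of how the new dispatch might be exercised for a request-hosted model, assuming the QAModel interface exposes answer_question(query, context_docs) as shown in the gpt_4_all.py diff below; the argument values are illustrative, not defaults.

from danswer.direct_qa import get_default_backend_qa_model

# Hypothetical endpoint/model values
qa_model = get_default_backend_qa_model(
    internal_model="request-completion",
    endpoint="https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf",
    model_host_type="huggingface",
    api_key=None,  # fill in if the hosted endpoint requires auth
)
answer = qa_model.answer_question("What is Danswer?", context_docs=[])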
36 changes: 33 additions & 3 deletions backend/danswer/direct_qa/gpt_4_all.py
@@ -1,8 +1,6 @@
from collections.abc import Generator
from typing import Any

from gpt4all import GPT4All # type:ignore

from danswer.chunking.models import InferenceChunk
from danswer.configs.model_configs import GEN_AI_MAX_OUTPUT_TOKENS
from danswer.configs.model_configs import GEN_AI_MODEL_VERSION
@@ -18,9 +16,30 @@
from danswer.utils.logger import setup_logger
from danswer.utils.timing import log_function_time


logger = setup_logger()


class DummyGPT4All:
"""In the case of import failure due to M1 Mac incompatibility,
so this module does not raise exceptions during server startup,
as long as this module isn't actually used"""

def __init__(self, *args: Any, **kwargs: Any) -> None:
raise RuntimeError("GPT4All library not installed.")


try:
from gpt4all import GPT4All # type:ignore
except ImportError:
logger.warning(
"GPT4All library not installed. "
"If you wish to run GPT4ALL (in memory) to power Danswer's "
"Generative AI features, please install gpt4all==1.0.5. "
"As of Aug 2023, this library is not compatible with M1 Mac."
)
GPT4All = DummyGPT4All


GPT4ALL_MODEL: GPT4All | None = None


@@ -56,6 +75,10 @@ def __init__(
self.max_output_tokens = max_output_tokens
self.include_metadata = include_metadata

@property
def requires_api_key(self) -> bool:
return False

def warm_up_model(self) -> None:
get_gpt_4_all_model(self.model_version)

@@ -117,6 +140,13 @@ def __init__(
self.max_output_tokens = max_output_tokens
self.include_metadata = include_metadata

@property
def requires_api_key(self) -> bool:
return False

def warm_up_model(self) -> None:
get_gpt_4_all_model(self.model_version)

@log_function_time()
def answer_question(
self, query: str, context_docs: list[InferenceChunk]
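
The deferred-failure import pattern above generalizes to any optional dependency: bind the name to a stand-in that raises only when it is actually used. A minimal standalone sketch, with a hypothetical package name:

from typing import Any


class MissingDependency:
    """Raises only when the optional dependency is actually used."""

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        raise RuntimeError("Optional package not installed.")


try:
    from some_optional_pkg import Client  # hypothetical optional import
except ImportError:
    Client = MissingDependency  # module imports cleanly; instantiation fails loudly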