refactor(breaking): unify LLM API #283

Merged · 9 commits · Sep 1, 2023
refactor: update naming and envvar
Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
aarnphm committed Sep 1, 2023
commit 025633a90b3e55ade6f9e0f31075eede42ebbcd1
11 changes: 8 additions & 3 deletions README.md
@@ -407,24 +407,29 @@ pip install "openllm[baichuan]"
### Runtime Implementations (Experimental)

Different LLMs may have multiple runtime implementations. For instance, they
might use Pytorch (`pt`), Tensorflow (`tf`), or Flax (`flax`).
might use Pytorch (`pt`), Tensorflow (`tf`), Flax (`flax`) or vLLM (`vllm`).

If you wish to specify a particular runtime for a model, you can do so by
setting the `OPENLLM_{MODEL_NAME}_FRAMEWORK={runtime}` environment variable
setting the `OPENLLM_BACKEND={runtime}` environment variable
before running `openllm start`.

For example, if you want to use the Tensorflow (`tf`) implementation for the
`flan-t5` model, you can use the following command:

```bash
OPENLLM_FLAN_T5_FRAMEWORK=tf openllm start flan-t5
OPENLLM_BACKEND=tf openllm start flan-t5

openllm start flan-t5 --backend tf
```

> [!NOTE]
> For GPU support on Flax, refer to
> [Jax's installation](https://github.com/google/jax#pip-installation-gpu-cuda-installed-via-pip-easier)
> to make sure that you have Jax support for the corresponding CUDA version.
> [!IMPORTANT]
> To use the vLLM backend, a GPU with Ampere or newer architecture and CUDA 11.8 is required.
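
As a sketch of the selection order implied above (assuming the `--backend` flag takes priority over `OPENLLM_BACKEND`, which in turn overrides the model's default):

```python
# Hypothetical sketch of the backend override order; the helper and argument
# names are illustrative, not part of OpenLLM's public API.
import os
import typing as t

def resolve_backend(cli_backend: t.Optional[str] = None, model_default: str = 'pt') -> str:
    # assumed precedence: --backend flag, then OPENLLM_BACKEND, then the model default
    return cli_backend or os.environ.get('OPENLLM_BACKEND') or model_default

print(resolve_backend('vllm'))  # 'vllm', regardless of OPENLLM_BACKEND
print(resolve_backend())        # 'pt' unless OPENLLM_BACKEND is set
```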
### Quantisation

OpenLLM supports quantisation with
1 change: 0 additions & 1 deletion hatch.toml
@@ -31,7 +31,6 @@ check-stubs = [
inplace-changelog = "towncrier build --version main --keep"
quality = [
"./tools/dependencies.py",
"./tools/update-readme.py",
"- ./tools/update-brew-tap.py",
"bash ./tools/sync-readme.sh",
"check-stubs",
6 changes: 3 additions & 3 deletions openllm-client/src/openllm_client/_base.py
@@ -98,7 +98,7 @@ def _hf_agent(self) -> transformers.HfAgent:
raise RuntimeError(
"transformers is required to use HF agent. Install with 'pip install \"openllm-client[agents]\"'.")
if not self.supports_hf_agent:
raise RuntimeError(f'{self.model_name} ({self.framework}) does not support running HF agent.')
raise RuntimeError(f'{self.model_name} ({self.backend}) does not support running HF agent.')
if not is_transformers_supports_agent():
raise RuntimeError(
"Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'"
@@ -125,9 +125,9 @@ def model_id(self) -> str:
raise RuntimeError('Malformed service endpoint. (Possible malicious)') from None

@property
def framework(self) -> LiteralBackend:
def backend(self) -> LiteralBackend:
try:
return self._metadata['framework']
return self._metadata['backend']
except KeyError:
raise RuntimeError('Malformed service endpoint. (Possible malicious)') from None
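
A hedged usage sketch of the renamed client property; the client class and constructor below are assumed for illustration, only the `framework` → `backend` rename comes from this diff:

```python
# Assumed entry point for illustration; `backend` is now read from the service
# metadata ('backend' key) where `framework` used to be.
from openllm_client import HTTPClient  # assumed import

client = HTTPClient('http://localhost:3000')
print(client.model_name, client.backend)
```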

48 changes: 22 additions & 26 deletions openllm-core/src/openllm_core/_configuration.py
@@ -564,7 +564,7 @@ class ModelSettings(t.TypedDict, total=False):
architecture: Required[str]

# default OpenLLM runtime implementation
default_implementation: NotRequired[t.Dict[LiteralResourceSpec, LiteralBackend]]
default_backend: NotRequired[t.Dict[LiteralResourceSpec, LiteralBackend]]

# meta
url: str
@@ -594,7 +594,7 @@ class ModelSettings(t.TypedDict, total=False):

_transformed_type: DictStrAny = {
'fine_tune_strategies': t.Dict[AdapterType, FineTuneConfig],
'default_implementation': t.Dict[LiteralResourceSpec, LiteralBackend]
'default_backend': t.Dict[LiteralResourceSpec, LiteralBackend]
}

@attr.define(frozen=False,
@@ -625,7 +625,7 @@ def default(cls) -> _ModelSettingsAttr:
ModelSettings(default_id='__default__',
model_ids=['__default__'],
architecture='PreTrainedModel',
default_implementation={
default_backend={
'cpu': 'pt',
'nvidia.com/gpu': 'pt'
},
@@ -646,7 +646,7 @@ def default(cls) -> _ModelSettingsAttr:
default_id: str
model_ids: ListStr
architecture: str
default_implementation: t.Dict[LiteralResourceSpec, LiteralBackend]
default_backend: t.Dict[LiteralResourceSpec, LiteralBackend]
url: str
requires_gpu: bool
trust_remote_code: bool
@@ -664,15 +664,14 @@ def default(cls) -> _ModelSettingsAttr:
# update-config-stubs.py: attrs stop

# a heuristic cascading implementation resolver based on available resources
def get_default_implementation(
default_implementation_mapping: dict[LiteralResourceSpec, LiteralBackend]) -> LiteralBackend:
def get_default_backend(backend_mapping: dict[LiteralResourceSpec, LiteralBackend]) -> LiteralBackend:
available_spec = available_resource_spec()
if resource_spec('tpu') in available_spec: return default_implementation_mapping.get(resource_spec('tpu'), 'pt')
elif resource_spec('amd') in available_spec: return default_implementation_mapping.get(resource_spec('amd'), 'pt')
if resource_spec('tpu') in available_spec: return backend_mapping.get(resource_spec('tpu'), 'pt')
elif resource_spec('amd') in available_spec: return backend_mapping.get(resource_spec('amd'), 'pt')
elif resource_spec('nvidia') in available_spec:
return default_implementation_mapping.get(resource_spec('nvidia'), 'pt')
return backend_mapping.get(resource_spec('nvidia'), 'pt')
else:
return default_implementation_mapping.get(resource_spec('cpu'), 'pt')
return backend_mapping.get(resource_spec('cpu'), 'pt')

def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ModelSettingsAttr:
if 'generation_class' in cl_.__config__:
@@ -698,14 +697,14 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _

model_name = _final_value_dct['model_name'] if 'model_name' in _final_value_dct else _settings_attr.model_name
# if the default implementation dependencies don't exist, then always fall back to 'pt'
default_implementation = _settings_attr.default_implementation
for rs, runtime in default_implementation.items():
default_backend = _settings_attr.default_backend
for rs, runtime in default_backend.items():
library_stub = 'torch' if runtime == 'pt' else runtime
if not BACKENDS_MAPPING[library_stub][0](): default_implementation[rs] = 'pt'
_final_value_dct['default_implementation'] = default_implementation
if not BACKENDS_MAPPING[library_stub][0](): default_backend[rs] = 'pt'
_final_value_dct['default_backend'] = default_backend

env = openllm_core.utils.EnvVarMixin(model_name,
get_default_implementation(default_implementation),
backend=get_default_backend(default_backend),
model_id=_settings_attr.default_id)
_final_value_dct['env'] = env

@@ -861,11 +860,8 @@ def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None:
```bash
openllm start gpt-neox --model-id stabilityai/stablelm-tuned-alpha-3b
```'''
__openllm_default_implementation__: t.Dict[LiteralResourceSpec, LiteralBackend] = Field(None)
'''The default runtime to run this LLM. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`.

It is a dictionary of key as the accelerator spec in k4s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM Runtime ('flax', 'tf', 'pt', 'vllm')
'''
__openllm_default_backend__: t.Dict[LiteralResourceSpec, LiteralBackend] = Field(None)
'''The default backend to run LLM based on available accelerator. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`. It is a dictionary of key as the accelerator spec in k8s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM backend ('flax', 'tf', 'pt', 'vllm', 'ggml', 'mlc')'''
__openllm_url__: str = Field(None)
'''The resolved url for this LLMConfig.'''
__openllm_requires_gpu__: bool = Field(None)
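
For illustration, a hypothetical mapping of the shape `__openllm_default_backend__` describes, keyed by accelerator spec:

```python
# Hypothetical values; each model config defines its own mapping.
default_backend = {
    'cpu': 'pt',                         # PyTorch on CPU
    'nvidia.com/gpu': 'vllm',            # prefer vLLM when an NVIDIA GPU is present
    'cloud-tpus.google.com/v2': 'flax',  # prefer Flax/JAX on TPU
}
```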
@@ -1193,8 +1189,8 @@ def _make_subclass(class_attr: str,
annotated_names.add(attr_name)
val = cd.get(attr_name, attr.NOTHING)
if not isinstance(val, _CountingAttr):
if val is attr.NOTHING: val = cls.Field(env=field_env_key(cls.__openllm_model_name__, attr_name))
else: val = cls.Field(default=val, env=field_env_key(cls.__openllm_model_name__, attr_name))
if val is attr.NOTHING: val = cls.Field(env=field_env_key(attr_name))
else: val = cls.Field(default=val, env=field_env_key(attr_name))
these[attr_name] = val
unannotated = ca_names - annotated_names
if len(unannotated) > 0:
@@ -1274,7 +1270,7 @@ def __getitem__(self, item: t.Literal['model_ids']) -> ListStr: ...
@overload
def __getitem__(self, item: t.Literal['architecture']) -> str: ...
@overload
def __getitem__(self, item: t.Literal['default_implementation']) -> t.Dict[LiteralResourceSpec, LiteralBackend]: ...
def __getitem__(self, item: t.Literal['default_backend']) -> t.Dict[LiteralResourceSpec, LiteralBackend]: ...
@overload
def __getitem__(self, item: t.Literal['url']) -> str: ...
@overload
@@ -1640,9 +1636,9 @@ def peft_task_type(cls) -> str:
return _PEFT_TASK_TYPE_TARGET_MAPPING[cls.__openllm_model_type__]

@classmethod
def default_implementation(cls) -> LiteralBackend:
return first_not_none(cls.__openllm_env__['framework_value'],
default=get_default_implementation(cls.__openllm_default_implementation__))
def default_backend(cls) -> LiteralBackend:
return first_not_none(cls.__openllm_env__['backend_value'],
default=get_default_backend(cls.__openllm_default_backend__))
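
Putting the pieces together, a minimal sketch (with hypothetical helper names) of how the cascade above behaves: an explicit `OPENLLM_BACKEND` wins, otherwise the first matching accelerator in TPU > AMD > NVIDIA > CPU order selects the backend, falling back to PyTorch:

```python
# Simplified stand-in for get_default_backend / default_backend; the resource
# spec strings follow the docstring earlier in this file, the rest is illustrative.
import os
import typing as t

def pick_backend(mapping: t.Dict[str, str], available: t.Sequence[str]) -> str:
    env_value = os.environ.get('OPENLLM_BACKEND')
    if env_value:
        return env_value
    for spec in ('cloud-tpus.google.com/v2', 'amd.com/gpu', 'nvidia.com/gpu'):
        if spec in available:
            return mapping.get(spec, 'pt')
    return mapping.get('cpu', 'pt')

print(pick_backend({'nvidia.com/gpu': 'vllm'}, ['nvidia.com/gpu']))  # 'vllm'
```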

def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]:
'''This handler will sanitize all attrs and setup prompt text.
2 changes: 1 addition & 1 deletion openllm-core/src/openllm_core/_schema.py
@@ -77,7 +77,7 @@ class MetadataOutput:
model_id: str
timeout: int
model_name: str
framework: str
backend: str
configuration: str
supports_embeddings: bool
supports_hf_agent: bool
5 changes: 4 additions & 1 deletion openllm-core/src/openllm_core/_typing_compat.py
@@ -37,6 +37,9 @@
't.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]'
)

def get_literal_args(typ: t.Any) -> tuple[str, ...]:
return getattr(typ, '__args__')
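
A quick illustration of the new helper with a stand-in literal type (the stand-in is hypothetical; the real `LiteralBackend` is defined elsewhere in this module):

```python
import typing as t

BackendDemo = t.Literal['pt', 'tf', 'flax', 'vllm']  # stand-in for illustration
assert get_literal_args(BackendDemo) == ('pt', 'tf', 'flax', 'vllm')
```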

AnyCallable = t.Callable[..., t.Any]
DictStrAny = t.Dict[str, t.Any]
ListAny = t.List[t.Any]
@@ -109,7 +112,7 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
identifying_params: dict[str, t.Any]
llm: openllm.LLM[M, T]
config: openllm.LLMConfig
implementation: LiteralBackend
backend: LiteralBackend
supports_embeddings: bool
supports_hf_agent: bool
has_adapters: bool
openllm-core/src/openllm_core/config/configuration_flan_t5.py
@@ -17,14 +17,14 @@
By default, this model will use the PyTorch model for inference. However, this model supports both Flax and Tensorflow.

\b
- To use Flax, set the environment variable ``OPENLLM_FLAN_T5_FRAMEWORK="flax"``
- To use Flax, set the environment variable ``OPENLLM_BACKEND="flax"``

\b
- To use Tensorflow, set the environment variable ``OPENLLM_FLAN_T5_FRAMEWORK="tf"``
- To use Tensorflow, set the environment variable ``OPENLLM_BACKEND="tf"``

\b
FLAN-T5 Runner will use google/flan-t5-large as the default model. To change to any other FLAN-T5
saved pretrained, or a fine-tune FLAN-T5, provide ``OPENLLM_FLAN_T5_MODEL_ID='google/flan-t5-xxl'``
saved pretrained, or a fine-tune FLAN-T5, provide ``OPENLLM_MODEL_ID='google/flan-t5-xxl'``
or provide `--model-id` flag when running ``openllm start flan-t5``:

\b
9 changes: 6 additions & 3 deletions openllm-core/src/openllm_core/config/configuration_llama.py
@@ -19,11 +19,14 @@
This model also supports PyTorch.

\b
- To use PyTorch, set the environment variable ``OPENLLM_LLAMA_FRAMEWORK="pt"``
- To use PyTorch, set the environment variable ``OPENLLM_BACKEND="pt"``

\b
- To use vLLM, set the environment variable ``OPENLLM_BACKEND="vllm"``

\b
Llama Runner will use decapoda-research/llama-7b-hf as the default model. To change to any other Llama
saved pretrained, or a fine-tune Llama, provide ``OPENLLM_LLAMA_MODEL_ID='openlm-research/open_llama_7b_v2'``
saved pretrained, or a fine-tune Llama, provide ``OPENLLM_MODEL_ID='openlm-research/open_llama_7b_v2'``
or provide `--model-id` flag when running ``openllm start llama``:

\b
@@ -70,7 +73,7 @@ class LlamaConfig(openllm_core.LLMConfig):
'lowercase',
'url':
'https://github.com/facebookresearch/llama',
'default_implementation': {
'default_backend': {
'cpu': 'pt',
'nvidia.com/gpu': 'pt'
},
6 changes: 3 additions & 3 deletions openllm-core/src/openllm_core/config/configuration_opt.py
@@ -18,14 +18,14 @@
By default, this model will use the PyTorch model for inference. However, this model supports both Flax and Tensorflow.

\b
- To use Flax, set the environment variable ``OPENLLM_OPT_FRAMEWORK="flax"``
- To use Flax, set the environment variable ``OPENLLM_BACKEND="flax"``

\b
- To use Tensorflow, set the environment variable ``OPENLLM_OPT_FRAMEWORK="tf"``
- To use Tensorflow, set the environment variable ``OPENLLM_BACKEND="tf"``

\b
OPT Runner will use facebook/opt-2.7b as the default model. To change to any other OPT
saved pretrained, or a fine-tune OPT, provide ``OPENLLM_OPT_MODEL_ID='facebook/opt-6.7b'``
saved pretrained, or a fine-tune OPT, provide ``OPENLLM_MODEL_ID='facebook/opt-6.7b'``
or provide `--model-id` flag when running ``openllm start opt``:

\b
12 changes: 8 additions & 4 deletions openllm-core/src/openllm_core/utils/__init__.py
@@ -96,6 +96,9 @@ def generate_hash_from_file(f: str, algorithm: t.Literal['md5', 'sha1'] = 'sha1'
def device_count() -> int:
return len(available_devices())

def check_bool_env(env: str, default: bool = True) -> bool:
return os.environ.get(env, str(default)).upper() in ENV_VARS_TRUE_VALUES
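
A brief usage sketch, assuming `ENV_VARS_TRUE_VALUES` contains the usual truthy strings such as '1', 'TRUE', 'YES' and 'ON':

```python
import os

os.environ['OPENLLM_DO_NOT_TRACK'] = 'true'                           # truthy string
assert check_bool_env('OPENLLM_DO_NOT_TRACK', default=False) is True

del os.environ['OPENLLM_DO_NOT_TRACK']                                # unset -> default
assert check_bool_env('OPENLLM_DO_NOT_TRACK', default=False) is False
```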

# equivocal setattr to save one lookup per assignment
_object_setattr = object.__setattr__

@@ -108,10 +111,10 @@ def field_env_key(key: str, suffix: str | None = None) -> str:
return '_'.join(filter(None, map(str.upper, ['OPENLLM', suffix.strip('_') if suffix else '', key])))
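
With the per-model prefix gone, the generated keys are model-agnostic; following the join logic above:

```python
assert field_env_key('model_id') == 'OPENLLM_MODEL_ID'
# the suffix argument is inserted between 'OPENLLM' and the key (illustrative call):
assert field_env_key('backend', suffix='value') == 'OPENLLM_VALUE_BACKEND'
```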

# Special debug flag controlled via OPENLLMDEVDEBUG
DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment) or (str(os.environ.get(
DEV_DEBUG_VAR, None)).upper() in ENV_VARS_TRUE_VALUES)
DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and check_bool_env(DEV_DEBUG_VAR, default=False))
# Whether to show the codegen for debug purposes
SHOW_CODEGEN: bool = DEBUG and int(os.environ.get('OPENLLMDEVDEBUG', str(0))) > 3
SHOW_CODEGEN: bool = DEBUG and (os.environ.get(DEV_DEBUG_VAR, str(0)).isdigit() and
int(os.environ.get(DEV_DEBUG_VAR, str(0))) > 3)
# MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins
MYPY = False

@@ -195,6 +198,7 @@ def configure_logging() -> None:
_LOGGING_CONFIG['loggers']['bentoml']['level'] = logging.ERROR
_LOGGING_CONFIG['root']['level'] = logging.ERROR
elif get_debug_mode() or DEBUG:
_LOGGING_CONFIG['handlers']['defaulthandler']['level'] = logging.DEBUG
_LOGGING_CONFIG['loggers']['openllm']['level'] = logging.DEBUG
_LOGGING_CONFIG['loggers']['bentoml']['level'] = logging.DEBUG
_LOGGING_CONFIG['root']['level'] = logging.DEBUG
@@ -332,8 +336,8 @@ def normalize_attrs_to_model_tokenizer_pair(**attrs: t.Any) -> tuple[dict[str, t
'analytics': [],
'codegen': [],
'dantic': [],
'lazy': [],
'representation': ['ReprMixin'],
'lazy': ['LazyModule'],
'import_utils': [
'OPTIONAL_DEPENDENCIES', 'DummyMetaclass', 'EnvVarMixin', 'require_backends', 'is_cpm_kernels_available',
'is_einops_available', 'is_flax_available', 'is_tf_available', 'is_vllm_available', 'is_torch_available',
3 changes: 1 addition & 2 deletions openllm-core/src/openllm_core/utils/analytics.py
@@ -24,11 +24,10 @@

# This variable is a proxy that will control BENTOML_DO_NOT_TRACK
OPENLLM_DO_NOT_TRACK = 'OPENLLM_DO_NOT_TRACK'
DO_NOT_TRACK = os.environ.get(OPENLLM_DO_NOT_TRACK, str(False)).upper()

@functools.lru_cache(maxsize=1)
def do_not_track() -> bool:
return DO_NOT_TRACK in openllm_core.utils.ENV_VARS_TRUE_VALUES
return openllm_core.utils.check_bool_env(OPENLLM_DO_NOT_TRACK)

@functools.lru_cache(maxsize=1)
def _usage_event_debugging() -> bool: