refactor(breaking): unify LLM API #283

Merged · 9 commits · Sep 1, 2023
Changes from 1 commit
fix: rename backend and cleanup runtime [wip]
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
aarnphm committed Aug 31, 2023
commit f224e70108b4bf747c592013ee6efdf6f21eeae6
3 changes: 1 addition & 2 deletions .github/SECURITY.md
@@ -8,8 +8,7 @@ are backward compatible. We are more lenient with patch as the development can
move quickly.

If you are just using public API, then feel free to always upgrade. Whenever
there is a breaking policies, it will become a `DeprecationWarning` with a
period of 12 months before becoming broken.
there is a breaking policies, it will be announced and will be broken.

> [!WARNING]
> Everything package under `openllm` that has an underscore prefixes
4 changes: 2 additions & 2 deletions openllm-client/src/openllm_client/_base.py
@@ -28,7 +28,7 @@
import transformers

from openllm_core._typing_compat import DictStrAny
from openllm_core._typing_compat import LiteralRuntime
from openllm_core._typing_compat import LiteralBackend

logger = logging.getLogger(__name__)

@@ -125,7 +125,7 @@ def model_id(self) -> str:
raise RuntimeError('Malformed service endpoint. (Possible malicious)') from None

@property
def framework(self) -> LiteralRuntime:
def framework(self) -> LiteralBackend:
try:
return self._metadata['framework']
except KeyError:
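The client-side effect of this rename is purely at the type level: `framework` still reads the same `framework` key from the server metadata, but its annotation now draws from the wider `LiteralBackend` union. A minimal sketch of a caller relying on it, assuming the usual `HTTPClient` entry point (the import path and server address here are illustrative):

```python
import typing as t

from openllm_client import HTTPClient  # assumed import path for this client package

client = HTTPClient('http://localhost:3000')  # hypothetical local server
backend: t.Literal['pt', 'tf', 'flax', 'vllm', 'ggml', 'mlc'] = client.framework
if backend == 'vllm':
  # e.g. toggle behaviour that only the vLLM backend supports
  ...
```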
60 changes: 25 additions & 35 deletions openllm-core/src/openllm_core/_configuration.py
@@ -66,7 +66,7 @@ class GenerationConfig:
from ._typing_compat import At
from ._typing_compat import DictStrAny
from ._typing_compat import ListStr
from ._typing_compat import LiteralRuntime
from ._typing_compat import LiteralBackend
from ._typing_compat import LiteralString
from ._typing_compat import NotRequired
from ._typing_compat import Required
@@ -311,7 +311,7 @@ class GenerationConfig(ReprMixin):
eta_cutoff: float = dantic.Field(
0.0,
description=
'''Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details. '''
'Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details. '
)
diversity_penalty: float = dantic.Field(
0.0,
@@ -386,17 +386,17 @@ class GenerationConfig(ReprMixin):
output_attentions: bool = dantic.Field(
False,
description=
'''Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.'''
'Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.'
)
output_hidden_states: bool = dantic.Field(
False,
description=
'''Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.'''
'Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.'
)
output_scores: bool = dantic.Field(
False,
description=
'''Whether or not to return the prediction scores. See `scores` under returned tensors for more details.''')
description='Whether or not to return the prediction scores. See `scores` under returned tensors for more details.'
)
pad_token_id: int = dantic.Field(description='The id of the *padding* token.')
bos_token_id: int = dantic.Field(description='The id of the *beginning-of-sequence* token.')
eos_token_id: t.Union[int, t.List[int]] = dantic.Field(
@@ -564,7 +564,7 @@ class ModelSettings(t.TypedDict, total=False):
architecture: Required[str]

# default OpenLLM runtime imlementation
default_implementation: NotRequired[t.Dict[LiteralResourceSpec, LiteralRuntime]]
default_implementation: NotRequired[t.Dict[LiteralResourceSpec, LiteralBackend]]

# meta
url: str
@@ -575,7 +575,6 @@

# llm implementation specifics
model_type: t.Literal['causal_lm', 'seq2seq_lm']
runtime: t.Literal['transformers', 'ggml']

# naming convention, only name_type is needed to infer from the class
# as the three below it can be determined automatically
@@ -595,7 +594,7 @@ class ModelSettings(t.TypedDict, total=False):

_transformed_type: DictStrAny = {
'fine_tune_strategies': t.Dict[AdapterType, FineTuneConfig],
'default_implementation': t.Dict[LiteralResourceSpec, LiteralRuntime]
'default_implementation': t.Dict[LiteralResourceSpec, LiteralBackend]
}

@attr.define(frozen=False,
@@ -639,23 +638,21 @@ def default(cls) -> _ModelSettingsAttr:
tokenizer_class=None,
timeout=int(36e6),
service_name='',
workers_per_resource=1.,
runtime='transformers')))
workers_per_resource=1.)))

# NOTE: The below are dynamically generated by the field_transformer
if t.TYPE_CHECKING:
# update-config-stubs.py: attrs start
default_id: str
model_ids: ListStr
architecture: str
default_implementation: t.Dict[LiteralResourceSpec, LiteralRuntime]
default_implementation: t.Dict[LiteralResourceSpec, LiteralBackend]
url: str
requires_gpu: bool
trust_remote_code: bool
service_name: str
requirements: t.Optional[ListStr]
model_type: t.Literal['causal_lm', 'seq2seq_lm']
runtime: t.Literal['transformers', 'ggml']
name_type: t.Optional[t.Literal['dasherize', 'lowercase']]
model_name: str
start_name: str
@@ -668,7 +665,7 @@ def default(cls) -> _ModelSettingsAttr:

# a heuristic cascading implementation resolver based on available resources
def get_default_implementation(
default_implementation_mapping: dict[LiteralResourceSpec, LiteralRuntime]) -> LiteralRuntime:
default_implementation_mapping: dict[LiteralResourceSpec, LiteralBackend]) -> LiteralBackend:
available_spec = available_resource_spec()
if resource_spec('tpu') in available_spec: return default_implementation_mapping.get(resource_spec('tpu'), 'pt')
elif resource_spec('amd') in available_spec: return default_implementation_mapping.get(resource_spec('amd'), 'pt')
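For context, a hedged sketch of the cascade this resolver implements: probe the most specific accelerator first and fall back to `'pt'` when no mapping entry matches. Only the `'tpu'` and `'amd'` branches are visible in this hunk, so the `'nvidia'` and `'cpu'` steps below are assumptions:

```python
from __future__ import annotations

import typing as t

from openllm_core._typing_compat import LiteralBackend

# Stand-in for the resolver above; real code keys the mapping by
# resource_spec(...) results rather than these short names.
def pick_backend(available: t.Sequence[str], mapping: dict[str, LiteralBackend]) -> LiteralBackend:
  for spec in ('tpu', 'amd', 'nvidia', 'cpu'):  # 'nvidia' and 'cpu' branches are assumed
    if spec in available:
      return mapping.get(spec, 'pt')  # fall back to PyTorch, per the defaults above
  return 'pt'
```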
@@ -766,16 +763,16 @@ class _ConfigAttr:

@staticmethod
def Field(default: t.Any = None, **attrs: t.Any) -> t.Any:
return dantic.Field(default, **attrs)
'''Field is a alias to the internal dantic utilities to easily create
attrs.fields with pydantic-compatible interface. For example:

'''Field is a alias to the internal dantic utilities to easily create
attrs.fields with pydantic-compatible interface. For example:

```python
class MyModelConfig(openllm.LLMConfig):
field1 = openllm.LLMConfig.Field(...)
```
```python
class MyModelConfig(openllm.LLMConfig):
field1 = openllm.LLMConfig.Field(...)
```
'''
return dantic.Field(default, **attrs)

# NOTE: The following is handled via __init_subclass__, and is only used for TYPE_CHECKING
if t.TYPE_CHECKING:
# NOTE: public attributes to override
@@ -864,7 +861,7 @@ def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None:
```bash
openllm start gpt-neox --model-id stabilityai/stablelm-tuned-alpha-3b
```'''
__openllm_default_implementation__: t.Dict[LiteralResourceSpec, LiteralRuntime] = Field(None)
__openllm_default_implementation__: t.Dict[LiteralResourceSpec, LiteralBackend] = Field(None)
'''The default runtime to run this LLM. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`.

It is a dictionary of key as the accelerator spec in k4s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM Runtime ('flax', 'tf', 'pt', 'vllm')
@@ -876,16 +873,11 @@ def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None:
__openllm_trust_remote_code__: bool = Field(None)
'''Whether to always trust remote code'''
__openllm_service_name__: str = Field(None)
"""Generated service name for this LLMConfig. By default, it is 'generated_{model_name}_service.py'"""
'''Generated service name for this LLMConfig. By default, it is "generated_{model_name}_service.py"'''
__openllm_requirements__: t.Optional[ListStr] = Field(None)
'''The default PyPI requirements needed to run this given LLM. By default, we will depend on
bentoml, torch, transformers.'''
'''The default PyPI requirements needed to run this given LLM. By default, we will depend on bentoml, torch, transformers.'''
__openllm_model_type__: t.Literal['causal_lm', 'seq2seq_lm'] = Field(None)
'''The model type for this given LLM. By default, it should be causal language modeling.
Currently supported 'causal_lm' or 'seq2seq_lm'
'''
__openllm_runtime__: t.Literal['transformers', 'ggml'] = Field(None)
'''The runtime to use for this model. Possible values are `transformers` or `ggml`. See Llama for more information.'''
'''The model type for this given LLM. By default, it should be causal language modeling. Currently supported "causal_lm" or "seq2seq_lm"'''
__openllm_name_type__: t.Optional[t.Literal['dasherize', 'lowercase']] = Field(None)
'''The default name typed for this model. "dasherize" will convert the name to lowercase and
replace spaces with dashes. "lowercase" will convert the name to lowercase. If this is not set, then both
@@ -1282,7 +1274,7 @@ def __getitem__(self, item: t.Literal['model_ids']) -> ListStr: ...
@overload
def __getitem__(self, item: t.Literal['architecture']) -> str: ...
@overload
def __getitem__(self, item: t.Literal['default_implementation']) -> t.Dict[LiteralResourceSpec, LiteralRuntime]: ...
def __getitem__(self, item: t.Literal['default_implementation']) -> t.Dict[LiteralResourceSpec, LiteralBackend]: ...
@overload
def __getitem__(self, item: t.Literal['url']) -> str: ...
@overload
@@ -1296,8 +1288,6 @@ def __getitem__(self, item: t.Literal['requirements']) -> t.Optional[ListStr]: .
@overload
def __getitem__(self, item: t.Literal['model_type']) -> t.Literal['causal_lm', 'seq2seq_lm']: ...
@overload
def __getitem__(self, item: t.Literal['runtime']) -> t.Literal['transformers', 'ggml']: ...
@overload
def __getitem__(self, item: t.Literal['name_type']) -> t.Optional[t.Literal['dasherize', 'lowercase']]: ...
@overload
def __getitem__(self, item: t.Literal['model_name']) -> str: ...
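With the `runtime` overload dropped, typed subscript access covers only the remaining keys. A sketch, assuming `config` is an instance of any `LLMConfig` subclass:

```python
impl = config['default_implementation']  # typed t.Dict[LiteralResourceSpec, LiteralBackend]
config['runtime']  # no overload (and no underlying key) anymore; this now fails
```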
@@ -1650,7 +1640,7 @@ def peft_task_type(cls) -> str:
return _PEFT_TASK_TYPE_TARGET_MAPPING[cls.__openllm_model_type__]

@classmethod
def default_implementation(cls) -> LiteralRuntime:
def default_implementation(cls) -> LiteralBackend:
return first_not_none(cls.__openllm_env__['framework_value'],
default=get_default_implementation(cls.__openllm_default_implementation__))

5 changes: 2 additions & 3 deletions openllm-core/src/openllm_core/_typing_compat.py
@@ -44,7 +44,7 @@
TupleAny = t.Tuple[t.Any, ...]
At = t.TypeVar('At', bound=attr.AttrsInstance)

LiteralRuntime = t.Literal['pt', 'tf', 'flax', 'vllm']
LiteralBackend = t.Literal['pt', 'tf', 'flax', 'vllm', 'ggml', 'mlc']
AdapterType = t.Literal['lora', 'adalora', 'adaption_prompt', 'prefix_tuning', 'p_tuning', 'prompt_tuning', 'ia3']

# TODO: support quay
@@ -96,7 +96,6 @@ class LLMRunnable(bentoml.Runnable, t.Generic[M, T]):
SUPPORTED_RESOURCES = ('amd.com/gpu', 'nvidia.com/gpu', 'cpu')
SUPPORTS_CPU_MULTI_THREADING = True
__call__: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]]
set_adapter: RunnableMethod[LLMRunnable[M, T], [str], dict[t.Literal['success', 'error_msg'], bool | str]]
embeddings: RunnableMethod[LLMRunnable[M, T], [list[str]], EmbeddingsOutput]
generate: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]]
generate_one: RunnableMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal['generated_text'], str]]]
@@ -110,7 +109,7 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
identifying_params: dict[str, t.Any]
llm: openllm.LLM[M, T]
config: openllm.LLMConfig
implementation: LiteralRuntime
implementation: LiteralBackend
supports_embeddings: bool
supports_hf_agent: bool
has_adapters: bool
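The widened literal means values such as `'ggml'` and `'mlc'` now type-check anywhere a backend is expected. A quick sketch (the helper function is illustrative, not OpenLLM API):

```python
from openllm_core._typing_compat import LiteralBackend

def describe(backend: LiteralBackend) -> str:  # illustrative helper
  return f'backend={backend}'

describe('ggml')  # valid now; 'ggml' was not part of the old LiteralRuntime union
```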
@@ -48,14 +48,14 @@ def get_special_token_id(tokenizer: transformers.PreTrainedTokenizer, key: str)
treated specially and converted to a single, new token. This retrieves the token ID each of these keys map to.

Args:
tokenizer: the tokenizer
key: the key to convert to a single token
tokenizer: the tokenizer
key: the key to convert to a single token

Raises:
RuntimeError: if more than one ID was generated
RuntimeError: if more than one ID was generated

Returns:
int: the token ID for the given key.
int: the token ID for the given key.
'''
token_ids = tokenizer.encode(key)
if len(token_ids) > 1: raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
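Hypothetical usage of the contract documented above, assuming the key was registered as a single added special token (the model and token string are illustrative):

```python
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')  # illustrative model choice
tokenizer.add_special_tokens({'additional_special_tokens': ['### End']})
end_id = get_special_token_id(tokenizer, '### End')  # one ID; raises ValueError otherwise
```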
12 changes: 6 additions & 6 deletions openllm-core/src/openllm_core/utils/__init__.py
@@ -47,12 +47,12 @@
try:
from typing import GenericAlias as _TypingGenericAlias # type: ignore
except ImportError:
_TypingGenericAlias = (
) # type: ignore # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on)
# python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on)
_TypingGenericAlias = () # type: ignore
if sys.version_info < (3, 10): _WithArgsTypes = (_TypingGenericAlias,)
else:
_WithArgsTypes: t.Any = (t._GenericAlias, types.GenericAlias, types.UnionType
) # type: ignore # _GenericAlias is the actual GenericAlias implementation
# _GenericAlias is the actual GenericAlias implementation
_WithArgsTypes: t.Any = (t._GenericAlias, types.GenericAlias, types.UnionType) # type: ignore

DEV_DEBUG_VAR = 'OPENLLMDEVDEBUG'

@@ -104,8 +104,8 @@ def non_intrusive_setattr(obj: t.Any, name: str, value: t.Any) -> None:
_setattr = functools.partial(setattr, obj) if isinstance(obj, type) else _object_setattr.__get__(obj)
if not hasattr(obj, name): _setattr(name, value)

def field_env_key(model_name: str, key: str, suffix: str | None = None) -> str:
return '_'.join(filter(None, map(str.upper, ['OPENLLM', model_name, suffix.strip('_') if suffix else '', key])))
def field_env_key(key: str, suffix: str | None = None) -> str:
return '_'.join(filter(None, map(str.upper, ['OPENLLM', suffix.strip('_') if suffix else '', key])))

# Special debug flag controled via OPENLLMDEVDEBUG
DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment) or (str(os.environ.get(
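Dropping `model_name` from the signature makes the generated variable names global rather than per-model. Based on the new implementation above:

```python
field_env_key('model_id')                  # -> 'OPENLLM_MODEL_ID'
field_env_key('quantize', suffix='gptq_')  # suffix stripped of '_': 'OPENLLM_GPTQ_QUANTIZE'
```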
7 changes: 2 additions & 5 deletions openllm-core/src/openllm_core/utils/codegen.py
@@ -139,11 +139,8 @@ def identity(_: str, x_value: t.Any) -> t.Any:
'__model_name': model_name,
})
lines: ListStr = [
'__env = lambda field_name: __field_env(__model_name, field_name, __suffix)', 'return [', ' f.evolve(',
' default=__populate_env(__default_callback(f.name, f.default), __env(f.name)),', ' metadata={',
" 'env': f.metadata.get('env', __env(f.name)),",
" 'description': f.metadata.get('description', '(not provided)'),", ' },', ' )',
' for f in fields', ']'
'__env=lambda field_name:__field_env(field_name,__suffix)',
"return [f.evolve(default=__populate_env(__default_callback(f.name,f.default),__env(f.name)),metadata={'env':f.metadata.get('env',__env(f.name)),'description':f.metadata.get('description', '(not provided)')}) for f in fields]"
]
fields_ann = 'list[attr.Attribute[t.Any]]'
return generate_function(cls,
32 changes: 8 additions & 24 deletions openllm-core/src/openllm_core/utils/import_utils.py
@@ -24,7 +24,7 @@

if t.TYPE_CHECKING:
BackendOrderedDict = OrderedDict[str, t.Tuple[t.Callable[[], bool], str]]
from openllm_core._typing_compat import LiteralRuntime
from openllm_core._typing_compat import LiteralBackend

logger = logging.getLogger(__name__)
OPTIONAL_DEPENDENCIES = {
@@ -337,7 +337,6 @@ class EnvVarMixin(ReprMixin):
model_id: str
quantize: str
framework: str
runtime: str

@overload
def __getitem__(self, item: t.Literal['config']) -> str:
@@ -356,11 +355,7 @@ def __getitem__(self, item: t.Literal['framework']) -> str:
...

@overload
def __getitem__(self, item: t.Literal['runtime']) -> str:
...

@overload
def __getitem__(self, item: t.Literal['framework_value']) -> LiteralRuntime:
def __getitem__(self, item: t.Literal['framework_value']) -> LiteralBackend:
...

@overload
@@ -371,52 +366,41 @@ def __getitem__(self, item: t.Literal['quantize_value']) -> t.Literal['int8', 'i
def __getitem__(self, item: t.Literal['model_id_value']) -> str | None:
...

@overload
def __getitem__(self, item: t.Literal['runtime_value']) -> t.Literal['ggml', 'transformers']:
...

def __getitem__(self, item: str | t.Any) -> t.Any:
if item.endswith('_value') and hasattr(self, f'_{item}'): return object.__getattribute__(self, f'_{item}')()
elif hasattr(self, item): return getattr(self, item)
raise KeyError(f'Key {item} not found in {self}')

def __init__(self,
model_name: str,
implementation: LiteralRuntime = 'pt',
implementation: LiteralBackend = 'pt',
model_id: str | None = None,
quantize: LiteralString | None = None,
runtime: t.Literal['ggml', 'transformers'] = 'transformers') -> None:
quantize: LiteralString | None = None) -> None:
'''EnvVarMixin is a mixin class that returns the value extracted from environment variables.'''
from openllm_core.utils import field_env_key
self.model_name = inflection.underscore(model_name)
self._implementation = implementation
self._model_id = model_id
self._quantize = quantize
self._runtime = runtime
for att in {'config', 'model_id', 'quantize', 'framework', 'runtime'}:
for att in {'config', 'model_id', 'quantize', 'framework'}:
setattr(self, att, field_env_key(self.model_name, att.upper()))

def _quantize_value(self) -> t.Literal['int8', 'int4', 'gptq'] | None:
from . import first_not_none
return t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']],
first_not_none(os.environ.get(self['quantize']), default=self._quantize))

def _framework_value(self) -> LiteralRuntime:
def _framework_value(self) -> LiteralBackend:
from . import first_not_none
return t.cast(LiteralRuntime, first_not_none(os.environ.get(self['framework']), default=self._implementation))
return t.cast(LiteralBackend, first_not_none(os.environ.get(self['framework']), default=self._implementation))

def _model_id_value(self) -> str | None:
from . import first_not_none
return first_not_none(os.environ.get(self['model_id']), default=self._model_id)

def _runtime_value(self) -> t.Literal['ggml', 'transformers']:
from . import first_not_none
return t.cast(t.Literal['ggml', 'transformers'],
first_not_none(os.environ.get(self['runtime']), default=self._runtime))

@property
def __repr_keys__(self) -> set[str]:
return {'config', 'model_id', 'quantize', 'framework', 'runtime'}
return {'config', 'model_id', 'quantize', 'framework'}

@property
def start_docstring(self) -> str:
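A minimal sketch of the slimmed-down mixin in use, based on the constructor shown above (the model name is illustrative):

```python
env = EnvVarMixin('flan-t5', implementation='vllm')
env['framework_value']  # os.environ override if set, otherwise 'vllm'
env['model_id_value']   # os.environ override if set, otherwise None
env['runtime_value']    # now raises KeyError: the runtime axis is gone
```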