refactor(breaking): unify LLM API #283

Merged · 9 commits · Sep 1, 2023
Changes from 1 commit
chore: remove bettertransformer
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
aarnphm committed Aug 30, 2023
commit 939a1dc49abbf1c6ebf66dd39f6d539b5e00bc0a
5 changes: 0 additions & 5 deletions CHANGELOG.md
@@ -721,9 +721,6 @@ No significant changes.
 `openllm start` now support `--quantize int8` and `--quantize int4` `GPTQ`
 quantization support is on the roadmap and currently being worked on.

-`openllm start` now also support `--bettertransformer` to use
-`BetterTransformer` for serving.
-
 Refactored `openllm.LLMConfig` to be able to use with `__getitem__`:
 `openllm.DollyV2Config()['requirements']`.

@@ -732,8 +729,6 @@ No significant changes.

 Added `towncrier` workflow to easily generate changelog entries

-Added `use_pipeline`, `bettertransformer` flag into ModelSettings
-
 `LLMConfig` now supported `__dataclass_transform__` protocol to help with
 type-checking
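The `__getitem__` refactor called out in this changelog entry is the basis for the typed overloads touched later in this diff. A minimal sketch of the access pattern, using the config class and key quoted in the entry (the printed value depends on the model's defaults):

```python
import openllm

# Subscript access on a config instance; 'requirements' is one of the
# typed keys exposed through the __getitem__ overloads in _configuration.py.
config = openllm.DollyV2Config()
print(config['requirements'])  # a list of extra PyPI packages, or None
```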
@@ -31,10 +31,10 @@
 (s/def ::model_id (s/coll-of string? :kind vector?)) ;; model_id is a vector of all models for a given model_type
 (s/def ::url string?) ;; url to the model's page
 (s/def ::requires_gpu boolean?) ;; whether the model requires a gpu
-(s/def ::runtime_impl ::vec-of-runtimes?) ;; supported runtimes
+(s/def ::backend ::vec-of-runtimes?) ;; supported runtimes
 (s/def ::installation string?) ;; installation instructions (pip command)
 (s/def ::model-spec (s/keys :req-un [::model_id ::url ::requires_gpu ;; the spec for a single model (aggregates all the above)
-                                     ::runtime_impl ::installation]))
+                                     ::backend ::installation]))
 (s/def ::all-models #(or loading-text ;; -- this is the case when the file with the model data has not been loaded yet by the ::set-model-data effect
                          (s/map-of keyword? ::model-spec))) ;; map of all models
13 changes: 1 addition & 12 deletions openllm-core/src/openllm_core/_configuration.py
@@ -73,7 +73,6 @@ class GenerationConfig:
 from ._typing_compat import Self
 from ._typing_compat import overload
 from .exceptions import ForbiddenAttributeError
-from .utils import ENV_VARS_TRUE_VALUES
 from .utils import MYPY
 from .utils import LazyLoader
 from .utils import ReprMixin
@@ -478,7 +477,6 @@ class ModelSettings(t.TypedDict, total=False):
   requirements: t.Optional[ListStr]

   # llm implementation specifics
-  bettertransformer: bool
   model_type: t.Literal['causal_lm', 'seq2seq_lm']
   runtime: t.Literal['transformers', 'ggml']

@@ -565,7 +563,6 @@ def default(cls) -> _ModelSettingsAttr:
   trust_remote_code: bool
   service_name: str
   requirements: t.Optional[ListStr]
-  bettertransformer: bool
   model_type: t.Literal['causal_lm', 'seq2seq_lm']
   runtime: t.Literal['transformers', 'ggml']
   name_type: t.Optional[t.Literal['dasherize', 'lowercase']]
@@ -610,13 +607,9 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _
   if not BACKENDS_MAPPING[library_stub][0](): default_implementation[rs] = 'pt'
   _final_value_dct['default_implementation'] = default_implementation

-  env = openllm_core.utils.EnvVarMixin(model_name, get_default_implementation(default_implementation), model_id=_settings_attr.default_id, bettertransformer=_settings_attr.bettertransformer)
+  env = openllm_core.utils.EnvVarMixin(model_name, get_default_implementation(default_implementation), model_id=_settings_attr.default_id)
   _final_value_dct['env'] = env

-  # bettertransformer support
-  if _settings_attr['bettertransformer'] is None: _final_value_dct['bettertransformer'] = str(env['bettertransformer_value']).upper() in ENV_VARS_TRUE_VALUES
-  # if requires_gpu is True, then disable BetterTransformer for quantization.
-  if _settings_attr['requires_gpu']: _final_value_dct['bettertransformer'] = False
   _final_value_dct['service_name'] = f'generated_{model_name}_service.py'

   # NOTE: The key for fine-tune strategies is 'fine_tune_strategies'
@@ -771,8 +764,6 @@ def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None:
   __openllm_requirements__: t.Optional[ListStr] = Field(None)
   '''The default PyPI requirements needed to run this given LLM. By default, we will depend on
   bentoml, torch, transformers.'''
-  __openllm_bettertransformer__: bool = Field(None)
-  '''Whether to use BetterTransformer for this given LLM. This depends per model architecture. By default, we will use BetterTransformer for T5 and StableLM models, and set to False for every other models.'''
   __openllm_model_type__: t.Literal['causal_lm', 'seq2seq_lm'] = Field(None)
   '''The model type for this given LLM. By default, it should be causal language modeling.
   Currently supported 'causal_lm' or 'seq2seq_lm'
@@ -1149,8 +1140,6 @@ def __getitem__(self, item: t.Literal['service_name']) -> str: ...
   @overload
   def __getitem__(self, item: t.Literal['requirements']) -> t.Optional[ListStr]: ...
-  @overload
-  def __getitem__(self, item: t.Literal['bettertransformer']) -> bool: ...
   @overload
   def __getitem__(self, item: t.Literal['model_type']) -> t.Literal['causal_lm', 'seq2seq_lm']: ...
   @overload
   def __getitem__(self, item: t.Literal['runtime']) -> t.Literal['transformers', 'ggml']: ...
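After this commit, the implementation-specific keys of `ModelSettings` shrink to `model_type` and `runtime`. A self-contained sketch of the remaining shape (field names are copied from the hunk above; unrelated keys omitted):

```python
import typing as t

class ModelSettings(t.TypedDict, total=False):
  # implementation specifics left after the bettertransformer removal
  model_type: t.Literal['causal_lm', 'seq2seq_lm']
  runtime: t.Literal['transformers', 'ggml']

# total=False makes every key optional, so partial settings still type-check:
settings: ModelSettings = {'runtime': 'ggml'}
```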
2 changes: 1 addition & 1 deletion openllm-core/src/openllm_core/_strategies.py
@@ -94,7 +94,7 @@ def _from_system(cls: type[DynResource]) -> list[str]:
   if visible_devices is None:
     if cls.resource_id == 'amd.com/gpu':
       if not psutil.LINUX:
-        if DEBUG: warnings.warn('AMD GPUs is currently only supported on Linux.', stacklevel=_STACK_LEVEL)
+        if DEBUG: logger.debug('AMD GPUs is currently only supported on Linux.')
         return []
       # ROCm does not currently have the rocm_smi wheel.
       # So we need to use the ctypes bindings directly.
1 change: 0 additions & 1 deletion openllm-core/src/openllm_core/_typing_compat.py
@@ -102,7 +102,6 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
   __module__: str
   llm_type: str
   llm_tag: bentoml.Tag
-  llm_framework: LiteralRuntime
   identifying_params: dict[str, t.Any]
   llm: openllm.LLM[M, T]
   config: openllm.LLMConfig
19 changes: 2 additions & 17 deletions openllm-core/src/openllm_core/utils/import_utils.py
@@ -334,7 +334,6 @@ class EnvVarMixin(ReprMixin):
   model_id: str
   quantize: str
   framework: str
-  bettertransformer: str
   runtime: str

   @overload
@@ -353,10 +352,6 @@ def __getitem__(self, item: t.Literal['quantize']) -> str:
   def __getitem__(self, item: t.Literal['framework']) -> str:
     ...

-  @overload
-  def __getitem__(self, item: t.Literal['bettertransformer']) -> str:
-    ...
-
   @overload
   def __getitem__(self, item: t.Literal['runtime']) -> str:
     ...
@@ -373,10 +368,6 @@ def __getitem__(self, item: t.Literal['quantize_value']) -> t.Literal['int8', 'i
   def __getitem__(self, item: t.Literal['model_id_value']) -> str | None:
     ...

-  @overload
-  def __getitem__(self, item: t.Literal['bettertransformer_value']) -> bool:
-    ...
-
   @overload
   def __getitem__(self, item: t.Literal['runtime_value']) -> t.Literal['ggml', 'transformers']:
     ...
@@ -391,7 +382,6 @@ def __init__(
     model_name: str,
     implementation: LiteralRuntime = 'pt',
     model_id: str | None = None,
-    bettertransformer: bool | None = None,
     quantize: LiteralString | None = None,
     runtime: t.Literal['ggml', 'transformers'] = 'transformers'
   ) -> None:
@@ -400,10 +390,9 @@
     self.model_name = inflection.underscore(model_name)
     self._implementation = implementation
     self._model_id = model_id
-    self._bettertransformer = bettertransformer
     self._quantize = quantize
     self._runtime = runtime
-    for att in {'config', 'model_id', 'quantize', 'framework', 'bettertransformer', 'runtime'}:
+    for att in {'config', 'model_id', 'quantize', 'framework', 'runtime'}:
       setattr(self, att, field_env_key(self.model_name, att.upper()))

   def _quantize_value(self) -> t.Literal['int8', 'int4', 'gptq'] | None:
@@ -414,10 +403,6 @@ def _framework_value(self) -> LiteralRuntime:
     from . import first_not_none
     return t.cast(LiteralRuntime, first_not_none(os.environ.get(self['framework']), default=self._implementation))

-  def _bettertransformer_value(self) -> bool:
-    from . import first_not_none
-    return t.cast(bool, first_not_none(os.environ.get(self['bettertransformer'], str(False)).upper() in ENV_VARS_TRUE_VALUES, default=self._bettertransformer))
-
   def _model_id_value(self) -> str | None:
     from . import first_not_none
     return first_not_none(os.environ.get(self['model_id']), default=self._model_id)
@@ -428,7 +413,7 @@ def _runtime_value(self) -> t.Literal['ggml', 'transformers']:

   @property
   def __repr_keys__(self) -> set[str]:
-    return {'config', 'model_id', 'quantize', 'framework', 'bettertransformer', 'runtime'}
+    return {'config', 'model_id', 'quantize', 'framework', 'runtime'}

   @property
   def start_docstring(self) -> str:
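The loop at the end of `__init__` above maps each remaining field to an environment-variable key via `field_env_key`. A sketch of that derivation; the `OPENLLM_<MODEL>_<FIELD>` key format is an assumption for illustration, since the real helper lives in `openllm_core.utils`:

```python
import inflection

def field_env_key(model_name: str, field: str) -> str:
  # Assumed key format, mirroring OpenLLM's OPENLLM_... environment variables.
  return f'OPENLLM_{model_name.upper()}_{field}'

model_name = inflection.underscore('FlanT5')  # -> 'flan_t5'
for att in {'config', 'model_id', 'quantize', 'framework', 'runtime'}:
  print(att, '->', field_env_key(model_name, att.upper()))
# e.g. framework -> OPENLLM_FLAN_T5_FRAMEWORK
```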
35 changes: 0 additions & 35 deletions openllm-python/src/openllm/_llm.py
@@ -53,7 +53,6 @@
 from openllm_core.utils import generate_hash_from_file
 from openllm_core.utils import is_peft_available
 from openllm_core.utils import is_torch_available
-from openllm_core.utils import non_intrusive_setattr
 from openllm_core.utils import normalize_attrs_to_model_tokenizer_pair
 from openllm_core.utils import resolve_filepath
 from openllm_core.utils import validate_is_path
@@ -66,7 +65,6 @@
 from .utils import infer_auto_class

 if t.TYPE_CHECKING:
-  import pathlib

   import auto_gptq as autogptq
   import peft
@@ -214,15 +212,6 @@ def load_tokenizer(self, tag: bentoml.Tag, **attrs: t.Any) -> T:
     '''
     raise NotImplementedError

-  def save_pretrained(self, save_directory: str | pathlib.Path, **attrs: t.Any) -> None:
-    '''This function defines how this model can be saved to local store.
-
-    This will be called during ``import_model``. By default, it will use ``openllm.serialisation.save_pretrained``.
-    Additionally, the function signature are similar to ``transformers.PreTrainedModel.save_pretrained``
-    This is useful during fine tuning.
-    '''
-    raise NotImplementedError
-
 class LLMInterface(LLMFunction, LLMSerialisation[M, T], abc.ABC):
   def llm_post_init(self) -> None:
     '''This function can be implemented if you need to initialized any additional variables that doesn't concern OpenLLM internals.
@@ -255,13 +244,6 @@ def import_kwargs(self) -> tuple[DictStrAny, DictStrAny] | None:
   # NOTE: All fields below are attributes that can be accessed by users.
   config_class: t.Type[LLMConfig]
   '''The config class to use for this LLM. If you are creating a custom LLM, you must specify this class.'''
-  bettertransformer: bool
-  '''Whether to load this LLM with FasterTransformer enabled. The order of loading is:
-
-  - If pass within `for_model`, `from_pretrained` or `__init__`, Default to self.config['bettertransformer']
-
-  > [!NOTE] that if LoRA is enabled, bettertransformer will be disabled.
-  '''
   device: 'torch.device'
   '''The device to be used for this LLM. If the implementation is 'pt', then it will be torch.device, else string.'''
   tokenizer_id: t.Union[t.Literal['local'], LiteralString]
@@ -422,7 +404,6 @@ def from_pretrained(
     *args: t.Any,
     runtime: t.Literal['ggml', 'transformers'] | None = ...,
     quantize: t.Literal['int8', 'int4'] = ...,
-    bettertransformer: str | bool | None = ...,
     adapter_id: str | None = ...,
     adapter_name: str | None = ...,
     adapter_map: dict[str, str | None] | None = ...,
@@ -442,7 +423,6 @@ def from_pretrained(
     *args: t.Any,
     runtime: t.Literal['ggml', 'transformers'] | None = ...,
     quantize: t.Literal['gptq'] = ...,
-    bettertransformer: str | bool | None = ...,
     adapter_id: str | None = ...,
     adapter_name: str | None = ...,
     adapter_map: dict[str, str | None] | None = ...,
@@ -461,7 +441,6 @@
     *args: t.Any,
     runtime: t.Literal['ggml', 'transformers'] | None = None,
     quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-    bettertransformer: str | bool | None = None,
     adapter_id: str | None = None,
     adapter_name: str | None = None,
     adapter_map: dict[str, str | None] | None = None,
@@ -478,7 +457,6 @@ def from_pretrained(
     > This is most notable during serving time.

     - quantize: quantize the model with the given quantization method. Currently supported int8, int4 quantization
-    - bettertransformer: Apply FasterTransformer to given pretrained weight

     > Currently, the above two options are mutually exclusive.
@@ -518,7 +496,6 @@ def from_pretrained(
       quantization_config: The quantization config (`transformers.BitsAndBytesConfig` | `autogtpq.BaseQuantizeConfig`) to use. Note that this is mutually exclusive with `quantize`
       serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
         Default behaviour is similar to ``safe_serialization=False``.
-      bettertransformer: Whether to use BetterTransformer with this model. Defaults to False.
       adapter_id: The [LoRA](https://arxiv.org/pdf/2106.09685.pdf) pretrained id or local path to use for this LLM. Defaults to None.
       adapter_name: The adapter name to use for this LLM. Defaults to None.
       adapter_map: The adapter map to use for this LLM. Defaults to None. Note that this is mutually exclusive with adapter_id/adapter_name arguments.
@@ -569,7 +546,6 @@ def from_pretrained(
       _tag=_tag,
       _serialisation_format=serialisation,
       _local=_local,
-      bettertransformer=str(first_not_none(bettertransformer, os.environ.get(cfg_cls.__openllm_env__['bettertransformer']), default=None)).upper() in ENV_VARS_TRUE_VALUES,
       _runtime=first_not_none(runtime, t.cast(t.Optional[t.Literal['ggml', 'transformers']], os.environ.get(cfg_cls.__openllm_env__['runtime'])), default=cfg_cls.__openllm_runtime__),
       _adapters_mapping=resolve_peft_config_type(adapter_map) if adapter_map is not None else None,
       **attrs
@@ -624,7 +600,6 @@ def __init__(
     *args: t.Any,
     model_id: str,
     llm_config: LLMConfig,
-    bettertransformer: bool | None,
     quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
     _adapters_mapping: AdaptersMapping | None,
     _tag: bentoml.Tag,
@@ -713,7 +688,6 @@ def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
       model_id: The pretrained model to use. Defaults to None. If None, 'self.default_id' will be used.
       llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM
         will use `config_class` to construct default configuration.
-      bettertransformer: Whether to use BetterTransformer with this model. Defaults to False.
       quantization_config: ``transformers.BitsAndBytesConfig`` configuration, or 'gptq' denoting this model to be loaded with GPTQ.
       *args: The args to be passed to the model.
       **attrs: The kwargs to be passed to the model.
@@ -755,11 +729,6 @@ def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
     )

     self.llm_post_init()
-    # we set it here so that we allow subclass to overwrite bettertransformer in llm_post_init
-    if bettertransformer is True: self.bettertransformer = bettertransformer
-    else: non_intrusive_setattr(self, 'bettertransformer', self.config['bettertransformer'])
-    # If lora is passed, the disable bettertransformer
-    if _adapters_mapping and self.bettertransformer is True: self.bettertransformer = False

   def __setattr__(self, attr: str, value: t.Any) -> None:
     if attr in _reserved_namespace:
@@ -1193,7 +1162,6 @@ def Runner(
     llm_config: LLMConfig | None = ...,
     runtime: t.Literal['ggml', 'transformers'] | None = ...,
     quantize: t.Literal['int8', 'int4', 'gptq'] | None = ...,
-    bettertransformer: str | bool | None = ...,
     adapter_id: str | None = ...,
     adapter_name: str | None = ...,
     adapter_map: dict[str, str | None] | None = ...,
@@ -1239,7 +1207,6 @@ def download():
   if llm_config is not None:
     attrs.update({
       'model_id': llm_config['env']['model_id_value'],
-      'bettertransformer': llm_config['env']['bettertransformer_value'],
       'quantize': llm_config['env']['quantize_value'],
       'runtime': llm_config['env']['runtime_value'],
       'serialisation': first_not_none(os.environ.get('OPENLLM_SERIALIZATION'), attrs.get('serialisation'), default='safetensors')
@@ -1369,14 +1336,12 @@ def _wrapped_repr_args(__self: LLMRunner[M, T]) -> ReprArgs:
     yield 'llm_type', __self.llm_type
     yield 'runtime', self.runtime
     yield 'llm_tag', self.tag
-    yield 'llm_framework', self.__llm_implementation__

   return types.new_class(
     self.__class__.__name__ + 'Runner', (bentoml.Runner,),
     exec_body=lambda ns: ns.update({
       'llm_type': self.llm_type,
       'identifying_params': self.identifying_params,
-      'llm_framework': self.__llm_implementation__,
       'llm_tag': self.tag,
       'llm': self,
       'config': self.config,
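With `bettertransformer` dropped from both `from_pretrained` and `Runner`, callers now only pick a runtime and, optionally, a quantization method. A minimal usage sketch; the model name is illustrative and the `Runner('model-name', ...)` form is assumed from OpenLLM's public API:

```python
import openllm

# No bettertransformer kwarg anymore; quantize and quantization_config
# remain mutually exclusive, per the from_pretrained docstring.
runner = openllm.Runner('flan-t5', quantize='int8', runtime='transformers')
```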
2 changes: 1 addition & 1 deletion openllm-python/src/openllm/_service.py
@@ -65,7 +65,7 @@ async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[s
   'model_id': runner.llm.model_id,
   'timeout': 3600,
   'model_name': llm_config['model_name'],
-  'framework': runner.llm_framework,
+  'framework': runner.implementation,
   'configuration': '',
   'supports_embeddings': runner.supports_embeddings,
   'supports_hf_agent': runner.supports_hf_agent
7 changes: 2 additions & 5 deletions openllm-python/src/openllm/bundle/_package.py
@@ -120,7 +120,6 @@ def construct_docker_options(
   _: FS,
   workers_per_resource: float,
   quantize: LiteralString | None,
-  bettertransformer: bool | None,
   adapter_map: dict[str, str | None] | None,
   dockerfile_template: str | None,
   runtime: t.Literal['ggml', 'transformers'],
@@ -146,9 +145,8 @@
   if adapter_map: env_dict['BITSANDBYTES_NOWELCOME'] = os.environ.get('BITSANDBYTES_NOWELCOME', '1')

   # We need to handle None separately here, as env from subprocess doesn't accept None value.
-  _env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
+  _env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], quantize=quantize, runtime=runtime)

-  env_dict[_env.bettertransformer] = str(_env['bettertransformer_value'])
   if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value'])
   env_dict[_env.runtime] = _env['runtime_value']
   return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}', env=env_dict, dockerfile_template=dockerfile_template)
@@ -203,7 +201,6 @@ def create_bento(
   llm: openllm.LLM[t.Any, t.Any],
   workers_per_resource: str | float,
   quantize: LiteralString | None,
-  bettertransformer: bool | None,
   dockerfile_template: str | None,
   adapter_map: dict[str, str | None] | None = None,
   extra_dependencies: tuple[str, ...] | None = None,
@@ -243,7 +240,7 @@ def create_bento(
   python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
   models=[llm_spec],
   docker=construct_docker_options(
-    llm, llm_fs, workers_per_resource, quantize, bettertransformer, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy
+    llm, llm_fs, workers_per_resource, quantize, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy
   )
 )
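After the removal, `construct_docker_options` forwards only the quantize and runtime values into the container environment. A simplified sketch of the surviving branch; the key names are illustrative stand-ins for what `EnvVarMixin` would generate:

```python
import typing as t

def docker_env(quantize_value: t.Optional[str], runtime_value: str) -> dict[str, str]:
  # env passed to a subprocess cannot hold None, so quantize is only
  # set when a value is actually present (mirrors the diff above).
  env_dict: dict[str, str] = {}
  if quantize_value is not None:
    env_dict['OPENLLM_FLAN_T5_QUANTIZE'] = quantize_value  # illustrative key
  env_dict['OPENLLM_FLAN_T5_RUNTIME'] = runtime_value
  return env_dict

print(docker_env(None, 'transformers'))  # {'OPENLLM_FLAN_T5_RUNTIME': 'transformers'}
```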