refactor(breaking): unify LLM API #283

Merged · 9 commits · Sep 1, 2023
Changes from 1 commit
chore: remove bettertransformer
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
aarnphm committed Aug 30, 2023
commit 939a1dc49abbf1c6ebf66dd39f6d539b5e00bc0a
5 changes: 0 additions & 5 deletions CHANGELOG.md
@@ -721,9 +721,6 @@ No significant changes.
 `openllm start` now support `--quantize int8` and `--quantize int4` `GPTQ`
 quantization support is on the roadmap and currently being worked on.

-`openllm start` now also support `--bettertransformer` to use
-`BetterTransformer` for serving.
-
 Refactored `openllm.LLMConfig` to be able to use with `__getitem__`:
 `openllm.DollyV2Config()['requirements']`.

@@ -732,8 +729,6 @@ No significant changes.

 Added `towncrier` workflow to easily generate changelog entries

-Added `use_pipeline`, `bettertransformer` flag into ModelSettings
-
 `LLMConfig` now supported `__dataclass_transform__` protocol to help with
 type-checking
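The `__getitem__` refactor called out in this changelog entry is the basis for the typed overloads touched later in this diff. A minimal sketch of the access pattern, using the config class and key quoted in the entry (the printed value depends on the model's defaults):

```python
import openllm

# Subscript access on a config instance; 'requirements' is one of the
# typed keys exposed through the __getitem__ overloads in _configuration.py.
config = openllm.DollyV2Config()
print(config['requirements'])  # a list of extra PyPI packages, or None
```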
@@ -31,10 +31,10 @@
 (s/def ::model_id (s/coll-of string? :kind vector?)) ;; model_id is a vector of all models for a given model_type
 (s/def ::url string?) ;; url to the model's page
 (s/def ::requires_gpu boolean?) ;; whether the model requires a gpu
-(s/def ::runtime_impl ::vec-of-runtimes?) ;; supported runtimes
+(s/def ::backend ::vec-of-runtimes?) ;; supported runtimes
 (s/def ::installation string?) ;; installation instructions (pip command)
 (s/def ::model-spec (s/keys :req-un [::model_id ::url ::requires_gpu ;; the spec for a single model (aggregates all the above)
-                                     ::runtime_impl ::installation]))
+                                     ::backend ::installation]))
 (s/def ::all-models #(or loading-text ;; -- this is the case when the file with the model data has not been loaded yet by the ::set-model-data effect
                          (s/map-of keyword? ::model-spec))) ;; map of all models
13 changes: 1 addition & 12 deletions openllm-core/src/openllm_core/_configuration.py
@@ -73,7 +73,6 @@ class GenerationConfig:
 from ._typing_compat import Self
 from ._typing_compat import overload
 from .exceptions import ForbiddenAttributeError
-from .utils import ENV_VARS_TRUE_VALUES
 from .utils import MYPY
 from .utils import LazyLoader
 from .utils import ReprMixin
@@ -478,7 +477,6 @@ class ModelSettings(t.TypedDict, total=False):
   requirements: t.Optional[ListStr]

   # llm implementation specifics
-  bettertransformer: bool
   model_type: t.Literal['causal_lm', 'seq2seq_lm']
   runtime: t.Literal['transformers', 'ggml']

@@ -565,7 +563,6 @@ def default(cls) -> _ModelSettingsAttr:
   trust_remote_code: bool
   service_name: str
   requirements: t.Optional[ListStr]
-  bettertransformer: bool
   model_type: t.Literal['causal_lm', 'seq2seq_lm']
   runtime: t.Literal['transformers', 'ggml']
   name_type: t.Optional[t.Literal['dasherize', 'lowercase']]
@@ -610,13 +607,9 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _
   if not BACKENDS_MAPPING[library_stub][0](): default_implementation[rs] = 'pt'
   _final_value_dct['default_implementation'] = default_implementation

-  env = openllm_core.utils.EnvVarMixin(model_name, get_default_implementation(default_implementation), model_id=_settings_attr.default_id, bettertransformer=_settings_attr.bettertransformer)
+  env = openllm_core.utils.EnvVarMixin(model_name, get_default_implementation(default_implementation), model_id=_settings_attr.default_id)
   _final_value_dct['env'] = env

-  # bettertransformer support
-  if _settings_attr['bettertransformer'] is None: _final_value_dct['bettertransformer'] = str(env['bettertransformer_value']).upper() in ENV_VARS_TRUE_VALUES
-  # if requires_gpu is True, then disable BetterTransformer for quantization.
-  if _settings_attr['requires_gpu']: _final_value_dct['bettertransformer'] = False
   _final_value_dct['service_name'] = f'generated_{model_name}_service.py'

   # NOTE: The key for fine-tune strategies is 'fine_tune_strategies'
@@ -771,8 +764,6 @@ def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None:
   __openllm_requirements__: t.Optional[ListStr] = Field(None)
   '''The default PyPI requirements needed to run this given LLM. By default, we will depend on
   bentoml, torch, transformers.'''
-  __openllm_bettertransformer__: bool = Field(None)
-  '''Whether to use BetterTransformer for this given LLM. This depends per model architecture. By default, we will use BetterTransformer for T5 and StableLM models, and set to False for every other models.'''
   __openllm_model_type__: t.Literal['causal_lm', 'seq2seq_lm'] = Field(None)
   '''The model type for this given LLM. By default, it should be causal language modeling.
   Currently supported 'causal_lm' or 'seq2seq_lm'
@@ -1149,8 +1140,6 @@ def __getitem__(self, item: t.Literal['service_name']) -> str: ...
   @overload
   def __getitem__(self, item: t.Literal['requirements']) -> t.Optional[ListStr]: ...
-  @overload
-  def __getitem__(self, item: t.Literal['bettertransformer']) -> bool: ...
   @overload
   def __getitem__(self, item: t.Literal['model_type']) -> t.Literal['causal_lm', 'seq2seq_lm']: ...
   @overload
   def __getitem__(self, item: t.Literal['runtime']) -> t.Literal['transformers', 'ggml']: ...
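After this commit, the implementation-specific keys of `ModelSettings` shrink to `model_type` and `runtime`. A self-contained sketch of the remaining shape (field names are copied from the hunk above; unrelated keys omitted):

```python
import typing as t

class ModelSettings(t.TypedDict, total=False):
  # implementation specifics left after the bettertransformer removal
  model_type: t.Literal['causal_lm', 'seq2seq_lm']
  runtime: t.Literal['transformers', 'ggml']

# total=False makes every key optional, so partial settings still type-check:
settings: ModelSettings = {'runtime': 'ggml'}
```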
2 changes: 1 addition & 1 deletion openllm-core/src/openllm_core/_strategies.py
@@ -94,7 +94,7 @@ def _from_system(cls: type[DynResource]) -> list[str]:
   if visible_devices is None:
     if cls.resource_id == 'amd.com/gpu':
       if not psutil.LINUX:
-        if DEBUG: warnings.warn('AMD GPUs is currently only supported on Linux.', stacklevel=_STACK_LEVEL)
+        if DEBUG: logger.debug('AMD GPUs is currently only supported on Linux.')
         return []
       # ROCm does not currently have the rocm_smi wheel.
       # So we need to use the ctypes bindings directly.
1 change: 0 additions & 1 deletion openllm-core/src/openllm_core/_typing_compat.py
@@ -102,7 +102,6 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
   __module__: str
   llm_type: str
   llm_tag: bentoml.Tag
-  llm_framework: LiteralRuntime
   identifying_params: dict[str, t.Any]
   llm: openllm.LLM[M, T]
   config: openllm.LLMConfig
19 changes: 2 additions & 17 deletions openllm-core/src/openllm_core/utils/import_utils.py
@@ -334,7 +334,6 @@ class EnvVarMixin(ReprMixin):
   model_id: str
   quantize: str
   framework: str
-  bettertransformer: str
   runtime: str

   @overload
@@ -353,10 +352,6 @@ def __getitem__(self, item: t.Literal['quantize']) -> str:
   def __getitem__(self, item: t.Literal['framework']) -> str:
     ...

-  @overload
-  def __getitem__(self, item: t.Literal['bettertransformer']) -> str:
-    ...
-
   @overload
   def __getitem__(self, item: t.Literal['runtime']) -> str:
     ...
@@ -373,10 +368,6 @@ def __getitem__(self, item: t.Literal['quantize_value']) -> t.Literal['int8', 'i
   def __getitem__(self, item: t.Literal['model_id_value']) -> str | None:
     ...

-  @overload
-  def __getitem__(self, item: t.Literal['bettertransformer_value']) -> bool:
-    ...
-
   @overload
   def __getitem__(self, item: t.Literal['runtime_value']) -> t.Literal['ggml', 'transformers']:
     ...
@@ -391,7 +382,6 @@ def __init__(
     model_name: str,
     implementation: LiteralRuntime = 'pt',
     model_id: str | None = None,
-    bettertransformer: bool | None = None,
     quantize: LiteralString | None = None,
     runtime: t.Literal['ggml', 'transformers'] = 'transformers'
   ) -> None:
@@ -400,10 +390,9 @@
     self.model_name = inflection.underscore(model_name)
     self._implementation = implementation
     self._model_id = model_id
-    self._bettertransformer = bettertransformer
     self._quantize = quantize
     self._runtime = runtime
-    for att in {'config', 'model_id', 'quantize', 'framework', 'bettertransformer', 'runtime'}:
+    for att in {'config', 'model_id', 'quantize', 'framework', 'runtime'}:
       setattr(self, att, field_env_key(self.model_name, att.upper()))

   def _quantize_value(self) -> t.Literal['int8', 'int4', 'gptq'] | None:
@@ -414,10 +403,6 @@ def _framework_value(self) -> LiteralRuntime:
     from . import first_not_none
     return t.cast(LiteralRuntime, first_not_none(os.environ.get(self['framework']), default=self._implementation))

-  def _bettertransformer_value(self) -> bool:
-    from . import first_not_none
-    return t.cast(bool, first_not_none(os.environ.get(self['bettertransformer'], str(False)).upper() in ENV_VARS_TRUE_VALUES, default=self._bettertransformer))
-
   def _model_id_value(self) -> str | None:
     from . import first_not_none
     return first_not_none(os.environ.get(self['model_id']), default=self._model_id)
@@ -428,7 +413,7 @@ def _runtime_value(self) -> t.Literal['ggml', 'transformers']:

   @property
   def __repr_keys__(self) -> set[str]:
-    return {'config', 'model_id', 'quantize', 'framework', 'bettertransformer', 'runtime'}
+    return {'config', 'model_id', 'quantize', 'framework', 'runtime'}

   @property
   def start_docstring(self) -> str:
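The loop at the end of `__init__` above maps each remaining field to an environment-variable key via `field_env_key`. A sketch of that derivation; the `OPENLLM_<MODEL>_<FIELD>` key format is an assumption for illustration, since the real helper lives in `openllm_core.utils`:

```python
import inflection

def field_env_key(model_name: str, field: str) -> str:
  # Assumed key format, mirroring OpenLLM's OPENLLM_... environment variables.
  return f'OPENLLM_{model_name.upper()}_{field}'

model_name = inflection.underscore('FlanT5')  # -> 'flan_t5'
for att in {'config', 'model_id', 'quantize', 'framework', 'runtime'}:
  print(att, '->', field_env_key(model_name, att.upper()))
# e.g. framework -> OPENLLM_FLAN_T5_FRAMEWORK
```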
35 changes: 0 additions & 35 deletions openllm-python/src/openllm/_llm.py
@@ -53,7 +53,6 @@
 from openllm_core.utils import generate_hash_from_file
 from openllm_core.utils import is_peft_available
 from openllm_core.utils import is_torch_available
-from openllm_core.utils import non_intrusive_setattr
 from openllm_core.utils import normalize_attrs_to_model_tokenizer_pair
 from openllm_core.utils import resolve_filepath
 from openllm_core.utils import validate_is_path
@@ -66,7 +65,6 @@
 from .utils import infer_auto_class

 if t.TYPE_CHECKING:
-  import pathlib

   import auto_gptq as autogptq
   import peft
@@ -214,15 +212,6 @@ def load_tokenizer(self, tag: bentoml.Tag, **attrs: t.Any) -> T:
     '''
     raise NotImplementedError

-  def save_pretrained(self, save_directory: str | pathlib.Path, **attrs: t.Any) -> None:
-    '''This function defines how this model can be saved to local store.
-
-    This will be called during ``import_model``. By default, it will use ``openllm.serialisation.save_pretrained``.
-    Additionally, the function signature are similar to ``transformers.PreTrainedModel.save_pretrained``
-    This is useful during fine tuning.
-    '''
-    raise NotImplementedError
-
 class LLMInterface(LLMFunction, LLMSerialisation[M, T], abc.ABC):
   def llm_post_init(self) -> None:
     '''This function can be implemented if you need to initialized any additional variables that doesn't concern OpenLLM internals.
@@ -255,13 +244,6 @@ def import_kwargs(self) -> tuple[DictStrAny, DictStrAny] | None:
   # NOTE: All fields below are attributes that can be accessed by users.
   config_class: t.Type[LLMConfig]
   '''The config class to use for this LLM. If you are creating a custom LLM, you must specify this class.'''
-  bettertransformer: bool
-  '''Whether to load this LLM with FasterTransformer enabled. The order of loading is:
-
-  - If pass within `for_model`, `from_pretrained` or `__init__`, Default to self.config['bettertransformer']
-
-  > [!NOTE] that if LoRA is enabled, bettertransformer will be disabled.
-  '''
   device: 'torch.device'
   '''The device to be used for this LLM. If the implementation is 'pt', then it will be torch.device, else string.'''
   tokenizer_id: t.Union[t.Literal['local'], LiteralString]
@@ -422,7 +404,6 @@ def from_pretrained(
     *args: t.Any,
     runtime: t.Literal['ggml', 'transformers'] | None = ...,
     quantize: t.Literal['int8', 'int4'] = ...,
-    bettertransformer: str | bool | None = ...,
     adapter_id: str | None = ...,
     adapter_name: str | None = ...,
     adapter_map: dict[str, str | None] | None = ...,
@@ -442,7 +423,6 @@ def from_pretrained(
     *args: t.Any,
     runtime: t.Literal['ggml', 'transformers'] | None = ...,
     quantize: t.Literal['gptq'] = ...,
-    bettertransformer: str | bool | None = ...,
     adapter_id: str | None = ...,
     adapter_name: str | None = ...,
     adapter_map: dict[str, str | None] | None = ...,
@@ -461,7 +441,6 @@
     *args: t.Any,
     runtime: t.Literal['ggml', 'transformers'] | None = None,
     quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-    bettertransformer: str | bool | None = None,
     adapter_id: str | None = None,
     adapter_name: str | None = None,
     adapter_map: dict[str, str | None] | None = None,
@@ -478,7 +457,6 @@ def from_pretrained(
     > This is most notable during serving time.

     - quantize: quantize the model with the given quantization method. Currently supported int8, int4 quantization
-    - bettertransformer: Apply FasterTransformer to given pretrained weight

     > Currently, the above two options are mutually exclusive.
@@ -518,7 +496,6 @@ def from_pretrained(
       quantization_config: The quantization config (`transformers.BitsAndBytesConfig` | `autogtpq.BaseQuantizeConfig`) to use. Note that this is mutually exclusive with `quantize`
       serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
         Default behaviour is similar to ``safe_serialization=False``.
-      bettertransformer: Whether to use BetterTransformer with this model. Defaults to False.
       adapter_id: The [LoRA](https://arxiv.org/pdf/2106.09685.pdf) pretrained id or local path to use for this LLM. Defaults to None.
       adapter_name: The adapter name to use for this LLM. Defaults to None.
       adapter_map: The adapter map to use for this LLM. Defaults to None. Note that this is mutually exclusive with adapter_id/adapter_name arguments.
@@ -569,7 +546,6 @@ def from_pretrained(
       _tag=_tag,
       _serialisation_format=serialisation,
       _local=_local,
-      bettertransformer=str(first_not_none(bettertransformer, os.environ.get(cfg_cls.__openllm_env__['bettertransformer']), default=None)).upper() in ENV_VARS_TRUE_VALUES,
       _runtime=first_not_none(runtime, t.cast(t.Optional[t.Literal['ggml', 'transformers']], os.environ.get(cfg_cls.__openllm_env__['runtime'])), default=cfg_cls.__openllm_runtime__),
       _adapters_mapping=resolve_peft_config_type(adapter_map) if adapter_map is not None else None,
       **attrs
@@ -624,7 +600,6 @@ def __init__(
     *args: t.Any,
     model_id: str,
     llm_config: LLMConfig,
-    bettertransformer: bool | None,
     quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
     _adapters_mapping: AdaptersMapping | None,
     _tag: bentoml.Tag,
@@ -713,7 +688,6 @@ def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
       model_id: The pretrained model to use. Defaults to None. If None, 'self.default_id' will be used.
       llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM
         will use `config_class` to construct default configuration.
-      bettertransformer: Whether to use BetterTransformer with this model. Defaults to False.
       quantization_config: ``transformers.BitsAndBytesConfig`` configuration, or 'gptq' denoting this model to be loaded with GPTQ.
       *args: The args to be passed to the model.
       **attrs: The kwargs to be passed to the model.
@@ -755,11 +729,6 @@ def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
     )

     self.llm_post_init()
-    # we set it here so that we allow subclass to overwrite bettertransformer in llm_post_init
-    if bettertransformer is True: self.bettertransformer = bettertransformer
-    else: non_intrusive_setattr(self, 'bettertransformer', self.config['bettertransformer'])
-    # If lora is passed, the disable bettertransformer
-    if _adapters_mapping and self.bettertransformer is True: self.bettertransformer = False

   def __setattr__(self, attr: str, value: t.Any) -> None:
     if attr in _reserved_namespace:
@@ -1193,7 +1162,6 @@ def Runner(
     llm_config: LLMConfig | None = ...,
     runtime: t.Literal['ggml', 'transformers'] | None = ...,
     quantize: t.Literal['int8', 'int4', 'gptq'] | None = ...,
-    bettertransformer: str | bool | None = ...,
     adapter_id: str | None = ...,
     adapter_name: str | None = ...,
     adapter_map: dict[str, str | None] | None = ...,
@@ -1239,7 +1207,6 @@ def download():
   if llm_config is not None:
     attrs.update({
       'model_id': llm_config['env']['model_id_value'],
-      'bettertransformer': llm_config['env']['bettertransformer_value'],
       'quantize': llm_config['env']['quantize_value'],
       'runtime': llm_config['env']['runtime_value'],
       'serialisation': first_not_none(os.environ.get('OPENLLM_SERIALIZATION'), attrs.get('serialisation'), default='safetensors')
@@ -1369,14 +1336,12 @@ def _wrapped_repr_args(__self: LLMRunner[M, T]) -> ReprArgs:
     yield 'llm_type', __self.llm_type
     yield 'runtime', self.runtime
     yield 'llm_tag', self.tag
-    yield 'llm_framework', self.__llm_implementation__

   return types.new_class(
     self.__class__.__name__ + 'Runner', (bentoml.Runner,),
     exec_body=lambda ns: ns.update({
       'llm_type': self.llm_type,
       'identifying_params': self.identifying_params,
-      'llm_framework': self.__llm_implementation__,
       'llm_tag': self.tag,
       'llm': self,
       'config': self.config,
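With `bettertransformer` dropped from both `from_pretrained` and `Runner`, callers now only pick a runtime and, optionally, a quantization method. A minimal usage sketch; the model name is illustrative and the `Runner('model-name', ...)` form is assumed from OpenLLM's public API:

```python
import openllm

# No bettertransformer kwarg anymore; quantize and quantization_config
# remain mutually exclusive, per the from_pretrained docstring.
runner = openllm.Runner('flan-t5', quantize='int8', runtime='transformers')
```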
2 changes: 1 addition & 1 deletion openllm-python/src/openllm/_service.py
@@ -65,7 +65,7 @@ async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[s
   'model_id': runner.llm.model_id,
   'timeout': 3600,
   'model_name': llm_config['model_name'],
-  'framework': runner.llm_framework,
+  'framework': runner.implementation,
   'configuration': '',
   'supports_embeddings': runner.supports_embeddings,
   'supports_hf_agent': runner.supports_hf_agent
7 changes: 2 additions & 5 deletions openllm-python/src/openllm/bundle/_package.py
@@ -120,7 +120,6 @@ def construct_docker_options(
   _: FS,
   workers_per_resource: float,
   quantize: LiteralString | None,
-  bettertransformer: bool | None,
   adapter_map: dict[str, str | None] | None,
   dockerfile_template: str | None,
   runtime: t.Literal['ggml', 'transformers'],
@@ -146,9 +145,8 @@
   if adapter_map: env_dict['BITSANDBYTES_NOWELCOME'] = os.environ.get('BITSANDBYTES_NOWELCOME', '1')

   # We need to handle None separately here, as env from subprocess doesn't accept None value.
-  _env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
+  _env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], quantize=quantize, runtime=runtime)

-  env_dict[_env.bettertransformer] = str(_env['bettertransformer_value'])
   if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value'])
   env_dict[_env.runtime] = _env['runtime_value']
   return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}', env=env_dict, dockerfile_template=dockerfile_template)
@@ -203,7 +201,6 @@ def create_bento(
   llm: openllm.LLM[t.Any, t.Any],
   workers_per_resource: str | float,
   quantize: LiteralString | None,
-  bettertransformer: bool | None,
   dockerfile_template: str | None,
   adapter_map: dict[str, str | None] | None = None,
   extra_dependencies: tuple[str, ...] | None = None,
@@ -243,7 +240,7 @@ def create_bento(
   python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
   models=[llm_spec],
   docker=construct_docker_options(
-    llm, llm_fs, workers_per_resource, quantize, bettertransformer, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy
+    llm, llm_fs, workers_per_resource, quantize, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy
   )
 )
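After the removal, `construct_docker_options` forwards only the quantize and runtime values into the container environment. A simplified sketch of the surviving branch; the key names are illustrative stand-ins for what `EnvVarMixin` would generate:

```python
import typing as t

def docker_env(quantize_value: t.Optional[str], runtime_value: str) -> dict[str, str]:
  # env passed to a subprocess cannot hold None, so quantize is only
  # set when a value is actually present (mirrors the diff above).
  env_dict: dict[str, str] = {}
  if quantize_value is not None:
    env_dict['OPENLLM_FLAN_T5_QUANTIZE'] = quantize_value  # illustrative key
  env_dict['OPENLLM_FLAN_T5_RUNTIME'] = runtime_value
  return env_dict

print(docker_env(None, 'transformers'))  # {'OPENLLM_FLAN_T5_RUNTIME': 'transformers'}
```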