refactor(breaking): unify LLM API #283

Merged · 9 commits · Sep 1, 2023
refactor: update naming and envvar
Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
aarnphm committed Sep 1, 2023
commit 025633a90b3e55ade6f9e0f31075eede42ebbcd1
11 changes: 8 additions & 3 deletions README.md
@@ -407,24 +407,29 @@ pip install "openllm[baichuan]"
### Runtime Implementations (Experimental)

Different LLMs may have multiple runtime implementations. For instance, they
might use Pytorch (`pt`), Tensorflow (`tf`), or Flax (`flax`).
might use Pytorch (`pt`), Tensorflow (`tf`), Flax (`flax`) or vLLM (`vllm`).

If you wish to specify a particular runtime for a model, you can do so by
setting the `OPENLLM_{MODEL_NAME}_FRAMEWORK={runtime}` environment variable
setting the `OPENLLM_BACKEND={runtime}` environment variable
before running `openllm start`.

For example, if you want to use the Tensorflow (`tf`) implementation for the
`flan-t5` model, you can use the following command:

```bash
OPENLLM_FLAN_T5_FRAMEWORK=tf openllm start flan-t5
OPENLLM_BACKEND=tf openllm start flan-t5

openllm start flan-t5 --backend tf
```

> [!NOTE]
> For GPU support on Flax, refer to
> [Jax's installation](https://github.com/google/jax#pip-installation-gpu-cuda-installed-via-pip-easier)
> to make sure that you have Jax support for the corresponding CUDA version.
> [!IMPORTANT]
> To use the vLLM backend, a GPU with Ampere or newer architecture and CUDA 11.8 is required.
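
As a sketch of the selection order implied above (assuming the `--backend` flag takes priority over `OPENLLM_BACKEND`, which in turn overrides the model's default):

```python
# Hypothetical sketch of the backend override order; the helper and argument
# names are illustrative, not part of OpenLLM's public API.
import os
import typing as t

def resolve_backend(cli_backend: t.Optional[str] = None, model_default: str = 'pt') -> str:
    # assumed precedence: --backend flag, then OPENLLM_BACKEND, then the model default
    return cli_backend or os.environ.get('OPENLLM_BACKEND') or model_default

print(resolve_backend('vllm'))  # 'vllm', regardless of OPENLLM_BACKEND
print(resolve_backend())        # 'pt' unless OPENLLM_BACKEND is set
```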
### Quantisation

OpenLLM supports quantisation with
1 change: 0 additions & 1 deletion hatch.toml
@@ -31,7 +31,6 @@ check-stubs = [
inplace-changelog = "towncrier build --version main --keep"
quality = [
"./tools/dependencies.py",
"./tools/update-readme.py",
"- ./tools/update-brew-tap.py",
"bash ./tools/sync-readme.sh",
"check-stubs",
6 changes: 3 additions & 3 deletions openllm-client/src/openllm_client/_base.py
@@ -98,7 +98,7 @@ def _hf_agent(self) -> transformers.HfAgent:
raise RuntimeError(
"transformers is required to use HF agent. Install with 'pip install \"openllm-client[agents]\"'.")
if not self.supports_hf_agent:
raise RuntimeError(f'{self.model_name} ({self.framework}) does not support running HF agent.')
raise RuntimeError(f'{self.model_name} ({self.backend}) does not support running HF agent.')
if not is_transformers_supports_agent():
raise RuntimeError(
"Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'"
@@ -125,9 +125,9 @@ def model_id(self) -> str:
raise RuntimeError('Malformed service endpoint. (Possible malicious)') from None

@property
def framework(self) -> LiteralBackend:
def backend(self) -> LiteralBackend:
try:
return self._metadata['framework']
return self._metadata['backend']
except KeyError:
raise RuntimeError('Malformed service endpoint. (Possible malicious)') from None
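
A hedged usage sketch of the renamed client property; the client class and constructor below are assumed for illustration, only the `framework` → `backend` rename comes from this diff:

```python
# Assumed entry point for illustration; `backend` is now read from the service
# metadata ('backend' key) where `framework` used to be.
from openllm_client import HTTPClient  # assumed import

client = HTTPClient('http://localhost:3000')
print(client.model_name, client.backend)
```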

48 changes: 22 additions & 26 deletions openllm-core/src/openllm_core/_configuration.py
@@ -564,7 +564,7 @@ class ModelSettings(t.TypedDict, total=False):
architecture: Required[str]

# default OpenLLM runtime implementation
default_implementation: NotRequired[t.Dict[LiteralResourceSpec, LiteralBackend]]
default_backend: NotRequired[t.Dict[LiteralResourceSpec, LiteralBackend]]

# meta
url: str
@@ -594,7 +594,7 @@ class ModelSettings(t.TypedDict, total=False):

_transformed_type: DictStrAny = {
'fine_tune_strategies': t.Dict[AdapterType, FineTuneConfig],
'default_implementation': t.Dict[LiteralResourceSpec, LiteralBackend]
'default_backend': t.Dict[LiteralResourceSpec, LiteralBackend]
}

@attr.define(frozen=False,
@@ -625,7 +625,7 @@ def default(cls) -> _ModelSettingsAttr:
ModelSettings(default_id='__default__',
model_ids=['__default__'],
architecture='PreTrainedModel',
default_implementation={
default_backend={
'cpu': 'pt',
'nvidia.com/gpu': 'pt'
},
@@ -646,7 +646,7 @@ def default(cls) -> _ModelSettingsAttr:
default_id: str
model_ids: ListStr
architecture: str
default_implementation: t.Dict[LiteralResourceSpec, LiteralBackend]
default_backend: t.Dict[LiteralResourceSpec, LiteralBackend]
url: str
requires_gpu: bool
trust_remote_code: bool
@@ -664,15 +664,14 @@ def default(cls) -> _ModelSettingsAttr:
# update-config-stubs.py: attrs stop

# a heuristic cascading implementation resolver based on available resources
def get_default_implementation(
default_implementation_mapping: dict[LiteralResourceSpec, LiteralBackend]) -> LiteralBackend:
def get_default_backend(backend_mapping: dict[LiteralResourceSpec, LiteralBackend]) -> LiteralBackend:
available_spec = available_resource_spec()
if resource_spec('tpu') in available_spec: return default_implementation_mapping.get(resource_spec('tpu'), 'pt')
elif resource_spec('amd') in available_spec: return default_implementation_mapping.get(resource_spec('amd'), 'pt')
if resource_spec('tpu') in available_spec: return backend_mapping.get(resource_spec('tpu'), 'pt')
elif resource_spec('amd') in available_spec: return backend_mapping.get(resource_spec('amd'), 'pt')
elif resource_spec('nvidia') in available_spec:
return default_implementation_mapping.get(resource_spec('nvidia'), 'pt')
return backend_mapping.get(resource_spec('nvidia'), 'pt')
else:
return default_implementation_mapping.get(resource_spec('cpu'), 'pt')
return backend_mapping.get(resource_spec('cpu'), 'pt')

def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ModelSettingsAttr:
if 'generation_class' in cl_.__config__:
@@ -698,14 +697,14 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _

model_name = _final_value_dct['model_name'] if 'model_name' in _final_value_dct else _settings_attr.model_name
# if the default implementation dependencies don't exist, then always fall back to 'pt'
default_implementation = _settings_attr.default_implementation
for rs, runtime in default_implementation.items():
default_backend = _settings_attr.default_backend
for rs, runtime in default_backend.items():
library_stub = 'torch' if runtime == 'pt' else runtime
if not BACKENDS_MAPPING[library_stub][0](): default_implementation[rs] = 'pt'
_final_value_dct['default_implementation'] = default_implementation
if not BACKENDS_MAPPING[library_stub][0](): default_backend[rs] = 'pt'
_final_value_dct['default_backend'] = default_backend

env = openllm_core.utils.EnvVarMixin(model_name,
get_default_implementation(default_implementation),
backend=get_default_backend(default_backend),
model_id=_settings_attr.default_id)
_final_value_dct['env'] = env

@@ -861,11 +860,8 @@ def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None:
```bash
openllm start gpt-neox --model-id stabilityai/stablelm-tuned-alpha-3b
```'''
__openllm_default_implementation__: t.Dict[LiteralResourceSpec, LiteralBackend] = Field(None)
'''The default runtime to run this LLM. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`.

It is a dictionary of key as the accelerator spec in k4s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM Runtime ('flax', 'tf', 'pt', 'vllm')
'''
__openllm_default_backend__: t.Dict[LiteralResourceSpec, LiteralBackend] = Field(None)
'''The default backend to run LLM based on available accelerator. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`. It is a dictionary of key as the accelerator spec in k8s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM backend ('flax', 'tf', 'pt', 'vllm', 'ggml', 'mlc')'''
__openllm_url__: str = Field(None)
'''The resolved url for this LLMConfig.'''
__openllm_requires_gpu__: bool = Field(None)
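
For illustration, a hypothetical mapping of the shape `__openllm_default_backend__` describes, keyed by accelerator spec:

```python
# Hypothetical values; each model config defines its own mapping.
default_backend = {
    'cpu': 'pt',                         # PyTorch on CPU
    'nvidia.com/gpu': 'vllm',            # prefer vLLM when an NVIDIA GPU is present
    'cloud-tpus.google.com/v2': 'flax',  # prefer Flax/JAX on TPU
}
```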
@@ -1193,8 +1189,8 @@ def _make_subclass(class_attr: str,
annotated_names.add(attr_name)
val = cd.get(attr_name, attr.NOTHING)
if not isinstance(val, _CountingAttr):
if val is attr.NOTHING: val = cls.Field(env=field_env_key(cls.__openllm_model_name__, attr_name))
else: val = cls.Field(default=val, env=field_env_key(cls.__openllm_model_name__, attr_name))
if val is attr.NOTHING: val = cls.Field(env=field_env_key(attr_name))
else: val = cls.Field(default=val, env=field_env_key(attr_name))
these[attr_name] = val
unannotated = ca_names - annotated_names
if len(unannotated) > 0:
@@ -1274,7 +1270,7 @@ def __getitem__(self, item: t.Literal['model_ids']) -> ListStr: ...
@overload
def __getitem__(self, item: t.Literal['architecture']) -> str: ...
@overload
def __getitem__(self, item: t.Literal['default_implementation']) -> t.Dict[LiteralResourceSpec, LiteralBackend]: ...
def __getitem__(self, item: t.Literal['default_backend']) -> t.Dict[LiteralResourceSpec, LiteralBackend]: ...
@overload
def __getitem__(self, item: t.Literal['url']) -> str: ...
@overload
@@ -1640,9 +1636,9 @@ def peft_task_type(cls) -> str:
return _PEFT_TASK_TYPE_TARGET_MAPPING[cls.__openllm_model_type__]

@classmethod
def default_implementation(cls) -> LiteralBackend:
return first_not_none(cls.__openllm_env__['framework_value'],
default=get_default_implementation(cls.__openllm_default_implementation__))
def default_backend(cls) -> LiteralBackend:
return first_not_none(cls.__openllm_env__['backend_value'],
default=get_default_backend(cls.__openllm_default_backend__))
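
Putting the pieces together, a minimal sketch (with hypothetical helper names) of how the cascade above behaves: an explicit `OPENLLM_BACKEND` wins, otherwise the first matching accelerator in TPU > AMD > NVIDIA > CPU order selects the backend, falling back to PyTorch:

```python
# Simplified stand-in for get_default_backend / default_backend; the resource
# spec strings follow the docstring earlier in this file, the rest is illustrative.
import os
import typing as t

def pick_backend(mapping: t.Dict[str, str], available: t.Sequence[str]) -> str:
    env_value = os.environ.get('OPENLLM_BACKEND')
    if env_value:
        return env_value
    for spec in ('cloud-tpus.google.com/v2', 'amd.com/gpu', 'nvidia.com/gpu'):
        if spec in available:
            return mapping.get(spec, 'pt')
    return mapping.get('cpu', 'pt')

print(pick_backend({'nvidia.com/gpu': 'vllm'}, ['nvidia.com/gpu']))  # 'vllm'
```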

def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]:
'''This handler will sanitize all attrs and setup prompt text.
2 changes: 1 addition & 1 deletion openllm-core/src/openllm_core/_schema.py
@@ -77,7 +77,7 @@ class MetadataOutput:
model_id: str
timeout: int
model_name: str
framework: str
backend: str
configuration: str
supports_embeddings: bool
supports_hf_agent: bool
5 changes: 4 additions & 1 deletion openllm-core/src/openllm_core/_typing_compat.py
@@ -37,6 +37,9 @@
't.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]'
)

def get_literal_args(typ: t.Any) -> tuple[str, ...]:
return getattr(typ, '__args__')
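
A quick illustration of the new helper with a stand-in literal type (the stand-in is hypothetical; the real `LiteralBackend` is defined elsewhere in this module):

```python
import typing as t

BackendDemo = t.Literal['pt', 'tf', 'flax', 'vllm']  # stand-in for illustration
assert get_literal_args(BackendDemo) == ('pt', 'tf', 'flax', 'vllm')
```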

AnyCallable = t.Callable[..., t.Any]
DictStrAny = t.Dict[str, t.Any]
ListAny = t.List[t.Any]
@@ -109,7 +112,7 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
identifying_params: dict[str, t.Any]
llm: openllm.LLM[M, T]
config: openllm.LLMConfig
implementation: LiteralBackend
backend: LiteralBackend
supports_embeddings: bool
supports_hf_agent: bool
has_adapters: bool
openllm-core/src/openllm_core/config/configuration_flan_t5.py
@@ -17,14 +17,14 @@
By default, this model will use the PyTorch model for inference. However, this model supports both Flax and Tensorflow.

\b
- To use Flax, set the environment variable ``OPENLLM_FLAN_T5_FRAMEWORK="flax"``
- To use Flax, set the environment variable ``OPENLLM_BACKEND="flax"``

\b
- To use Tensorflow, set the environment variable ``OPENLLM_FLAN_T5_FRAMEWORK="tf"``
- To use Tensorflow, set the environment variable ``OPENLLM_BACKEND="tf"``

\b
FLAN-T5 Runner will use google/flan-t5-large as the default model. To change to any other FLAN-T5
saved pretrained, or a fine-tune FLAN-T5, provide ``OPENLLM_FLAN_T5_MODEL_ID='google/flan-t5-xxl'``
saved pretrained, or a fine-tune FLAN-T5, provide ``OPENLLM_MODEL_ID='google/flan-t5-xxl'``
or provide `--model-id` flag when running ``openllm start flan-t5``:

\b
9 changes: 6 additions & 3 deletions openllm-core/src/openllm_core/config/configuration_llama.py
@@ -19,11 +19,14 @@
This model also supports PyTorch.

\b
- To use PyTorch, set the environment variable ``OPENLLM_LLAMA_FRAMEWORK="pt"``
- To use PyTorch, set the environment variable ``OPENLLM_BACKEND="pt"``

\b
- To use vLLM, set the environment variable ``OPENLLM_BACKEND="vllm"``

\b
Llama Runner will use decapoda-research/llama-7b-hf as the default model. To change to any other Llama
saved pretrained, or a fine-tune Llama, provide ``OPENLLM_LLAMA_MODEL_ID='openlm-research/open_llama_7b_v2'``
saved pretrained, or a fine-tune Llama, provide ``OPENLLM_MODEL_ID='openlm-research/open_llama_7b_v2'``
or provide `--model-id` flag when running ``openllm start llama``:

\b
@@ -70,7 +73,7 @@ class LlamaConfig(openllm_core.LLMConfig):
'lowercase',
'url':
'https://github.com/facebookresearch/llama',
'default_implementation': {
'default_backend': {
'cpu': 'pt',
'nvidia.com/gpu': 'pt'
},
6 changes: 3 additions & 3 deletions openllm-core/src/openllm_core/config/configuration_opt.py
@@ -18,14 +18,14 @@
By default, this model will use the PyTorch model for inference. However, this model supports both Flax and Tensorflow.

\b
- To use Flax, set the environment variable ``OPENLLM_OPT_FRAMEWORK="flax"``
- To use Flax, set the environment variable ``OPENLLM_BACKEND="flax"``

\b
- To use Tensorflow, set the environment variable ``OPENLLM_OPT_FRAMEWORK="tf"``
- To use Tensorflow, set the environment variable ``OPENLLM_BACKEND="tf"``

\b
OPT Runner will use facebook/opt-2.7b as the default model. To change to any other OPT
saved pretrained, or a fine-tune OPT, provide ``OPENLLM_OPT_MODEL_ID='facebook/opt-6.7b'``
saved pretrained, or a fine-tune OPT, provide ``OPENLLM_MODEL_ID='facebook/opt-6.7b'``
or provide `--model-id` flag when running ``openllm start opt``:

\b
12 changes: 8 additions & 4 deletions openllm-core/src/openllm_core/utils/__init__.py
@@ -96,6 +96,9 @@ def generate_hash_from_file(f: str, algorithm: t.Literal['md5', 'sha1'] = 'sha1'
def device_count() -> int:
return len(available_devices())

def check_bool_env(env: str, default: bool = True) -> bool:
return os.environ.get(env, str(default)).upper() in ENV_VARS_TRUE_VALUES
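
A brief usage sketch, assuming `ENV_VARS_TRUE_VALUES` contains the usual truthy strings such as '1', 'TRUE', 'YES' and 'ON':

```python
import os

os.environ['OPENLLM_DO_NOT_TRACK'] = 'true'                           # truthy string
assert check_bool_env('OPENLLM_DO_NOT_TRACK', default=False) is True

del os.environ['OPENLLM_DO_NOT_TRACK']                                # unset -> default
assert check_bool_env('OPENLLM_DO_NOT_TRACK', default=False) is False
```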

# equivocal setattr to save one lookup per assignment
_object_setattr = object.__setattr__

@@ -108,10 +111,10 @@ def field_env_key(key: str, suffix: str | None = None) -> str:
return '_'.join(filter(None, map(str.upper, ['OPENLLM', suffix.strip('_') if suffix else '', key])))
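
With the per-model prefix gone, the generated keys are model-agnostic; following the join logic above:

```python
assert field_env_key('model_id') == 'OPENLLM_MODEL_ID'
# the suffix argument is inserted between 'OPENLLM' and the key (illustrative call):
assert field_env_key('backend', suffix='value') == 'OPENLLM_VALUE_BACKEND'
```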

# Special debug flag controlled via OPENLLMDEVDEBUG
DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment) or (str(os.environ.get(
DEV_DEBUG_VAR, None)).upper() in ENV_VARS_TRUE_VALUES)
DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and check_bool_env(DEV_DEBUG_VAR, default=False))
# Whether to show the codegen for debug purposes
SHOW_CODEGEN: bool = DEBUG and int(os.environ.get('OPENLLMDEVDEBUG', str(0))) > 3
SHOW_CODEGEN: bool = DEBUG and (os.environ.get(DEV_DEBUG_VAR, str(0)).isdigit() and
int(os.environ.get(DEV_DEBUG_VAR, str(0))) > 3)
# MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins
MYPY = False

@@ -195,6 +198,7 @@ def configure_logging() -> None:
_LOGGING_CONFIG['loggers']['bentoml']['level'] = logging.ERROR
_LOGGING_CONFIG['root']['level'] = logging.ERROR
elif get_debug_mode() or DEBUG:
_LOGGING_CONFIG['handlers']['defaulthandler']['level'] = logging.DEBUG
_LOGGING_CONFIG['loggers']['openllm']['level'] = logging.DEBUG
_LOGGING_CONFIG['loggers']['bentoml']['level'] = logging.DEBUG
_LOGGING_CONFIG['root']['level'] = logging.DEBUG
@@ -332,8 +336,8 @@ def normalize_attrs_to_model_tokenizer_pair(**attrs: t.Any) -> tuple[dict[str, t
'analytics': [],
'codegen': [],
'dantic': [],
'lazy': [],
'representation': ['ReprMixin'],
'lazy': ['LazyModule'],
'import_utils': [
'OPTIONAL_DEPENDENCIES', 'DummyMetaclass', 'EnvVarMixin', 'require_backends', 'is_cpm_kernels_available',
'is_einops_available', 'is_flax_available', 'is_tf_available', 'is_vllm_available', 'is_torch_available',
3 changes: 1 addition & 2 deletions openllm-core/src/openllm_core/utils/analytics.py
@@ -24,11 +24,10 @@

# This variable is a proxy that will control BENTOML_DO_NOT_TRACK
OPENLLM_DO_NOT_TRACK = 'OPENLLM_DO_NOT_TRACK'
DO_NOT_TRACK = os.environ.get(OPENLLM_DO_NOT_TRACK, str(False)).upper()

@functools.lru_cache(maxsize=1)
def do_not_track() -> bool:
return DO_NOT_TRACK in openllm_core.utils.ENV_VARS_TRUE_VALUES
return openllm_core.utils.check_bool_env(OPENLLM_DO_NOT_TRACK)

@functools.lru_cache(maxsize=1)
def _usage_event_debugging() -> bool: