refactor(breaking): unify LLM API #283

Merged · 9 commits · Sep 1, 2023
Changes from 1 commit
fix: rename backend and cleanup runtime [wip]
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
aarnphm committed Aug 31, 2023
commit f224e70108b4bf747c592013ee6efdf6f21eeae6
3 changes: 1 addition & 2 deletions .github/SECURITY.md
@@ -8,8 +8,7 @@ are backward compatible. We are more lenient with patch as the development can
move quickly.

If you are just using public API, then feel free to always upgrade. Whenever
there is a breaking policies, it will become a `DeprecationWarning` with a
period of 12 months before becoming broken.
there is a breaking policies, it will be announced and will be broken.

> [!WARNING]
> Everything package under `openllm` that has an underscore prefixes
4 changes: 2 additions & 2 deletions openllm-client/src/openllm_client/_base.py
@@ -28,7 +28,7 @@
import transformers

from openllm_core._typing_compat import DictStrAny
from openllm_core._typing_compat import LiteralRuntime
from openllm_core._typing_compat import LiteralBackend

logger = logging.getLogger(__name__)

@@ -125,7 +125,7 @@ def model_id(self) -> str:
raise RuntimeError('Malformed service endpoint. (Possible malicious)') from None

@property
def framework(self) -> LiteralRuntime:
def framework(self) -> LiteralBackend:
try:
return self._metadata['framework']
except KeyError:
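The client-side effect of this rename is purely at the type level: `framework` still reads the same `framework` key from the server metadata, but its annotation now draws from the wider `LiteralBackend` union. A minimal sketch of a caller relying on it, assuming the usual `HTTPClient` entry point (the import path and server address here are illustrative):

```python
import typing as t

from openllm_client import HTTPClient  # assumed import path for this client package

client = HTTPClient('http://localhost:3000')  # hypothetical local server
backend: t.Literal['pt', 'tf', 'flax', 'vllm', 'ggml', 'mlc'] = client.framework
if backend == 'vllm':
  # e.g. toggle behaviour that only the vLLM backend supports
  ...
```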
60 changes: 25 additions & 35 deletions openllm-core/src/openllm_core/_configuration.py
@@ -66,7 +66,7 @@ class GenerationConfig:
from ._typing_compat import At
from ._typing_compat import DictStrAny
from ._typing_compat import ListStr
from ._typing_compat import LiteralRuntime
from ._typing_compat import LiteralBackend
from ._typing_compat import LiteralString
from ._typing_compat import NotRequired
from ._typing_compat import Required
@@ -311,7 +311,7 @@ class GenerationConfig(ReprMixin):
eta_cutoff: float = dantic.Field(
0.0,
description=
'''Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details. '''
'Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details. '
)
diversity_penalty: float = dantic.Field(
0.0,
@@ -386,17 +386,17 @@ class GenerationConfig(ReprMixin):
output_attentions: bool = dantic.Field(
False,
description=
'''Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.'''
'Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.'
)
output_hidden_states: bool = dantic.Field(
False,
description=
'''Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.'''
'Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.'
)
output_scores: bool = dantic.Field(
False,
description=
'''Whether or not to return the prediction scores. See `scores` under returned tensors for more details.''')
description='Whether or not to return the prediction scores. See `scores` under returned tensors for more details.'
)
pad_token_id: int = dantic.Field(description='The id of the *padding* token.')
bos_token_id: int = dantic.Field(description='The id of the *beginning-of-sequence* token.')
eos_token_id: t.Union[int, t.List[int]] = dantic.Field(
@@ -564,7 +564,7 @@ class ModelSettings(t.TypedDict, total=False):
architecture: Required[str]

# default OpenLLM runtime imlementation
default_implementation: NotRequired[t.Dict[LiteralResourceSpec, LiteralRuntime]]
default_implementation: NotRequired[t.Dict[LiteralResourceSpec, LiteralBackend]]

# meta
url: str
@@ -575,7 +575,6 @@

# llm implementation specifics
model_type: t.Literal['causal_lm', 'seq2seq_lm']
runtime: t.Literal['transformers', 'ggml']

# naming convention, only name_type is needed to infer from the class
# as the three below it can be determined automatically
@@ -595,7 +594,7 @@ class ModelSettings(t.TypedDict, total=False):

_transformed_type: DictStrAny = {
'fine_tune_strategies': t.Dict[AdapterType, FineTuneConfig],
'default_implementation': t.Dict[LiteralResourceSpec, LiteralRuntime]
'default_implementation': t.Dict[LiteralResourceSpec, LiteralBackend]
}

@attr.define(frozen=False,
@@ -639,23 +638,21 @@ def default(cls) -> _ModelSettingsAttr:
tokenizer_class=None,
timeout=int(36e6),
service_name='',
workers_per_resource=1.,
runtime='transformers')))
workers_per_resource=1.)))

# NOTE: The below are dynamically generated by the field_transformer
if t.TYPE_CHECKING:
# update-config-stubs.py: attrs start
default_id: str
model_ids: ListStr
architecture: str
default_implementation: t.Dict[LiteralResourceSpec, LiteralRuntime]
default_implementation: t.Dict[LiteralResourceSpec, LiteralBackend]
url: str
requires_gpu: bool
trust_remote_code: bool
service_name: str
requirements: t.Optional[ListStr]
model_type: t.Literal['causal_lm', 'seq2seq_lm']
runtime: t.Literal['transformers', 'ggml']
name_type: t.Optional[t.Literal['dasherize', 'lowercase']]
model_name: str
start_name: str
@@ -668,7 +665,7 @@ def default(cls) -> _ModelSettingsAttr:

# a heuristic cascading implementation resolver based on available resources
def get_default_implementation(
default_implementation_mapping: dict[LiteralResourceSpec, LiteralRuntime]) -> LiteralRuntime:
default_implementation_mapping: dict[LiteralResourceSpec, LiteralBackend]) -> LiteralBackend:
available_spec = available_resource_spec()
if resource_spec('tpu') in available_spec: return default_implementation_mapping.get(resource_spec('tpu'), 'pt')
elif resource_spec('amd') in available_spec: return default_implementation_mapping.get(resource_spec('amd'), 'pt')
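For context, a hedged sketch of the cascade this resolver implements: probe the most specific accelerator first and fall back to `'pt'` when no mapping entry matches. Only the `'tpu'` and `'amd'` branches are visible in this hunk, so the `'nvidia'` and `'cpu'` steps below are assumptions:

```python
from __future__ import annotations

import typing as t

from openllm_core._typing_compat import LiteralBackend

# Stand-in for the resolver above; real code keys the mapping by
# resource_spec(...) results rather than these short names.
def pick_backend(available: t.Sequence[str], mapping: dict[str, LiteralBackend]) -> LiteralBackend:
  for spec in ('tpu', 'amd', 'nvidia', 'cpu'):  # 'nvidia' and 'cpu' branches are assumed
    if spec in available:
      return mapping.get(spec, 'pt')  # fall back to PyTorch, per the defaults above
  return 'pt'
```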
@@ -766,16 +763,16 @@ class _ConfigAttr:

@staticmethod
def Field(default: t.Any = None, **attrs: t.Any) -> t.Any:
return dantic.Field(default, **attrs)
'''Field is a alias to the internal dantic utilities to easily create
attrs.fields with pydantic-compatible interface. For example:

'''Field is a alias to the internal dantic utilities to easily create
attrs.fields with pydantic-compatible interface. For example:

```python
class MyModelConfig(openllm.LLMConfig):
field1 = openllm.LLMConfig.Field(...)
```
```python
class MyModelConfig(openllm.LLMConfig):
field1 = openllm.LLMConfig.Field(...)
```
'''
return dantic.Field(default, **attrs)

# NOTE: The following is handled via __init_subclass__, and is only used for TYPE_CHECKING
if t.TYPE_CHECKING:
# NOTE: public attributes to override
@@ -864,7 +861,7 @@ def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None:
```bash
openllm start gpt-neox --model-id stabilityai/stablelm-tuned-alpha-3b
```'''
__openllm_default_implementation__: t.Dict[LiteralResourceSpec, LiteralRuntime] = Field(None)
__openllm_default_implementation__: t.Dict[LiteralResourceSpec, LiteralBackend] = Field(None)
'''The default runtime to run this LLM. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`.

It is a dictionary of key as the accelerator spec in k4s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM Runtime ('flax', 'tf', 'pt', 'vllm')
@@ -876,16 +873,11 @@ def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None:
__openllm_trust_remote_code__: bool = Field(None)
'''Whether to always trust remote code'''
__openllm_service_name__: str = Field(None)
"""Generated service name for this LLMConfig. By default, it is 'generated_{model_name}_service.py'"""
'''Generated service name for this LLMConfig. By default, it is "generated_{model_name}_service.py"'''
__openllm_requirements__: t.Optional[ListStr] = Field(None)
'''The default PyPI requirements needed to run this given LLM. By default, we will depend on
bentoml, torch, transformers.'''
'''The default PyPI requirements needed to run this given LLM. By default, we will depend on bentoml, torch, transformers.'''
__openllm_model_type__: t.Literal['causal_lm', 'seq2seq_lm'] = Field(None)
'''The model type for this given LLM. By default, it should be causal language modeling.
Currently supported 'causal_lm' or 'seq2seq_lm'
'''
__openllm_runtime__: t.Literal['transformers', 'ggml'] = Field(None)
'''The runtime to use for this model. Possible values are `transformers` or `ggml`. See Llama for more information.'''
'''The model type for this given LLM. By default, it should be causal language modeling. Currently supported "causal_lm" or "seq2seq_lm"'''
__openllm_name_type__: t.Optional[t.Literal['dasherize', 'lowercase']] = Field(None)
'''The default name typed for this model. "dasherize" will convert the name to lowercase and
replace spaces with dashes. "lowercase" will convert the name to lowercase. If this is not set, then both
@@ -1282,7 +1274,7 @@ def __getitem__(self, item: t.Literal['model_ids']) -> ListStr: ...
@overload
def __getitem__(self, item: t.Literal['architecture']) -> str: ...
@overload
def __getitem__(self, item: t.Literal['default_implementation']) -> t.Dict[LiteralResourceSpec, LiteralRuntime]: ...
def __getitem__(self, item: t.Literal['default_implementation']) -> t.Dict[LiteralResourceSpec, LiteralBackend]: ...
@overload
def __getitem__(self, item: t.Literal['url']) -> str: ...
@overload
@@ -1296,8 +1288,6 @@ def __getitem__(self, item: t.Literal['requirements']) -> t.Optional[ListStr]: .
@overload
def __getitem__(self, item: t.Literal['model_type']) -> t.Literal['causal_lm', 'seq2seq_lm']: ...
@overload
def __getitem__(self, item: t.Literal['runtime']) -> t.Literal['transformers', 'ggml']: ...
@overload
def __getitem__(self, item: t.Literal['name_type']) -> t.Optional[t.Literal['dasherize', 'lowercase']]: ...
@overload
def __getitem__(self, item: t.Literal['model_name']) -> str: ...
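With the `runtime` overload dropped, typed subscript access covers only the remaining keys. A sketch, assuming `config` is an instance of any `LLMConfig` subclass:

```python
impl = config['default_implementation']  # typed t.Dict[LiteralResourceSpec, LiteralBackend]
config['runtime']  # no overload (and no underlying key) anymore; this now fails
```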
@@ -1650,7 +1640,7 @@ def peft_task_type(cls) -> str:
return _PEFT_TASK_TYPE_TARGET_MAPPING[cls.__openllm_model_type__]

@classmethod
def default_implementation(cls) -> LiteralRuntime:
def default_implementation(cls) -> LiteralBackend:
return first_not_none(cls.__openllm_env__['framework_value'],
default=get_default_implementation(cls.__openllm_default_implementation__))

5 changes: 2 additions & 3 deletions openllm-core/src/openllm_core/_typing_compat.py
@@ -44,7 +44,7 @@
TupleAny = t.Tuple[t.Any, ...]
At = t.TypeVar('At', bound=attr.AttrsInstance)

LiteralRuntime = t.Literal['pt', 'tf', 'flax', 'vllm']
LiteralBackend = t.Literal['pt', 'tf', 'flax', 'vllm', 'ggml', 'mlc']
AdapterType = t.Literal['lora', 'adalora', 'adaption_prompt', 'prefix_tuning', 'p_tuning', 'prompt_tuning', 'ia3']

# TODO: support quay
@@ -96,7 +96,6 @@ class LLMRunnable(bentoml.Runnable, t.Generic[M, T]):
SUPPORTED_RESOURCES = ('amd.com/gpu', 'nvidia.com/gpu', 'cpu')
SUPPORTS_CPU_MULTI_THREADING = True
__call__: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]]
set_adapter: RunnableMethod[LLMRunnable[M, T], [str], dict[t.Literal['success', 'error_msg'], bool | str]]
embeddings: RunnableMethod[LLMRunnable[M, T], [list[str]], EmbeddingsOutput]
generate: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]]
generate_one: RunnableMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal['generated_text'], str]]]
@@ -110,7 +109,7 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
identifying_params: dict[str, t.Any]
llm: openllm.LLM[M, T]
config: openllm.LLMConfig
implementation: LiteralRuntime
implementation: LiteralBackend
supports_embeddings: bool
supports_hf_agent: bool
has_adapters: bool
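The widened literal means values such as `'ggml'` and `'mlc'` now type-check anywhere a backend is expected. A quick sketch (the helper function is illustrative, not OpenLLM API):

```python
from openllm_core._typing_compat import LiteralBackend

def describe(backend: LiteralBackend) -> str:  # illustrative helper
  return f'backend={backend}'

describe('ggml')  # valid now; 'ggml' was not part of the old LiteralRuntime union
```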
@@ -48,14 +48,14 @@ def get_special_token_id(tokenizer: transformers.PreTrainedTokenizer, key: str)
treated specially and converted to a single, new token. This retrieves the token ID each of these keys map to.

Args:
tokenizer: the tokenizer
key: the key to convert to a single token
tokenizer: the tokenizer
key: the key to convert to a single token

Raises:
RuntimeError: if more than one ID was generated
RuntimeError: if more than one ID was generated

Returns:
int: the token ID for the given key.
int: the token ID for the given key.
'''
token_ids = tokenizer.encode(key)
if len(token_ids) > 1: raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
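Hypothetical usage of the contract documented above, assuming the key was registered as a single added special token (the model and token string are illustrative):

```python
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')  # illustrative model choice
tokenizer.add_special_tokens({'additional_special_tokens': ['### End']})
end_id = get_special_token_id(tokenizer, '### End')  # one ID; raises ValueError otherwise
```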
12 changes: 6 additions & 6 deletions openllm-core/src/openllm_core/utils/__init__.py
@@ -47,12 +47,12 @@
try:
from typing import GenericAlias as _TypingGenericAlias # type: ignore
except ImportError:
_TypingGenericAlias = (
) # type: ignore # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on)
# python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on)
_TypingGenericAlias = () # type: ignore
if sys.version_info < (3, 10): _WithArgsTypes = (_TypingGenericAlias,)
else:
_WithArgsTypes: t.Any = (t._GenericAlias, types.GenericAlias, types.UnionType
) # type: ignore # _GenericAlias is the actual GenericAlias implementation
# _GenericAlias is the actual GenericAlias implementation
_WithArgsTypes: t.Any = (t._GenericAlias, types.GenericAlias, types.UnionType) # type: ignore

DEV_DEBUG_VAR = 'OPENLLMDEVDEBUG'

@@ -104,8 +104,8 @@ def non_intrusive_setattr(obj: t.Any, name: str, value: t.Any) -> None:
_setattr = functools.partial(setattr, obj) if isinstance(obj, type) else _object_setattr.__get__(obj)
if not hasattr(obj, name): _setattr(name, value)

def field_env_key(model_name: str, key: str, suffix: str | None = None) -> str:
return '_'.join(filter(None, map(str.upper, ['OPENLLM', model_name, suffix.strip('_') if suffix else '', key])))
def field_env_key(key: str, suffix: str | None = None) -> str:
return '_'.join(filter(None, map(str.upper, ['OPENLLM', suffix.strip('_') if suffix else '', key])))

# Special debug flag controled via OPENLLMDEVDEBUG
DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment) or (str(os.environ.get(
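Dropping `model_name` from the signature makes the generated variable names global rather than per-model. Based on the new implementation above:

```python
field_env_key('model_id')                  # -> 'OPENLLM_MODEL_ID'
field_env_key('quantize', suffix='gptq_')  # suffix stripped of '_': 'OPENLLM_GPTQ_QUANTIZE'
```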
7 changes: 2 additions & 5 deletions openllm-core/src/openllm_core/utils/codegen.py
@@ -139,11 +139,8 @@ def identity(_: str, x_value: t.Any) -> t.Any:
'__model_name': model_name,
})
lines: ListStr = [
'__env = lambda field_name: __field_env(__model_name, field_name, __suffix)', 'return [', ' f.evolve(',
' default=__populate_env(__default_callback(f.name, f.default), __env(f.name)),', ' metadata={',
" 'env': f.metadata.get('env', __env(f.name)),",
" 'description': f.metadata.get('description', '(not provided)'),", ' },', ' )',
' for f in fields', ']'
'__env=lambda field_name:__field_env(field_name,__suffix)',
"return [f.evolve(default=__populate_env(__default_callback(f.name,f.default),__env(f.name)),metadata={'env':f.metadata.get('env',__env(f.name)),'description':f.metadata.get('description', '(not provided)')}) for f in fields]"
]
fields_ann = 'list[attr.Attribute[t.Any]]'
return generate_function(cls,
32 changes: 8 additions & 24 deletions openllm-core/src/openllm_core/utils/import_utils.py
@@ -24,7 +24,7 @@

if t.TYPE_CHECKING:
BackendOrderedDict = OrderedDict[str, t.Tuple[t.Callable[[], bool], str]]
from openllm_core._typing_compat import LiteralRuntime
from openllm_core._typing_compat import LiteralBackend

logger = logging.getLogger(__name__)
OPTIONAL_DEPENDENCIES = {
@@ -337,7 +337,6 @@ class EnvVarMixin(ReprMixin):
model_id: str
quantize: str
framework: str
runtime: str

@overload
def __getitem__(self, item: t.Literal['config']) -> str:
@@ -356,11 +355,7 @@ def __getitem__(self, item: t.Literal['framework']) -> str:
...

@overload
def __getitem__(self, item: t.Literal['runtime']) -> str:
...

@overload
def __getitem__(self, item: t.Literal['framework_value']) -> LiteralRuntime:
def __getitem__(self, item: t.Literal['framework_value']) -> LiteralBackend:
...

@overload
@@ -371,52 +366,41 @@ def __getitem__(self, item: t.Literal['quantize_value']) -> t.Literal['int8', 'i
def __getitem__(self, item: t.Literal['model_id_value']) -> str | None:
...

@overload
def __getitem__(self, item: t.Literal['runtime_value']) -> t.Literal['ggml', 'transformers']:
...

def __getitem__(self, item: str | t.Any) -> t.Any:
if item.endswith('_value') and hasattr(self, f'_{item}'): return object.__getattribute__(self, f'_{item}')()
elif hasattr(self, item): return getattr(self, item)
raise KeyError(f'Key {item} not found in {self}')

def __init__(self,
model_name: str,
implementation: LiteralRuntime = 'pt',
implementation: LiteralBackend = 'pt',
model_id: str | None = None,
quantize: LiteralString | None = None,
runtime: t.Literal['ggml', 'transformers'] = 'transformers') -> None:
quantize: LiteralString | None = None) -> None:
'''EnvVarMixin is a mixin class that returns the value extracted from environment variables.'''
from openllm_core.utils import field_env_key
self.model_name = inflection.underscore(model_name)
self._implementation = implementation
self._model_id = model_id
self._quantize = quantize
self._runtime = runtime
for att in {'config', 'model_id', 'quantize', 'framework', 'runtime'}:
for att in {'config', 'model_id', 'quantize', 'framework'}:
setattr(self, att, field_env_key(self.model_name, att.upper()))

def _quantize_value(self) -> t.Literal['int8', 'int4', 'gptq'] | None:
from . import first_not_none
return t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']],
first_not_none(os.environ.get(self['quantize']), default=self._quantize))

def _framework_value(self) -> LiteralRuntime:
def _framework_value(self) -> LiteralBackend:
from . import first_not_none
return t.cast(LiteralRuntime, first_not_none(os.environ.get(self['framework']), default=self._implementation))
return t.cast(LiteralBackend, first_not_none(os.environ.get(self['framework']), default=self._implementation))

def _model_id_value(self) -> str | None:
from . import first_not_none
return first_not_none(os.environ.get(self['model_id']), default=self._model_id)

def _runtime_value(self) -> t.Literal['ggml', 'transformers']:
from . import first_not_none
return t.cast(t.Literal['ggml', 'transformers'],
first_not_none(os.environ.get(self['runtime']), default=self._runtime))

@property
def __repr_keys__(self) -> set[str]:
return {'config', 'model_id', 'quantize', 'framework', 'runtime'}
return {'config', 'model_id', 'quantize', 'framework'}

@property
def start_docstring(self) -> str:
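A minimal sketch of the slimmed-down mixin in use, based on the constructor shown above (the model name is illustrative):

```python
env = EnvVarMixin('flan-t5', implementation='vllm')
env['framework_value']  # os.environ override if set, otherwise 'vllm'
env['model_id_value']   # os.environ override if set, otherwise None
env['runtime_value']    # now raises KeyError: the runtime axis is gone
```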