refactor(breaking): unify LLM API #283

Merged: 9 commits, Sep 1, 2023
chore: update overload types
Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
aarnphm committed Sep 1, 2023
commit e292fe02f106829e13155f2f47b14d6a0ca49a2c
30 changes: 16 additions & 14 deletions openllm-python/src/openllm/_llm.py
@@ -1118,17 +1118,16 @@ def Runner(model_name: str,
max_batch_size: int | None = ...,
max_latency_ms: int | None = ...,
method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = ...,
embedded: t.Literal[True, False] = ...,
scheduling_strategy: type[bentoml.Strategy] | None = ...,
**attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
...

@overload
def Runner(model_name: str,
*,
ensure_available: bool | None = None,
ensure_available: bool = ...,
init_local: bool = ...,
implementation: LiteralBackend | None = None,
backend: LiteralBackend | None = None,
llm_config: LLMConfig | None = None,
**attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
...
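
For readers following the API change, a minimal usage sketch (illustrative only, not part of the diff): after this commit the runner is constructed with a `backend` keyword instead of the old `implementation` keyword. The model name and backend value below are placeholders chosen to match the overload signatures shown above.

import openllm

# Illustrative placeholders: 'opt' and backend='pt' stand in for any supported
# model and backend; ensure_available mirrors the overload parameter above.
runner = openllm.Runner('opt', backend='pt', ensure_available=True)
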
@@ -1192,9 +1191,13 @@ def download():
first_not_none(os.environ.get('OPENLLM_SERIALIZATION'), attrs.get('serialisation'), default='safetensors')
})

default_backend = llm_config.default_backend() if llm_config is not None else 'pt'
backend = t.cast(LiteralBackend,
first_not_none(backend, default=EnvVarMixin(model_name, backend=default_backend)['backend_value']))
backend = t.cast(
LiteralBackend,
first_not_none(backend,
default=EnvVarMixin(
model_name,
backend=llm_config.default_backend() if llm_config is not None else 'pt')['backend_value']))
if init_local: ensure_available = True
runner = infer_auto_class(backend).create_runner(model_name,
llm_config=llm_config,
ensure_available=ensure_available,
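
As a rough restatement of the resolution order implemented above (a sketch under the assumption that EnvVarMixin maps the model name to a per-model environment variable such as OPENLLM_OPT_BACKEND; the helper below is hypothetical and not part of the codebase):

from __future__ import annotations
import os

def resolve_backend(model_name: str, backend: str | None = None, llm_config=None) -> str:
    # 1) an explicit backend argument wins
    if backend is not None:
        return backend
    # 2) otherwise consult the per-model environment variable (assumed naming scheme)
    env_value = os.environ.get(f"OPENLLM_{model_name.upper().replace('-', '_')}_BACKEND")
    if env_value is not None:
        return env_value
    # 3) finally fall back to the config default, or 'pt' when no config is given
    return llm_config.default_backend() if llm_config is not None else 'pt'
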
@@ -1218,9 +1221,8 @@ class _Runnable(bentoml.Runnable):
backend = self.__llm_backend__

def __init__(__self: _Runnable):
# NOTE: The side effect of this line
# is that it will load the imported model during
# runner startup. So don't remove it!!
# NOTE: The side effect of this line is that it will load the
# imported model during runner startup. So don't remove it!!
if not self.model: raise RuntimeError('Failed to load the model correctly (See traceback above)')
if self.adapters_mapping is not None:
logger.info('Applying LoRA to %s...', self.runner_name)
@@ -1232,31 +1234,31 @@ def set_adapter(__self: _Runnable, adapter_name: str) -> None:
if adapter_name != 'default': self.model.set_adapter(adapter_name)
logger.info('Successfully apply LoRA layer %s', adapter_name)

@bentoml.Runnable.method(**method_signature(embeddings_sig))
@bentoml.Runnable.method(**method_signature(embeddings_sig)) # type: ignore
def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[EmbeddingsOutput]:
return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)]

@bentoml.Runnable.method(**method_signature(generate_sig))
@bentoml.Runnable.method(**method_signature(generate_sig)) # type: ignore
def __call__(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
adapter_name = attrs.pop('adapter_name', None)
if adapter_name is not None: __self.set_adapter(adapter_name)
return self.generate(prompt, **attrs)

@bentoml.Runnable.method(**method_signature(generate_sig))
@bentoml.Runnable.method(**method_signature(generate_sig)) # type: ignore
def generate(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
adapter_name = attrs.pop('adapter_name', None)
if adapter_name is not None: __self.set_adapter(adapter_name)
if __self.backend == 'vllm': attrs.setdefault('request_id', openllm_core.utils.gen_random_uuid())
return self.generate(prompt, **attrs)

@bentoml.Runnable.method(**method_signature(generate_sig))
@bentoml.Runnable.method(**method_signature(generate_sig)) # type: ignore
def generate_one(__self: _Runnable, prompt: str, stop: list[str],
**attrs: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]:
adapter_name = attrs.pop('adapter_name', None)
if adapter_name is not None: __self.set_adapter(adapter_name)
return self.generate_one(prompt, stop, **attrs)

@bentoml.Runnable.method(**method_signature(generate_iterator_sig))
@bentoml.Runnable.method(**method_signature(generate_iterator_sig)) # type: ignore
def generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.Generator[str, None, str]:
adapter_name = attrs.pop('adapter_name', None)
if adapter_name is not None: __self.set_adapter(adapter_name)
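
To put the runnable methods above in context, a minimal serving sketch (an assumption for illustration, not taken from this PR): the runner is mounted in a BentoML service and its generate method is awaited at request time. The service and endpoint names are hypothetical.

import bentoml
import openllm

# Placeholder model/backend values; 'llm-generate' is an arbitrary service name.
runner = openllm.Runner('opt', backend='pt')
svc = bentoml.Service('llm-generate', runners=[runner])

@svc.api(input=bentoml.io.Text(), output=bentoml.io.JSON())
async def generate(prompt: str) -> list:
    # Runner methods are invoked through .async_run when serving.
    return await runner.generate.async_run(prompt)
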
5 changes: 4 additions & 1 deletion openllm-python/src/openllm/bundle/_package.py
@@ -125,7 +125,10 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
return PythonOptions(packages=packages,
wheels=wheels,
lock_packages=False,
extra_index_url=['https://download.pytorch.org/whl/cu118'])
extra_index_url=[
'https://download.pytorch.org/whl/cu118',
'https://huggingface.github.io/autogptq-index/whl/cu118/'
])
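
The second index appears to be the AutoGPTQ wheel index for CUDA 11.8, so pip can resolve auto-gptq builds alongside the PyTorch cu118 wheels. Roughly equivalent as a manual install sketch (an illustration, not part of the change):

import subprocess, sys

# Hypothetical manual equivalent of the extra_index_url entries added above.
subprocess.check_call([
    sys.executable, '-m', 'pip', 'install', 'auto-gptq',
    '--extra-index-url', 'https://download.pytorch.org/whl/cu118',
    '--extra-index-url', 'https://huggingface.github.io/autogptq-index/whl/cu118/',
])
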

def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float,
quantize: LiteralString | None, adapter_map: dict[str, str | None] | None,