feat: continuous batching with vLLM #349

Merged
merged 5 commits on Sep 14, 2023
Changes from 1 commit
chore: add changelog
Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>
aarnphm committed Sep 14, 2023
commit 7e59e1e76442239e1aa8c06f363f73523f2467f4
3 changes: 3 additions & 0 deletions changelog.d/349.feat.md
@@ -0,0 +1,3 @@
Added support for continuous batching via vLLM

Current benchmarks show around 1218 TPS with 100 concurrent requests on a single A100 running meta-llama/Llama-2-13b-chat-hf
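As a rough illustration of how such a concurrency benchmark could be reproduced against an OpenLLM server started with the vLLM backend, here is a minimal sketch; the endpoint path, payload shape, and token accounting are assumptions for illustration and are not taken from this PR.

```python
# Minimal sketch of a concurrency benchmark against an OpenLLM server.
# Assumptions (not from this PR): the server listens on localhost:3000 and
# exposes a /v1/generate endpoint accepting {"prompt": ...}; adjust both to
# the actual API of your OpenLLM version.
import asyncio
import time

import aiohttp

URL = 'http://localhost:3000/v1/generate'  # assumed endpoint
PROMPT = 'Explain continuous batching in one paragraph.'
CONCURRENCY = 100


async def one_request(session: aiohttp.ClientSession) -> int:
  async with session.post(URL, json={'prompt': PROMPT}) as resp:
    data = await resp.json()
    # Token count is approximated by whitespace-splitting the returned text.
    text = data.get('responses', [''])[0] if isinstance(data, dict) else str(data)
    return len(text.split())


async def main() -> None:
  async with aiohttp.ClientSession() as session:
    start = time.perf_counter()
    tokens = await asyncio.gather(*(one_request(session) for _ in range(CONCURRENCY)))
    elapsed = time.perf_counter() - start
    print(f'{sum(tokens) / elapsed:.1f} tokens/s across {CONCURRENCY} concurrent requests')


if __name__ == '__main__':
  asyncio.run(main())
```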
14 changes: 7 additions & 7 deletions openllm-python/src/openllm/_llm.py
@@ -1172,30 +1172,30 @@ def set_adapter(__self: _Runnable, adapter_name: str) -> None:
if adapter_name != 'default': self.model.set_adapter(adapter_name)
logger.info('Successfully applied LoRA layer %s', adapter_name)

-@bentoml.Runnable.method(**method_signature(embeddings_sig))
+@bentoml.Runnable.method(**method_signature(embeddings_sig)) # type: ignore
def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[EmbeddingsOutput]:
return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)]

-@bentoml.Runnable.method(**method_signature(generate_sig))
+@bentoml.Runnable.method(**method_signature(generate_sig)) # type: ignore
def __call__(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
adapter_name = attrs.pop('adapter_name', None)
if adapter_name is not None: __self.set_adapter(adapter_name)
return self.generate(prompt, **attrs)

-@bentoml.Runnable.method(**method_signature(generate_sig))
+@bentoml.Runnable.method(**method_signature(generate_sig)) # type: ignore
def generate(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
adapter_name = attrs.pop('adapter_name', None)
if adapter_name is not None: __self.set_adapter(adapter_name)
if __self.backend == 'vllm': attrs.setdefault('request_id', openllm_core.utils.gen_random_uuid())
return self.generate(prompt, **attrs)

-@bentoml.Runnable.method(**method_signature(generate_sig))
+@bentoml.Runnable.method(**method_signature(generate_sig)) # type: ignore
def generate_one(__self: _Runnable, prompt: str, stop: list[str], **attrs: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]:
adapter_name = attrs.pop('adapter_name', None)
if adapter_name is not None: __self.set_adapter(adapter_name)
return self.generate_one(prompt, stop, **attrs)

-@bentoml.Runnable.method(**method_signature(generate_iterator_sig))
+@bentoml.Runnable.method(**method_signature(generate_iterator_sig)) # type: ignore
def generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.Generator[str, None, str]:
adapter_name = attrs.pop('adapter_name', None)
if adapter_name is not None: __self.set_adapter(adapter_name)
@@ -1209,8 +1209,8 @@ def generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.Generator[str, None, str]:
yield ' '.join(output_text[pre:]) + ' '
return ' '.join(output_text) + ' '

-@bentoml.Runnable.method(**method_signature(generate_iterator_sig))
-async def vllm_generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.AsyncGenerator[bytes, None]:
+@bentoml.Runnable.method(**method_signature(generate_iterator_sig)) # type: ignore
+async def vllm_generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.AsyncGenerator[str, None]:
# TODO: System prompt support
pre = 0
prompt = process_prompt(prompt, None, False)
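For context on what the new vllm_generate_iterator path drives under the hood, here is a minimal sketch of how vLLM's continuous-batching async engine is typically consumed; it assumes the vLLM AsyncLLMEngine API of the 0.1/0.2-era releases and is not code from this PR.

```python
# Minimal sketch, not PR code: streaming tokens from vLLM's AsyncLLMEngine,
# which continuously batches concurrently submitted requests on the GPU.
import asyncio
import uuid

from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams

engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(model='meta-llama/Llama-2-13b-chat-hf'))


async def stream(prompt: str) -> None:
  params = SamplingParams(max_tokens=256, temperature=0.7)
  # Each request needs a unique id, mirroring gen_random_uuid() in the diff above.
  request_id = str(uuid.uuid4())
  pre = 0
  # engine.generate yields RequestOutput objects as new tokens are produced.
  async for request_output in engine.generate(prompt, params, request_id):
    text = request_output.outputs[0].text
    print(text[pre:], end='', flush=True)  # emit only the newly generated suffix
    pre = len(text)


asyncio.run(stream('What is continuous batching?'))
```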