diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ad59fb239..8297324dc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,7 +8,7 @@ default_language_version: python: python3.11 # NOTE: sync with .python-version-default repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: "v0.6.7" + rev: "v0.6.8" hooks: - id: ruff alias: r diff --git a/src/openllm/__main__.py b/src/openllm/__main__.py index 30f96582d..74029accb 100644 --- a/src/openllm/__main__.py +++ b/src/openllm/__main__.py @@ -9,14 +9,12 @@ import questionary import typer -from openllm.accelerator_spec import (DeploymentTarget, can_run, - get_local_machine_spec) +from openllm.accelerator_spec import DeploymentTarget, can_run, get_local_machine_spec from openllm.analytic import DO_NOT_TRACK, OpenLLMTyper from openllm.clean import app as clean_app from openllm.cloud import deploy as cloud_deploy from openllm.cloud import ensure_cloud_context, get_cloud_machine_spec -from openllm.common import (CHECKED, INTERACTIVE, VERBOSE_LEVEL, BentoInfo, - output) +from openllm.common import CHECKED, INTERACTIVE, VERBOSE_LEVEL, BentoInfo, output from openllm.local import run as local_run from openllm.local import serve as local_serve from openllm.model import app as model_app @@ -24,42 +22,35 @@ from openllm.repo import app as repo_app app = OpenLLMTyper( - help="`openllm hello` to get started. " - "OpenLLM is a CLI tool to manage and deploy open source LLMs and" - " get an OpenAI API compatible chat server in seconds." + help='`openllm hello` to get started. ' + 'OpenLLM is a CLI tool to manage and deploy open source LLMs and' + ' get an OpenAI API compatible chat server in seconds.' ) -app.add_typer(repo_app, name="repo") -app.add_typer(model_app, name="model") -app.add_typer(clean_app, name="clean") +app.add_typer(repo_app, name='repo') +app.add_typer(model_app, name='model') +app.add_typer(clean_app, name='clean') def _select_bento_name(models: list[BentoInfo], target: DeploymentTarget): from tabulate import tabulate options = [] - model_infos = [ - (model.repo.name, model.name, can_run(model, target)) for model in models - ] + model_infos = [(model.repo.name, model.name, can_run(model, target)) for model in models] model_name_groups = defaultdict(lambda: 0.0) for repo, name, score in model_infos: model_name_groups[repo, name] += score - table_data = [ - (name, repo, CHECKED if score > 0 else "") - for (repo, name), score in model_name_groups.items() - ] + table_data = [(name, repo, CHECKED if score > 0 else '') for (repo, name), score in model_name_groups.items()] if not table_data: - output("No model found", style="red") + output('No model found', style='red') raise typer.Exit(1) - table = tabulate(table_data, headers=["model", "repo", "locally runnable"]).split( - "\n" - ) - headers = f"{table[0]}\n {table[1]}" + table = tabulate(table_data, headers=['model', 'repo', 'locally runnable']).split('\n') + headers = f'{table[0]}\n {table[1]}' options.append(questionary.Separator(headers)) for table_data, table_line in zip(table_data, table[2:]): options.append(questionary.Choice(table_line, value=table_data[:2])) - selected = questionary.select("Select a model", options).ask() + selected = questionary.select('Select a model', options).ask() if selected is None: raise typer.Exit(1) return selected @@ -69,26 +60,24 @@ def _select_bento_version(models, target, bento_name, repo): from tabulate import tabulate model_infos = [ - [model, can_run(model, target)] - for model in models - if model.name == 
bento_name and model.repo.name == repo + [model, can_run(model, target)] for model in models if model.name == bento_name and model.repo.name == repo ] table_data = [ - [model.tag, CHECKED if score > 0 else ""] + [model.tag, CHECKED if score > 0 else ''] for model, score in model_infos if model.name == bento_name and model.repo.name == repo ] if not table_data: - output(f"No model found for {bento_name} in {repo}", style="red") + output(f'No model found for {bento_name} in {repo}', style='red') raise typer.Exit(1) - table = tabulate(table_data, headers=["version", "locally runnable"]).split("\n") + table = tabulate(table_data, headers=['version', 'locally runnable']).split('\n') options = [] - options.append(questionary.Separator(f"{table[0]}\n {table[1]}")) + options.append(questionary.Separator(f'{table[0]}\n {table[1]}')) for table_data, table_line in zip(model_infos, table[2:]): options.append(questionary.Choice(table_line, value=table_data)) - selected = questionary.select("Select a version", options).ask() + selected = questionary.select('Select a version', options).ask() if selected is None: raise typer.Exit(1) return selected @@ -100,7 +89,7 @@ def _select_target(bento, targets): options = [] targets.sort(key=lambda x: can_run(bento, x), reverse=True) if not targets: - output("No available instance type, check your bentocloud account", style="red") + output('No available instance type, check your bentocloud account', style='red') raise typer.Exit(1) table = tabulate( @@ -108,18 +97,18 @@ def _select_target(bento, targets): [ target.name, target.accelerators_repr, - f"${target.price}", - CHECKED if can_run(bento, target) else "insufficient res.", + f'${target.price}', + CHECKED if can_run(bento, target) else 'insufficient res.', ] for target in targets ], - headers=["instance type", "accelerator", "price/hr", "deployable"], - ).split("\n") - options.append(questionary.Separator(f"{table[0]}\n {table[1]}")) + headers=['instance type', 'accelerator', 'price/hr', 'deployable'], + ).split('\n') + options.append(questionary.Separator(f'{table[0]}\n {table[1]}')) for target, line in zip(targets, table[2:]): - options.append(questionary.Choice(f"{line}", value=target)) - selected = questionary.select("Select an instance type", options).ask() + options.append(questionary.Choice(f'{line}', value=target)) + selected = questionary.select('Select an instance type', options).ask() if selected is None: raise typer.Exit(1) return selected @@ -128,102 +117,84 @@ def _select_target(bento, targets): def _select_action(bento: BentoInfo, score): if score > 0: options = [ - questionary.Separator("Available actions"), - questionary.Choice( - "0. Run the model in terminal", value="run", shortcut_key="0" - ), - questionary.Separator(f" $ openllm run {bento}"), - questionary.Separator(" "), + questionary.Separator('Available actions'), + questionary.Choice('0. Run the model in terminal', value='run', shortcut_key='0'), + questionary.Separator(f' $ openllm run {bento}'), + questionary.Separator(' '), + questionary.Choice('1. Serve the model locally and get a chat server', value='serve', shortcut_key='1'), + questionary.Separator(f' $ openllm serve {bento}'), + questionary.Separator(' '), questionary.Choice( - "1. Serve the model locally and get a chat server", - value="serve", - shortcut_key="1", + '2. 
Deploy the model to bentocloud and get a scalable chat server', value='deploy', shortcut_key='2' ), - questionary.Separator(f" $ openllm serve {bento}"), - questionary.Separator(" "), - questionary.Choice( - "2. Deploy the model to bentocloud and get a scalable chat server", - value="deploy", - shortcut_key="2", - ), - questionary.Separator(f" $ openllm deploy {bento}"), + questionary.Separator(f' $ openllm deploy {bento}'), ] else: options = [ - questionary.Separator("Available actions"), + questionary.Separator('Available actions'), questionary.Choice( - "0. Run the model in terminal", - value="run", - disabled="insufficient res.", - shortcut_key="0", + '0. Run the model in terminal', value='run', disabled='insufficient res.', shortcut_key='0' ), - questionary.Separator(f" $ openllm run {bento}"), - questionary.Separator(" "), + questionary.Separator(f' $ openllm run {bento}'), + questionary.Separator(' '), questionary.Choice( - "1. Serve the model locally and get a chat server", - value="serve", - disabled="insufficient res.", - shortcut_key="1", + '1. Serve the model locally and get a chat server', + value='serve', + disabled='insufficient res.', + shortcut_key='1', ), - questionary.Separator(f" $ openllm serve {bento}"), - questionary.Separator(" "), + questionary.Separator(f' $ openllm serve {bento}'), + questionary.Separator(' '), questionary.Choice( - "2. Deploy the model to bentocloud and get a scalable chat server", - value="deploy", - shortcut_key="2", + '2. Deploy the model to bentocloud and get a scalable chat server', value='deploy', shortcut_key='2' ), - questionary.Separator(f" $ openllm deploy {bento}"), + questionary.Separator(f' $ openllm deploy {bento}'), ] - action = questionary.select("Select an action", options).ask() + action = questionary.select('Select an action', options).ask() if action is None: raise typer.Exit(1) - if action == "run": + if action == 'run': try: port = random.randint(30000, 40000) local_run(bento, port=port) finally: - output("\nUse this command to run the action again:", style="green") - output(f" $ openllm run {bento}", style="orange") - elif action == "serve": + output('\nUse this command to run the action again:', style='green') + output(f' $ openllm run {bento}', style='orange') + elif action == 'serve': try: local_serve(bento) finally: - output("\nUse this command to run the action again:", style="green") - output(f" $ openllm serve {bento}", style="orange") - elif action == "deploy": + output('\nUse this command to run the action again:', style='green') + output(f' $ openllm serve {bento}', style='orange') + elif action == 'deploy': ensure_cloud_context() targets = get_cloud_machine_spec() target = _select_target(bento, targets) try: cloud_deploy(bento, target) finally: - output("\nUse this command to run the action again:", style="green") - output( - f" $ openllm deploy {bento} --instance-type {target.name}", - style="orange", - ) + output('\nUse this command to run the action again:', style='green') + output(f' $ openllm deploy {bento} --instance-type {target.name}', style='orange') -@app.command(help="get started interactively") +@app.command(help='get started interactively') def hello(): INTERACTIVE.set(True) # VERBOSE_LEVEL.set(20) target = get_local_machine_spec() - output(f" Detected Platform: {target.platform}", style="green") + output(f' Detected Platform: {target.platform}', style='green') if target.accelerators: - output(" Detected Accelerators: ", style="green") + output(' Detected Accelerators: ', style='green') for a in 
target.accelerators: - output(f" - {a.model} {a.memory_size}GB", style="green") + output(f' - {a.model} {a.memory_size}GB', style='green') else: - output(" Detected Accelerators: None", style="yellow") + output(' Detected Accelerators: None', style='yellow') models = list_bento() if not models: - output( - "No model found, you probably need to update the model repo:", style="red" - ) - output(" $ openllm repo update", style="orange") + output('No model found, you probably need to update the model repo:', style='red') + output(' $ openllm repo update', style='orange') raise typer.Exit(1) bento_name, repo = _select_bento_name(models, target) @@ -231,12 +202,9 @@ def hello(): _select_action(bento, score) -@app.command(help="start an OpenAI API compatible chat server and chat in browser") +@app.command(help='start an OpenAI API compatible chat server and chat in browser') def serve( - model: Annotated[str, typer.Argument()] = "", - repo: Optional[str] = None, - port: int = 3000, - verbose: bool = False, + model: Annotated[str, typer.Argument()] = '', repo: Optional[str] = None, port: int = 3000, verbose: bool = False ): if verbose: VERBOSE_LEVEL.set(20) @@ -245,9 +213,9 @@ def serve( local_serve(bento, port=port) -@app.command(help="run the model and chat in terminal") +@app.command(help='run the model and chat in terminal') def run( - model: Annotated[str, typer.Argument()] = "", + model: Annotated[str, typer.Argument()] = '', repo: Optional[str] = None, port: Optional[int] = None, timeout: int = 600, @@ -262,11 +230,9 @@ def run( local_run(bento, port=port, timeout=timeout) -@app.command( - help="deploy an production-ready OpenAI API compatible chat server to bentocloud ($100 free credit)" -) +@app.command(help='deploy a production-ready OpenAI API compatible chat server to bentocloud ($100 free credit)') def deploy( - model: Annotated[str, typer.Argument()] = "", + model: Annotated[str, typer.Argument()] = '', instance_type: Optional[str] = None, repo: Optional[str] = None, verbose: bool = False, @@ -281,10 +247,10 @@ def deploy( targets = filter(lambda x: can_run(bento, x) > 0, targets) targets = sorted(targets, key=lambda x: can_run(bento, x), reverse=True) if not targets: - output("No available instance type, check your bentocloud account", style="red") + output('No available instance type, check your bentocloud account', style='red') raise typer.Exit(1) target = targets[0] - output(f"Recommended instance type: {target.name}", style="green") + output(f'Recommended instance type: {target.name}', style='green') cloud_deploy(bento, target) @@ -292,12 +258,9 @@ def deploy( def typer_callback( verbose: int = 0, do_not_track: bool = typer.Option( - False, - "--do-not-track", - help="Whether to disable usage tracking", - envvar=DO_NOT_TRACK, + False, '--do-not-track', help='Whether to disable usage tracking', envvar=DO_NOT_TRACK ), - version: bool = typer.Option(False, "--version", "-v", help="Show version"), + version: bool = typer.Option(False, '--version', '-v', help='Show version'), ): if verbose: VERBOSE_LEVEL.set(verbose) @@ -310,5 +273,5 @@ def typer_callback( os.environ[DO_NOT_TRACK] = str(True) -if __name__ == "__main__": +if __name__ == '__main__': app() diff --git a/src/openllm/accelerator_spec.py b/src/openllm/accelerator_spec.py index 5394f88ae..a6cfdae45 100644 --- a/src/openllm/accelerator_spec.py +++ b/src/openllm/accelerator_spec.py @@ -21,14 +21,14 @@ def __eq__(self, other): return self.memory_size == other.memory_size def __repr__(self): - return
f"{self.model}({self.memory_size}GB)" + return f'{self.model}({self.memory_size}GB)' class Resource(SimpleNamespace): cpu: int = 0 memory: float gpu: int = 0 - gpu_type: str = "" + gpu_type: str = '' def __hash__(self): # type: ignore return hash((self.cpu, self.memory, self.gpu, self.gpu_type)) @@ -38,49 +38,53 @@ def __bool__(self): ACCELERATOR_SPEC_DICT: dict[str, dict] = { - "nvidia-gtx-1650": {"model": "GTX 1650", "memory_size": 4.0}, - "nvidia-gtx-1060": {"model": "GTX 1060", "memory_size": 6.0}, - "nvidia-gtx-1080-ti": {"model": "GTX 1080 Ti", "memory_size": 11.0}, - "nvidia-rtx-3060": {"model": "RTX 3060", "memory_size": 12.0}, - "nvidia-rtx-3060-ti": {"model": "RTX 3060 Ti", "memory_size": 8.0}, - "nvidia-rtx-3070-ti": {"model": "RTX 3070 Ti", "memory_size": 8.0}, - "nvidia-rtx-3080": {"model": "RTX 3080", "memory_size": 10.0}, - "nvidia-rtx-3080-ti": {"model": "RTX 3080 Ti", "memory_size": 12.0}, - "nvidia-rtx-3090": {"model": "RTX 3090", "memory_size": 24.0}, - "nvidia-rtx-4070-ti": {"model": "RTX 4070 Ti", "memory_size": 12.0}, - "nvidia-tesla-p4": {"model": "P4", "memory_size": 8.0}, - "nvidia-tesla-p100": {"model": "P100", "memory_size": 16.0}, - "nvidia-tesla-k80": {"model": "K80", "memory_size": 12.0}, - "nvidia-tesla-t4": {"model": "T4", "memory_size": 16.0}, - "nvidia-tesla-v100": {"model": "V100", "memory_size": 16.0}, - "nvidia-l4": {"model": "L4", "memory_size": 24.0}, - "nvidia-tesla-l4": {"model": "L4", "memory_size": 24.0}, - "nvidia-tesla-a10g": {"model": "A10G", "memory_size": 24.0}, - "nvidia-a100-80g": {"model": "A100", "memory_size": 80.0}, - "nvidia-a100-80gb": {"model": "A100", "memory_size": 80.0}, - "nvidia-tesla-a100": {"model": "A100", "memory_size": 40.0}, + 'nvidia-gtx-1650': {'model': 'GTX 1650', 'memory_size': 4.0}, + 'nvidia-gtx-1060': {'model': 'GTX 1060', 'memory_size': 6.0}, + 'nvidia-gtx-1080-ti': {'model': 'GTX 1080 Ti', 'memory_size': 11.0}, + 'nvidia-rtx-3060': {'model': 'RTX 3060', 'memory_size': 12.0}, + 'nvidia-rtx-3060-ti': {'model': 'RTX 3060 Ti', 'memory_size': 8.0}, + 'nvidia-rtx-3070-ti': {'model': 'RTX 3070 Ti', 'memory_size': 8.0}, + 'nvidia-rtx-3080': {'model': 'RTX 3080', 'memory_size': 10.0}, + 'nvidia-rtx-3080-ti': {'model': 'RTX 3080 Ti', 'memory_size': 12.0}, + 'nvidia-rtx-3090': {'model': 'RTX 3090', 'memory_size': 24.0}, + 'nvidia-rtx-4070-ti': {'model': 'RTX 4070 Ti', 'memory_size': 12.0}, + 'nvidia-tesla-p4': {'model': 'P4', 'memory_size': 8.0}, + 'nvidia-tesla-p100': {'model': 'P100', 'memory_size': 16.0}, + 'nvidia-tesla-k80': {'model': 'K80', 'memory_size': 12.0}, + 'nvidia-tesla-t4': {'model': 'T4', 'memory_size': 16.0}, + 'nvidia-tesla-v100': {'model': 'V100', 'memory_size': 16.0}, + 'nvidia-l4': {'model': 'L4', 'memory_size': 24.0}, + 'nvidia-tesla-l4': {'model': 'L4', 'memory_size': 24.0}, + 'nvidia-tesla-a10g': {'model': 'A10G', 'memory_size': 24.0}, + 'nvidia-a100-80g': {'model': 'A100', 'memory_size': 80.0}, + 'nvidia-a100-80gb': {'model': 'A100', 'memory_size': 80.0}, + 'nvidia-tesla-a100': {'model': 'A100', 'memory_size': 40.0}, } -ACCELERATOR_SPECS: dict[str, Accelerator] = { - key: Accelerator(**value) for key, value in ACCELERATOR_SPEC_DICT.items() -} +ACCELERATOR_SPECS: dict[str, Accelerator] = {key: Accelerator(**value) for key, value in ACCELERATOR_SPEC_DICT.items()} @functools.lru_cache def get_local_machine_spec(): if psutil.MACOS: - return DeploymentTarget(accelerators=[], source="local", platform="macos") + return DeploymentTarget(accelerators=[], source='local', platform='macos') if psutil.WINDOWS: - 
platform = "windows" + platform = 'windows' elif psutil.LINUX: - platform = "linux" + platform = 'linux' else: - raise NotImplementedError("Unsupported platform") - - from pynvml import (nvmlDeviceGetCount, nvmlDeviceGetCudaComputeCapability, - nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, - nvmlDeviceGetName, nvmlInit, nvmlShutdown) + raise NotImplementedError('Unsupported platform') + + from pynvml import ( + nvmlDeviceGetCount, + nvmlDeviceGetCudaComputeCapability, + nvmlDeviceGetHandleByIndex, + nvmlDeviceGetMemoryInfo, + nvmlDeviceGetName, + nvmlInit, + nvmlShutdown, + ) try: nvmlInit() @@ -90,48 +94,37 @@ def get_local_machine_spec(): handle = nvmlDeviceGetHandleByIndex(i) name = nvmlDeviceGetName(handle) memory_info = nvmlDeviceGetMemoryInfo(handle) - accelerators.append( - Accelerator( - model=name, memory_size=math.ceil(int(memory_info.total) / 1024**3) - ) - ) + accelerators.append(Accelerator(model=name, memory_size=math.ceil(int(memory_info.total) / 1024**3))) compute_capability = nvmlDeviceGetCudaComputeCapability(handle) if compute_capability < (7, 5): output( - f"GPU {name} with compute capability {compute_capability} " - "may not be supported, 7.5 or higher is recommended. check " - "https://developer.nvidia.com/cuda-gpus for more information", - style="yellow", + f'GPU {name} with compute capability {compute_capability} ' + 'may not be supported, 7.5 or higher is recommended. check ' + 'https://developer.nvidia.com/cuda-gpus for more information', + style='yellow', ) nvmlShutdown() - return DeploymentTarget( - accelerators=accelerators, source="local", platform=platform - ) + return DeploymentTarget(accelerators=accelerators, source='local', platform=platform) except Exception as e: output( - "Failed to get local GPU info. Ensure nvidia driver is installed to enable local GPU deployment", - style="yellow", + 'Failed to get local GPU info. Ensure nvidia driver is installed to enable local GPU deployment', + style='yellow', ) - output(f"Error: {e}", style="red", level=20) - return DeploymentTarget(accelerators=[], source="local", platform=platform) + output(f'Error: {e}', style='red', level=20) + return DeploymentTarget(accelerators=[], source='local', platform=platform) @functools.lru_cache() -def can_run( - bento: typing.Union[Resource, BentoInfo], - target: typing.Optional[DeploymentTarget] = None, -) -> float: +def can_run(bento: typing.Union[Resource, BentoInfo], target: typing.Optional[DeploymentTarget] = None) -> float: """ Calculate if the bento can be deployed on the target. 
""" if target is None: target = get_local_machine_spec() - resource_spec = Resource( - **(bento.bento_yaml["services"][0]["config"].get("resources", {})) - ) - labels = bento.bento_yaml.get("labels", {}) - platforms = labels.get("platforms", "linux").split(",") + resource_spec = Resource(**(bento.bento_yaml['services'][0]['config'].get('resources', {}))) + labels = bento.bento_yaml.get('labels', {}) + platforms = labels.get('platforms', 'linux').split(',') if target.platform not in platforms: return 0.0 @@ -142,18 +135,10 @@ def can_run( if resource_spec.gpu > 0: required_gpu = ACCELERATOR_SPECS[resource_spec.gpu_type] - filtered_accelerators = [ - ac - for ac in target.accelerators - if ac.memory_size >= required_gpu.memory_size - ] + filtered_accelerators = [ac for ac in target.accelerators if ac.memory_size >= required_gpu.memory_size] if resource_spec.gpu > len(filtered_accelerators): return 0.0 - return ( - required_gpu.memory_size - * resource_spec.gpu - / sum(ac.memory_size for ac in target.accelerators) - ) + return required_gpu.memory_size * resource_spec.gpu / sum(ac.memory_size for ac in target.accelerators) if target.accelerators: return 0.01 / sum(ac.memory_size for ac in target.accelerators) return 1.0 diff --git a/src/openllm/analytic.py b/src/openllm/analytic.py index 716095eba..c5b898b3f 100644 --- a/src/openllm/analytic.py +++ b/src/openllm/analytic.py @@ -12,16 +12,16 @@ import typer import typer.core -DO_NOT_TRACK = "BENTOML_DO_NOT_TRACK" +DO_NOT_TRACK = 'BENTOML_DO_NOT_TRACK' class EventMeta(ABC): @property def event_name(self): # camel case to snake case - event_name = re.sub(r"(? typing.Iterable[str]: # type: igno class OpenLLMTyper(typer.Typer): def __init__(self, *args: typing.Any, **kwargs: typing.Any): - no_args_is_help = kwargs.pop("no_args_is_help", True) - context_settings = kwargs.pop("context_settings", {}) - if "help_option_names" not in context_settings: - context_settings["help_option_names"] = ("-h", "--help") - if "max_content_width" not in context_settings: - context_settings["max_content_width"] = int( - os.environ.get("COLUMNS", str(120)) - ) - klass = kwargs.pop("cls", OrderedCommands) + no_args_is_help = kwargs.pop('no_args_is_help', True) + context_settings = kwargs.pop('context_settings', {}) + if 'help_option_names' not in context_settings: + context_settings['help_option_names'] = ('-h', '--help') + if 'max_content_width' not in context_settings: + context_settings['max_content_width'] = int(os.environ.get('COLUMNS', str(120))) + klass = kwargs.pop('cls', OrderedCommands) super().__init__( - *args, - cls=klass, - no_args_is_help=no_args_is_help, - context_settings=context_settings, - **kwargs, + *args, cls=klass, no_args_is_help=no_args_is_help, context_settings=context_settings, **kwargs ) # NOTE: Since OpenLLMTyper only wraps command to add analytics, the default type-hint for @app.command @@ -79,9 +73,7 @@ def decorator(f): def wrapped(ctx: click.Context, *args, **kwargs): from bentoml._internal.utils.analytics import track - do_not_track = ( - os.environ.get(DO_NOT_TRACK, str(False)).lower() == "true" - ) + do_not_track = os.environ.get(DO_NOT_TRACK, str(False)).lower() == 'true' # so we know that the root program is openllm command_name = ctx.info_name @@ -90,7 +82,7 @@ def wrapped(ctx: click.Context, *args, **kwargs): command_group = ctx.parent.info_name elif ctx.parent.info_name == ctx.find_root().info_name: # openllm run - command_group = "openllm" + command_group = 'openllm' if do_not_track: return f(*args, **kwargs) @@ -100,9 +92,7 @@ 
def wrapped(ctx: click.Context, *args, **kwargs): duration_in_ns = time.time_ns() - start_time track( OpenllmCliEvent( - cmd_group=command_group, - cmd_name=command_name, - duration_in_ms=duration_in_ns / 1e6, + cmd_group=command_group, cmd_name=command_name, duration_in_ms=duration_in_ns / 1e6 ) ) return return_value @@ -114,9 +104,7 @@ def wrapped(ctx: click.Context, *args, **kwargs): cmd_name=command_name, duration_in_ms=duration_in_ns / 1e6, error_type=type(e).__name__, - return_code=( - 2 if isinstance(e, KeyboardInterrupt) else 1 - ), + return_code=(2 if isinstance(e, KeyboardInterrupt) else 1), ) ) raise diff --git a/src/openllm/clean.py b/src/openllm/clean.py index ebe0afeeb..a5aa11ea7 100644 --- a/src/openllm/clean.py +++ b/src/openllm/clean.py @@ -5,20 +5,19 @@ import questionary from openllm.analytic import OpenLLMTyper -from openllm.common import (CONFIG_FILE, REPO_DIR, VENV_DIR, VERBOSE_LEVEL, - output) +from openllm.common import CONFIG_FILE, REPO_DIR, VENV_DIR, VERBOSE_LEVEL, output -app = OpenLLMTyper(help="clean up and release disk space used by OpenLLM") +app = OpenLLMTyper(help='clean up and release disk space used by OpenLLM') -HUGGINGFACE_CACHE = pathlib.Path.home() / ".cache" / "huggingface" / "hub" +HUGGINGFACE_CACHE = pathlib.Path.home() / '.cache' / 'huggingface' / 'hub' def _du(path: pathlib.Path) -> int: seen_paths = set() used_space = 0 - for f in path.rglob("*"): - if os.name == "nt": # Windows system + for f in path.rglob('*'): + if os.name == 'nt': # Windows system # On Windows, directly add file sizes without considering hard links used_space += f.stat().st_size else: @@ -30,52 +29,52 @@ def _du(path: pathlib.Path) -> int: return used_space -@app.command(help="Clean up all the cached models from huggingface") +@app.command(help='Clean up all the cached models from huggingface') def model_cache(verbose: bool = False): if verbose: VERBOSE_LEVEL.set(20) used_space = _du(HUGGINGFACE_CACHE) sure = questionary.confirm( - f"This will remove all models cached by Huggingface (~{used_space / 1024 / 1024:.2f}MB), are you sure?" + f'This will remove all models cached by Huggingface (~{used_space / 1024 / 1024:.2f}MB), are you sure?' ).ask() if not sure: return shutil.rmtree(HUGGINGFACE_CACHE, ignore_errors=True) - output("All models cached by Huggingface have been removed", style="green") + output('All models cached by Huggingface have been removed', style='green') -@app.command(help="Clean up all the virtual environments created by OpenLLM") +@app.command(help='Clean up all the virtual environments created by OpenLLM') def venvs(verbose: bool = False): if verbose: VERBOSE_LEVEL.set(20) used_space = _du(VENV_DIR) sure = questionary.confirm( - f"This will remove all virtual environments created by OpenLLM (~{used_space / 1024 / 1024:.2f}MB), are you sure?" + f'This will remove all virtual environments created by OpenLLM (~{used_space / 1024 / 1024:.2f}MB), are you sure?' 
).ask() if not sure: return shutil.rmtree(VENV_DIR, ignore_errors=True) - output("All virtual environments have been removed", style="green") + output('All virtual environments have been removed', style='green') -@app.command(help="Clean up all the repositories cloned by OpenLLM") +@app.command(help='Clean up all the repositories cloned by OpenLLM') def repos(verbose: bool = False): if verbose: VERBOSE_LEVEL.set(20) shutil.rmtree(REPO_DIR, ignore_errors=True) - output("All repositories have been removed", style="green") + output('All repositories have been removed', style='green') -@app.command(help="Reset configurations to default") +@app.command(help='Reset configurations to default') def configs(verbose: bool = False): if verbose: VERBOSE_LEVEL.set(20) shutil.rmtree(CONFIG_FILE, ignore_errors=True) - output("All configurations have been reset", style="green") + output('All configurations have been reset', style='green') -@app.command(name="all", help="Clean up all above and bring OpenLLM to a fresh start") +@app.command(name='all', help='Clean up all above and bring OpenLLM to a fresh start') def all_cache(verbose: bool = False): if verbose: VERBOSE_LEVEL.set(20) diff --git a/src/openllm/cloud.py b/src/openllm/cloud.py index 8ad9869d8..83e8363bf 100644 --- a/src/openllm/cloud.py +++ b/src/openllm/cloud.py @@ -9,66 +9,61 @@ from openllm.accelerator_spec import ACCELERATOR_SPECS from openllm.analytic import OpenLLMTyper -from openllm.common import (INTERACTIVE, BentoInfo, DeploymentTarget, output, - run_command) +from openllm.common import INTERACTIVE, BentoInfo, DeploymentTarget, output, run_command app = OpenLLMTyper() def resolve_cloud_config() -> pathlib.Path: - env = os.environ.get("BENTOML_HOME") + env = os.environ.get('BENTOML_HOME') if env is not None: - return pathlib.Path(env) / ".yatai.yaml" - return pathlib.Path.home() / "bentoml" / ".yatai.yaml" + return pathlib.Path(env) / '.yatai.yaml' + return pathlib.Path.home() / 'bentoml' / '.yatai.yaml' def _get_deploy_cmd(bento: BentoInfo, target: typing.Optional[DeploymentTarget] = None): - cmd = ["bentoml", "deploy", bento.bentoml_tag] - env = {"BENTOML_HOME": f"{bento.repo.path}/bentoml"} + cmd = ['bentoml', 'deploy', bento.bentoml_tag] + env = {'BENTOML_HOME': f'{bento.repo.path}/bentoml'} - required_envs = bento.bento_yaml.get("envs", []) - required_env_names = [env["name"] for env in required_envs if "name" in env] + required_envs = bento.bento_yaml.get('envs', []) + required_env_names = [env['name'] for env in required_envs if 'name' in env] if required_env_names: output( - f"This model requires the following environment variables to run: {required_env_names!r}", - style="yellow", + f'This model requires the following environment variables to run: {required_env_names!r}', style='yellow' ) - for env_info in bento.bento_yaml.get("envs", []): - if "name" not in env_info: + for env_info in bento.bento_yaml.get('envs', []): + if 'name' not in env_info: continue - if os.environ.get(env_info["name"]): - default = os.environ[env_info["name"]] - elif "value" in env_info: - default = env_info["value"] + if os.environ.get(env_info['name']): + default = os.environ[env_info['name']] + elif 'value' in env_info: + default = env_info['value'] else: - default = "" + default = '' if INTERACTIVE.get(): import questionary value = questionary.text(f"{env_info['name']}:", default=default).ask() else: - if default == "": - output( - f"Environment variable {env_info['name']} is required but not provided", - style="red", - ) + if default == '': + 
output(f"Environment variable {env_info['name']} is required but not provided", style='red') raise typer.Exit(1) else: value = default if value is None: raise typer.Exit(1) - cmd += ["--env", f"{env_info['name']}={value}"] + cmd += ['--env', f"{env_info['name']}={value}"] if target: - cmd += ["--instance-type", target.name] + cmd += ['--instance-type', target.name] base_config = resolve_cloud_config() if not base_config.exists(): - raise Exception("Cannot find cloud config.") - shutil.copy(base_config, bento.repo.path / "bentoml" / ".yatai.yaml") + raise Exception('Cannot find cloud config.') + shutil.copy(base_config, bento.repo.path / 'bentoml' / '.yatai.yaml') return cmd, env, None @@ -76,90 +71,67 @@ def _get_deploy_cmd(bento: BentoInfo, target: typing.Optional[DeploymentTarget] def ensure_cloud_context(): import questionary - cmd = ["bentoml", "cloud", "current-context"] + cmd = ['bentoml', 'cloud', 'current-context'] try: result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL) context = json.loads(result) - output( - f" bentoml already logged in: {context['endpoint']}", - style="green", - level=20, - ) + output(f" bentoml already logged in: {context['endpoint']}", style='green', level=20) except subprocess.CalledProcessError: - output(" bentoml not logged in", style="red") + output(' bentoml not logged in', style='red') if not INTERACTIVE.get(): - output("\n get bentoml logged in by:") - output(" $ bentoml cloud login", style="orange") - output("") + output('\n get bentoml logged in by:') + output(' $ bentoml cloud login', style='orange') + output('') output( """ * you may need to visit https://cloud.bentoml.com to get an account. you can also bring your own bentoml cluster (BYOC) to your team from https://bentoml.com/contact""", - style="yellow", + style='yellow', ) raise typer.Exit(1) else: action = questionary.select( - "Choose an action:", - choices=[ - "I have a BentoCloud account", - "get an account in two minutes", - ], + 'Choose an action:', choices=['I have a BentoCloud account', 'get an account in two minutes'] ).ask() if action is None: raise typer.Exit(1) - elif action == "get an account in two minutes": - output( - "Please visit https://cloud.bentoml.com to get your token", - style="yellow", - ) - endpoint = questionary.text( - "Enter the endpoint: (similar to https://my-org.cloud.bentoml.com)" - ).ask() + elif action == 'get an account in two minutes': + output('Please visit https://cloud.bentoml.com to get your token', style='yellow') + endpoint = questionary.text('Enter the endpoint: (similar to https://my-org.cloud.bentoml.com)').ask() if endpoint is None: raise typer.Exit(1) - token = questionary.text( - "Enter your token: (similar to cniluaxxxxxxxx)" - ).ask() + token = questionary.text('Enter your token: (similar to cniluaxxxxxxxx)').ask() if token is None: raise typer.Exit(1) - cmd = [ - "bentoml", - "cloud", - "login", - "--api-token", - token, - "--endpoint", - endpoint, - ] + cmd = ['bentoml', 'cloud', 'login', '--api-token', token, '--endpoint', endpoint] try: result = subprocess.check_output(cmd) - output(" Logged in successfully", style="green") + output(' Logged in successfully', style='green') except subprocess.CalledProcessError: - output(" Failed to login", style="red") + output(' Failed to login', style='red') raise typer.Exit(1) def get_cloud_machine_spec() -> list[DeploymentTarget]: ensure_cloud_context() - cmd = ["bentoml", "deployment", "list-instance-types", "-o", "json"] + cmd = ['bentoml', 'deployment', 'list-instance-types', '-o', 
'json'] try: result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL) instance_types = json.loads(result) return [ DeploymentTarget( - source="cloud", - name=it["name"], - price=it["price"], - platform="linux", + source='cloud', + name=it['name'], + price=it['price'], + platform='linux', accelerators=( - [ACCELERATOR_SPECS[it["gpu_type"]] for _ in range(int(it["gpu"]))] - if it.get("gpu") and it["gpu_type"] in ACCELERATOR_SPECS + [ACCELERATOR_SPECS[it['gpu_type']] for _ in range(int(it['gpu']))] + if it.get('gpu') and it['gpu_type'] in ACCELERATOR_SPECS else [] ), ) for it in instance_types ] except (subprocess.CalledProcessError, json.JSONDecodeError): - output("Failed to get cloud instance types", style="red") + output('Failed to get cloud instance types', style='red') return [] diff --git a/src/openllm/common.py b/src/openllm/common.py index 2bb852991..a575cf831 100644 --- a/src/openllm/common.py +++ b/src/openllm/common.py @@ -19,23 +19,23 @@ import typer import typer.core -ERROR_STYLE = "red" -SUCCESS_STYLE = "green" +ERROR_STYLE = 'red' +SUCCESS_STYLE = 'green' -OPENLLM_HOME = pathlib.Path(os.getenv("OPENLLM_HOME", pathlib.Path.home() / ".openllm")) -REPO_DIR = OPENLLM_HOME / "repos" -TEMP_DIR = OPENLLM_HOME / "temp" -VENV_DIR = OPENLLM_HOME / "venv" +OPENLLM_HOME = pathlib.Path(os.getenv('OPENLLM_HOME', pathlib.Path.home() / '.openllm')) +REPO_DIR = OPENLLM_HOME / 'repos' +TEMP_DIR = OPENLLM_HOME / 'temp' +VENV_DIR = OPENLLM_HOME / 'venv' REPO_DIR.mkdir(exist_ok=True, parents=True) TEMP_DIR.mkdir(exist_ok=True, parents=True) VENV_DIR.mkdir(exist_ok=True, parents=True) -CONFIG_FILE = OPENLLM_HOME / "config.json" +CONFIG_FILE = OPENLLM_HOME / 'config.json' -CHECKED = "Yes" +CHECKED = 'Yes' -T = typing.TypeVar("T") +T = typing.TypeVar('T') class ContextVar(typing.Generic[T]): @@ -75,18 +75,16 @@ def output(content, level=0, style=None, end=None): out = io.StringIO() pyaml.pprint(content, dst=out, sort_dicts=False, sort_keys=False) - questionary.print(out.getvalue(), style=style, end="" if end is None else end) + questionary.print(out.getvalue(), style=style, end='' if end is None else end) out.close() if isinstance(content, str): - questionary.print(content, style=style, end="\n" if end is None else end) + questionary.print(content, style=style, end='\n' if end is None else end) class Config(SimpleNamespace): - repos: dict[str, str] = { - "default": "https://github.com/bentoml/openllm-models@main" - } - default_repo: str = "default" + repos: dict[str, str] = {'default': 'https://github.com/bentoml/openllm-models@main'} + default_repo: str = 'default' def tolist(self): return dict(repos=self.repos, default_repo=self.default_repo) @@ -103,7 +101,7 @@ def load_config() -> Config: def save_config(config: Config) -> None: - with open(CONFIG_FILE, "w") as f: + with open(CONFIG_FILE, 'w') as f: json.dump(config.tolist(), f, indent=2) @@ -131,15 +129,13 @@ class RepoInfo(SimpleNamespace): def tolist(self): if VERBOSE_LEVEL.get() <= 0: - return f"{self.name} ({self.url}@{self.branch})" + return f'{self.name} ({self.url}@{self.branch})' if VERBOSE_LEVEL.get() <= 10: - return dict( - name=self.name, url=f"{self.url}@{self.branch}", path=str(self.path) - ) + return dict(name=self.name, url=f'{self.url}@{self.branch}', path=str(self.path)) if VERBOSE_LEVEL.get() <= 20: return dict( name=self.name, - url=f"{self.url}@{self.branch}", + url=f'{self.url}@{self.branch}', path=str(self.path), server=self.server, owner=self.owner, @@ -150,13 +146,13 @@ def tolist(self): class 
BentoInfo(SimpleNamespace): repo: RepoInfo path: pathlib.Path - alias: str = "" + alias: str = '' def __str__(self): - if self.repo.name == "default": - return f"{self.tag}" + if self.repo.name == 'default': + return f'{self.tag}' else: - return f"{self.repo.name}/{self.tag}" + return f'{self.repo.name}/{self.tag}' def __hash__(self): # type: ignore return md5(str(self.path)) @@ -164,12 +160,12 @@ def __hash__(self): # type: ignore @property def tag(self) -> str: if self.alias: - return f"{self.path.parent.name}:{self.alias}" - return f"{self.path.parent.name}:{self.path.name}" + return f'{self.path.parent.name}:{self.alias}' + return f'{self.path.parent.name}:{self.path.name}' @property def bentoml_tag(self) -> str: - return f"{self.path.parent.name}:{self.path.name}" + return f'{self.path.parent.name}:{self.path.name}' @property def name(self) -> str: @@ -181,42 +177,40 @@ def version(self) -> str: @property def labels(self) -> dict[str, str]: - return self.bento_yaml["labels"] + return self.bento_yaml['labels'] @property def envs(self) -> list[dict[str, str]]: - return self.bento_yaml["envs"] + return self.bento_yaml['envs'] @functools.cached_property def bento_yaml(self) -> dict: import yaml - bento_file = self.path / "bento.yaml" + bento_file = self.path / 'bento.yaml' return yaml.safe_load(bento_file.read_text()) @functools.cached_property def platforms(self) -> list[str]: - return self.bento_yaml["labels"].get("platforms", "linux").split(",") + return self.bento_yaml['labels'].get('platforms', 'linux').split(',') @functools.cached_property def pretty_yaml(self) -> dict: def _pretty_routes(routes): return { - route["route"]: { - "input": { - k: v["type"] for k, v in route["input"]["properties"].items() - }, - "output": route["output"]["type"], + route['route']: { + 'input': {k: v['type'] for k, v in route['input']['properties'].items()}, + 'output': route['output']['type'], } for route in routes } - if len(self.bento_yaml["services"]) == 1: + if len(self.bento_yaml['services']) == 1: pretty_yaml = { - "apis": _pretty_routes(self.bento_yaml["schema"]["routes"]), - "resources": self.bento_yaml["services"][0]["config"]["resources"], - "envs": self.bento_yaml["envs"], - "platforms": self.platforms, + 'apis': _pretty_routes(self.bento_yaml['schema']['routes']), + 'resources': self.bento_yaml['services'][0]['config']['resources'], + 'envs': self.bento_yaml['envs'], + 'platforms': self.platforms, } return pretty_yaml return self.bento_yaml @@ -226,41 +220,31 @@ def pretty_gpu(self) -> str: from openllm.accelerator_spec import ACCELERATOR_SPECS try: - resources = self.bento_yaml["services"][0]["config"]["resources"] - if resources["gpu"] > 1: - acc = ACCELERATOR_SPECS[resources["gpu_type"]] + resources = self.bento_yaml['services'][0]['config']['resources'] + if resources['gpu'] > 1: + acc = ACCELERATOR_SPECS[resources['gpu_type']] return f"{acc.memory_size:.0f}Gx{resources['gpu']}" - elif resources["gpu"] > 0: - acc = ACCELERATOR_SPECS[resources["gpu_type"]] - return f"{acc.memory_size:.0f}G" + elif resources['gpu'] > 0: + acc = ACCELERATOR_SPECS[resources['gpu_type']] + return f'{acc.memory_size:.0f}G' except KeyError: pass - return "" + return '' def tolist(self): verbose = VERBOSE_LEVEL.get() if verbose <= 0: return str(self) if verbose <= 10: - return dict( - tag=self.tag, - repo=self.repo.tolist(), - path=str(self.path), - model_card=self.pretty_yaml, - ) + return dict(tag=self.tag, repo=self.repo.tolist(), path=str(self.path), model_card=self.pretty_yaml) if verbose <= 20: - return 
dict( - tag=self.tag, - repo=self.repo.tolist(), - path=str(self.path), - bento_yaml=self.bento_yaml, - ) + return dict(tag=self.tag, repo=self.repo.tolist(), path=str(self.path), bento_yaml=self.bento_yaml) class VenvSpec(SimpleNamespace): python_version: str requirements_txt: str - name_prefix = "" + name_prefix = '' envs: EnvVars @functools.cached_property @@ -272,27 +256,23 @@ def normalized_requirements_txt(self) -> str: for line in self.requirements_txt.splitlines(): if not line.strip(): continue - elif line.strip().startswith("#"): + elif line.strip().startswith('#'): comment_lines.append(line.strip()) - elif line.strip().startswith("-"): + elif line.strip().startswith('-'): parameter_lines.append(line.strip()) else: dependency_lines.append(line.strip()) parameter_lines.sort() dependency_lines.sort() - return "\n".join(parameter_lines + dependency_lines).strip() + return '\n'.join(parameter_lines + dependency_lines).strip() @functools.cached_property def normalized_envs(self) -> str: """ sorted by name """ - return "\n".join( - f"{k}={v}" - for k, v in sorted(self.envs.items(), key=lambda x: x[0]) - if not v - ) + return '\n'.join(f'{k}={v}' for k, v in sorted(self.envs.items(), key=lambda x: x[0]) if not v) def __hash__(self): # type: ignore return md5( @@ -314,10 +294,10 @@ def __eq__(self, other): class DeploymentTarget(SimpleNamespace): - source: str = "local" - name: str = "local" - price: str = "" - platform = "linux" + source: str = 'local' + name: str = 'local' + price: str = '' + platform = 'linux' accelerators: list[Accelerator] def __hash__(self): # type: ignore @@ -327,31 +307,29 @@ def __hash__(self): # type: ignore def accelerators_repr(self) -> str: accs = {a.model for a in self.accelerators} if len(accs) == 0: - return "null" + return 'null' if len(accs) == 1: a = self.accelerators[0] - return f"{a.model} x{len(self.accelerators)}" - return ", ".join((f"{a.model}" for a in self.accelerators)) + return f'{a.model} x{len(self.accelerators)}' + return ', '.join((f'{a.model}' for a in self.accelerators)) -def run_command( - cmd, cwd=None, env=None, copy_env=True, venv=None, silent=False -) -> subprocess.CompletedProcess: +def run_command(cmd, cwd=None, env=None, copy_env=True, venv=None, silent=False) -> subprocess.CompletedProcess: import shlex env = env or {} cmd = [str(c) for c in cmd] - bin_dir = "Scripts" if os.name == "nt" else "bin" + bin_dir = 'Scripts' if os.name == 'nt' else 'bin' if not silent: - output("\n") + output('\n') if cwd: - output(f"$ cd {cwd}", style="orange") + output(f'$ cd {cwd}', style='orange') if env: for k, v in env.items(): - output(f"$ export {k}={shlex.quote(v)}", style="orange") + output(f'$ export {k}={shlex.quote(v)}', style='orange') if venv: - output(f"$ source {venv / 'bin' / 'activate'}", style="orange") - output(f"$ {' '.join(cmd)}", style="orange") + output(f"$ source {venv / 'bin' / 'activate'}", style='orange') + output(f"$ {' '.join(cmd)}", style='orange') if venv: py = venv / bin_dir / f"python{sysconfig.get_config_var('EXE')}" @@ -361,80 +339,69 @@ def run_command( if copy_env: env = {**os.environ, **env} - if cmd and cmd[0] == "bentoml": - cmd = [py, "-m", "bentoml"] + cmd[1:] - if cmd and cmd[0] == "python": + if cmd and cmd[0] == 'bentoml': + cmd = [py, '-m', 'bentoml'] + cmd[1:] + if cmd and cmd[0] == 'python': cmd = [py] + cmd[1:] try: if silent: return subprocess.run( # type: ignore - cmd, - cwd=cwd, - env=env, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - check=True, + cmd, cwd=cwd, env=env, 
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True ) else: return subprocess.run(cmd, cwd=cwd, env=env, check=True) except Exception as e: if VERBOSE_LEVEL.get() >= 10: - output(e, style="red") - output("Command failed", style="red") + output(e, style='red') + output('Command failed', style='red') raise typer.Exit(1) -async def stream_command_output(stream, style="gray"): +async def stream_command_output(stream, style='gray'): async for line in stream: - output(line.decode(), style=style, end="") + output(line.decode(), style=style, end='') @asynccontextmanager -async def async_run_command( - cmd, cwd=None, env=None, copy_env=True, venv=None, silent=True -): +async def async_run_command(cmd, cwd=None, env=None, copy_env=True, venv=None, silent=True): import shlex env = env or {} cmd = [str(c) for c in cmd] if not silent: - output("\n") + output('\n') if cwd: - output(f"$ cd {cwd}", style="orange") + output(f'$ cd {cwd}', style='orange') if env: for k, v in env.items(): - output(f"$ export {k}={shlex.quote(v)}", style="orange") + output(f'$ export {k}={shlex.quote(v)}', style='orange') if venv: - output(f"$ source {venv / 'bin' / 'activate'}", style="orange") - output(f"$ {' '.join(cmd)}", style="orange") + output(f"$ source {venv / 'bin' / 'activate'}", style='orange') + output(f"$ {' '.join(cmd)}", style='orange') if venv: - py = venv / "bin" / "python" + py = venv / 'bin' / 'python' else: py = sys.executable if copy_env: env = {**os.environ, **env} - if cmd and cmd[0] == "bentoml": - cmd = [py, "-m", "bentoml"] + cmd[1:] - if cmd and cmd[0] == "python": + if cmd and cmd[0] == 'bentoml': + cmd = [py, '-m', 'bentoml'] + cmd[1:] + if cmd and cmd[0] == 'python': cmd = [py] + cmd[1:] proc = None try: proc = await asyncio.create_subprocess_shell( - " ".join(map(str, cmd)), - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - cwd=cwd, - env=env, + ' '.join(map(str, cmd)), stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, cwd=cwd, env=env ) yield proc except subprocess.CalledProcessError: - output("Command failed", style="red") + output('Command failed', style='red') raise typer.Exit(1) finally: if proc: diff --git a/src/openllm/local.py b/src/openllm/local.py index bb1655c17..e474cc105 100644 --- a/src/openllm/local.py +++ b/src/openllm/local.py @@ -4,8 +4,7 @@ import httpx -from openllm.common import (BentoInfo, EnvVars, async_run_command, output, - run_command, stream_command_output) +from openllm.common import BentoInfo, EnvVars, async_run_command, output, run_command, stream_command_output from openllm.venv import ensure_venv @@ -14,20 +13,18 @@ def prep_env_vars(bento: BentoInfo): env_vars = bento.envs for env_var in env_vars: - if "value" not in env_var: + if 'value' not in env_var: continue - key = env_var["name"] - value = env_var["value"] + key = env_var['name'] + value = env_var['value'] os.environ[key] = value -def _get_serve_cmd( - bento: BentoInfo, port: int = 3000 -) -> tuple[list[str], EnvVars, Optional[str]]: - cmd = ["bentoml", "serve", bento.bentoml_tag] +def _get_serve_cmd(bento: BentoInfo, port: int = 3000) -> tuple[list[str], EnvVars, Optional[str]]: + cmd = ['bentoml', 'serve', bento.bentoml_tag] if port != 3000: - cmd += ["--port", str(port)] - env = EnvVars({"BENTOML_HOME": f"{bento.repo.path}/bentoml"}) + cmd += ['--port', str(port)] + env = EnvVars({'BENTOML_HOME': f'{bento.repo.path}/bentoml'}) return cmd, env, None @@ -35,41 +32,37 @@ def serve(bento: BentoInfo, port: int = 3000): prep_env_vars(bento) cmd, env, cwd = 
_get_serve_cmd(bento, port=port) venv = ensure_venv(bento, runtime_envs=env) - output(f"Access the Chat UI at http://localhost:{port}/chat (or with you IP)") + output(f'Access the Chat UI at http://localhost:{port}/chat (or with your IP)') run_command(cmd, env=env, cwd=cwd, venv=venv) async def _run_model(bento: BentoInfo, port: int = 3000, timeout: int = 600): cmd, env, cwd = _get_serve_cmd(bento, port) venv = ensure_venv(bento, runtime_envs=env) - async with async_run_command( - cmd, env=env, cwd=cwd, venv=venv, silent=False - ) as server_proc: - output(f"Model server started {server_proc.pid}") + async with async_run_command(cmd, env=env, cwd=cwd, venv=venv, silent=False) as server_proc: + output(f'Model server started {server_proc.pid}') stdout_streamer = None stderr_streamer = None start_time = time.time() - output("Model loading...", style="green") + output('Model loading...', style='green') for _ in range(timeout): try: - resp = httpx.get(f"http://localhost:{port}/readyz", timeout=3) + resp = httpx.get(f'http://localhost:{port}/readyz', timeout=3) if resp.status_code == 200: break except httpx.RequestError: if time.time() - start_time > 30: if not stdout_streamer: - stdout_streamer = asyncio.create_task( - stream_command_output(server_proc.stdout, style="gray") - ) + stdout_streamer = asyncio.create_task(stream_command_output(server_proc.stdout, style='gray')) if not stderr_streamer: stderr_streamer = asyncio.create_task( - stream_command_output(server_proc.stderr, style="#BD2D0F") + stream_command_output(server_proc.stderr, style='#BD2D0F') ) await asyncio.sleep(1) else: - output("Model failed to load", style="red") + output('Model failed to load', style='red') server_proc.terminate() return @@ -78,37 +71,37 @@ async def _run_model(bento: BentoInfo, port: int = 3000, timeout: int = 600): if stderr_streamer: stderr_streamer.cancel() - output("Model is ready", style="green") + output('Model is ready', style='green') messages: list[dict[str, str]] = [] from openai import AsyncOpenAI - client = AsyncOpenAI(base_url=f"http://localhost:{port}/v1", api_key="local") + client = AsyncOpenAI(base_url=f'http://localhost:{port}/v1', api_key='local') model_id = (await client.models.list()).data[0].id while True: try: - message = input("user: ") - if message == "": - output("empty message, please enter something", style="yellow") + message = input('user: ') + if message == '': + output('empty message, please enter something', style='yellow') continue - messages.append(dict(role="user", content=message)) - output("assistant: ", end="", style="lightgreen") - assistant_message = "" + messages.append(dict(role='user', content=message)) + output('assistant: ', end='', style='lightgreen') + assistant_message = '' stream = await client.chat.completions.create( model=model_id, messages=messages, # type: ignore stream=True, ) async for chunk in stream: - text = chunk.choices[0].delta.content or "" + text = chunk.choices[0].delta.content or '' assistant_message += text - output(text, end="", style="lightgreen") - messages.append(dict(role="assistant", content=assistant_message)) - output("") + output(text, end='', style='lightgreen') + messages.append(dict(role='assistant', content=assistant_message)) + output('') except KeyboardInterrupt: break - output("\nStopping model server...", style="green") - output("Stopped model server", style="green") + output('\nStopping model server...', style='green') + output('Stopped model server', style='green') def run(bento: BentoInfo, port: int = 3000, timeout:
600):
diff --git a/src/openllm/model.py b/src/openllm/model.py
index 655647c5a..8e78edbe7 100644
--- a/src/openllm/model.py
+++ b/src/openllm/model.py
@@ -10,10 +10,10 @@
 from openllm.common import VERBOSE_LEVEL, BentoInfo, load_config, output
 from openllm.repo import ensure_repo_updated, parse_repo_url

-app = OpenLLMTyper(help="manage models")
+app = OpenLLMTyper(help='manage models')


-@app.command(help="get model")
+@app.command(help='get model')
 def get(tag: str, repo: Optional[str] = None, verbose: bool = False):
     if verbose:
         VERBOSE_LEVEL.set(20)
@@ -22,10 +22,8 @@ def get(tag: str, repo: Optional[str] = None, verbose: bool = False):
     output(bento_info)


-@app.command(name="list", help="list available models")
-def list_model(
-    tag: Optional[str] = None, repo: Optional[str] = None, verbose: bool = False
-):
+@app.command(name='list', help='list available models')
+def list_model(tag: Optional[str] = None, repo: Optional[str] = None, verbose: bool = False):
     if verbose:
         VERBOSE_LEVEL.set(20)

@@ -43,46 +41,42 @@ def is_seen(value):
     table = tabulate.tabulate(
         [
             [
-                "" if is_seen(bento.name) else bento.name,
+                '' if is_seen(bento.name) else bento.name,
                 bento.tag,
                 bento.repo.name,
                 bento.pretty_gpu,
-                ",".join(bento.platforms),
+                ','.join(bento.platforms),
             ]
             for bento in bentos
         ],
-        headers=["model", "version", "repo", "required GPU RAM", "platforms"],
+        headers=['model', 'version', 'repo', 'required GPU RAM', 'platforms'],
     )
     output(table)


-def ensure_bento(
-    model: str,
-    target: Optional[DeploymentTarget] = None,
-    repo_name: Optional[str] = None,
-) -> BentoInfo:
+def ensure_bento(model: str, target: Optional[DeploymentTarget] = None, repo_name: Optional[str] = None) -> BentoInfo:
     bentos = list_bento(model, repo_name=repo_name)
     if len(bentos) == 0:
-        output(f"No model found for {model}", style="red")
+        output(f'No model found for {model}', style='red')
         raise typer.Exit(1)

     if len(bentos) == 1:
-        output(f"Found model {bentos[0]}", style="green")
+        output(f'Found model {bentos[0]}', style='green')
         if target is not None and can_run(bentos[0], target) <= 0:
             output(
-                f"The machine({target.name}) with {target.accelerators_repr} does not appear to have sufficient "
-                f"resources to run model {bentos[0]}\n",
-                style="yellow",
+                f'The machine({target.name}) with {target.accelerators_repr} does not appear to have sufficient '
+                f'resources to run model {bentos[0]}\n',
+                style='yellow',
             )
         return bentos[0]

     # multiple models, pick one according to target
-    output(f"Multiple models match {model}, did you mean one of these?", style="red")
+    output(f'Multiple models match {model}, did you mean one of these?', style='red')
     list_model(model, repo=repo_name)
     raise typer.Exit(1)


-NUMBER_RE = re.compile(r"\d+")
+NUMBER_RE = re.compile(r'\d+')


 def _extract_first_number(s: str):
@@ -94,30 +88,28 @@ def _extract_first_number(s: str):


 def list_bento(
-    tag: typing.Optional[str] = None,
-    repo_name: typing.Optional[str] = None,
-    include_alias: bool = False,
+    tag: typing.Optional[str] = None, repo_name: typing.Optional[str] = None, include_alias: bool = False
 ) -> typing.List[BentoInfo]:
     ensure_repo_updated()

-    if repo_name is None and tag and "/" in tag:
-        repo_name, tag = tag.split("/", 1)
+    if repo_name is None and tag and '/' in tag:
+        repo_name, tag = tag.split('/', 1)

     if repo_name is not None:
         config = load_config()
         if repo_name not in config.repos:
-            output(f"Repo `{repo_name}` not found, did you mean one of these?")
+            output(f'Repo `{repo_name}` not found, did you mean one of these?')
             for repo_name in config.repos:
-                output(f"  {repo_name}")
+                output(f'  {repo_name}')
             raise typer.Exit(1)

     if not tag:
-        glob_pattern = "bentoml/bentos/*/*"
-    elif ":" in tag:
-        bento_name, version = tag.split(":")
-        glob_pattern = f"bentoml/bentos/{bento_name}/{version}"
+        glob_pattern = 'bentoml/bentos/*/*'
+    elif ':' in tag:
+        bento_name, version = tag.split(':')
+        glob_pattern = f'bentoml/bentos/{bento_name}/{version}'
     else:
-        glob_pattern = f"bentoml/bentos/{tag}/*"
+        glob_pattern = f'bentoml/bentos/{tag}/*'

     model_list = []
     config = load_config()
@@ -128,15 +120,10 @@ def list_bento(
         paths = sorted(
             repo.path.glob(glob_pattern),
-            key=lambda x: (
-                x.parent.name,
-                _extract_first_number(x.name),
-                len(x.name),
-                x.name,
-            ),
+            key=lambda x: (x.parent.name, _extract_first_number(x.name), len(x.name), x.name),
         )
         for path in paths:
-            if path.is_dir() and (path / "bento.yaml").exists():
+            if path.is_dir() and (path / 'bento.yaml').exists():
                 model = BentoInfo(repo=repo, path=path)
             elif path.is_file():
                 with open(path) as f:
diff --git a/src/openllm/repo.py b/src/openllm/repo.py
index caec0b73e..e6e43f390 100644
--- a/src/openllm/repo.py
+++ b/src/openllm/repo.py
@@ -8,46 +8,43 @@
 import typer

 from openllm.analytic import OpenLLMTyper
-from openllm.common import (INTERACTIVE, REPO_DIR, VERBOSE_LEVEL, RepoInfo,
-                            load_config, output, save_config)
+from openllm.common import INTERACTIVE, REPO_DIR, VERBOSE_LEVEL, RepoInfo, load_config, output, save_config

 UPDATE_INTERVAL = datetime.timedelta(days=3)

-app = OpenLLMTyper(help="manage repos")
+app = OpenLLMTyper(help='manage repos')


-@app.command(name="list", help="list available repo")
+@app.command(name='list', help='list available repo')
 def list_repo(verbose: bool = False):
     if verbose:
         VERBOSE_LEVEL.set(20)
     config = load_config()
     pyaml.pprint(
-        [parse_repo_url(repo, name) for name, repo in config.repos.items()],
-        sort_dicts=False,
-        sort_keys=False,
+        [parse_repo_url(repo, name) for name, repo in config.repos.items()], sort_dicts=False, sort_keys=False
     )


-@app.command(help="remove given repo")
+@app.command(help='remove given repo')
 def remove(name: str):
     config = load_config()
     if name not in config.repos:
-        output(f"Repo {name} does not exist", style="red")
+        output(f'Repo {name} does not exist', style='red')
         return
     del config.repos[name]
     save_config(config)
-    output(f"Repo {name} removed", style="green")
+    output(f'Repo {name} removed', style='green')


 def _complete_alias(repo_name: str):
     from openllm.model import list_bento

     for bento in list_bento(repo_name=repo_name):
-        alias = bento.labels.get("openllm_alias", "").strip()
+        alias = bento.labels.get('openllm_alias', '').strip()
         if alias:
-            for a in alias.split(","):
-                with open(bento.path.parent / a, "w") as f:
+            for a in alias.split(','):
+                with open(bento.path.parent / a, 'w') as f:
                     f.write(bento.version)
@@ -58,20 +55,15 @@ def _clone_repo(repo: RepoInfo):
     import subprocess

     try:
-        subprocess.run(
-            ["git", "clone", "--depth=1", "-b", repo.branch, repo.url, str(repo.path)],
-            check=True,
-        )
+        subprocess.run(['git', 'clone', '--depth=1', '-b', repo.branch, repo.url, str(repo.path)], check=True)
     except (subprocess.CalledProcessError, FileNotFoundError):
         import dulwich
         import dulwich.porcelain

-        dulwich.porcelain.clone(
-            repo.url, str(repo.path), checkout=True, depth=1, branch=repo.branch
-        )
+        dulwich.porcelain.clone(repo.url, str(repo.path), checkout=True, depth=1, branch=repo.branch)


-@app.command(help="update default repo")
+@app.command(help='update default repo')
 def update():
     config = load_config()
     repos_in_use = set()
@@ -83,59 +75,59 @@ def update():
         repo.path.parent.mkdir(parents=True, exist_ok=True)
         try:
             _clone_repo(repo)
-            output("")
-            output(f"Repo `{repo.name}` updated", style="green")
+            output('')
+            output(f'Repo `{repo.name}` updated', style='green')
         except Exception as e:
             shutil.rmtree(repo.path, ignore_errors=True)
-            output(f"Failed to clone repo {repo.name}", style="red")
+            output(f'Failed to clone repo {repo.name}', style='red')
             output(e)
-    for c in REPO_DIR.glob("*/*/*/*"):
+    for c in REPO_DIR.glob('*/*/*/*'):
         repo_spec = tuple(c.parts[-4:])
         if repo_spec not in repos_in_use:
             shutil.rmtree(c, ignore_errors=True)
-            output(f"Removed unused repo cache {c}")
+            output(f'Removed unused repo cache {c}')

-    with open(REPO_DIR / "last_update", "w") as f:
+    with open(REPO_DIR / 'last_update', 'w') as f:
         f.write(datetime.datetime.now().isoformat())
     for repo_name in config.repos:
         _complete_alias(repo_name)


 def ensure_repo_updated():
-    last_update_file = REPO_DIR / "last_update"
+    last_update_file = REPO_DIR / 'last_update'
     if not last_update_file.exists():
         if INTERACTIVE.get():
             choice = questionary.confirm(
-                "The repo cache is never updated, do you want to update it to fetch the latest model list?"
+                'The repo cache is never updated, do you want to update it to fetch the latest model list?'
             ).ask()
             if choice:
                 update()
             return
         else:
             output(
-                "The repo cache is never updated, please run `openllm repo update` to fetch the latest model list",
-                style="red",
+                'The repo cache is never updated, please run `openllm repo update` to fetch the latest model list',
+                style='red',
             )
             raise typer.Exit(1)
     last_update = datetime.datetime.fromisoformat(last_update_file.read_text().strip())
     if datetime.datetime.now() - last_update > UPDATE_INTERVAL:
         if INTERACTIVE.get():
             choice = questionary.confirm(
-                "The repo cache is outdated, do you want to update it to fetch the latest model list?"
+                'The repo cache is outdated, do you want to update it to fetch the latest model list?'
             ).ask()
             if choice:
                 update()
         else:
             output(
-                "The repo cache is outdated, please run `openllm repo update` to fetch the latest model list",
-                style="yellow",
+                'The repo cache is outdated, please run `openllm repo update` to fetch the latest model list',
+                style='yellow',
             )


 GIT_HTTP_RE = re.compile(
-    r"(?P<schema>git|ssh|http|https):\/\/(?P<server>[\.\w\d\-]+)\/(?P<owner>[\w\d\-]+)\/(?P<repo>[\w\d\-\_\.]+)(@(?P<branch>.+))?(\/)?$"
+    r'(?P<schema>git|ssh|http|https):\/\/(?P<server>[\.\w\d\-]+)\/(?P<owner>[\w\d\-]+)\/(?P<repo>[\w\d\-\_\.]+)(@(?P<branch>.+))?(\/)?$'
 )
 GIT_SSH_RE = re.compile(
-    r"git@(?P<server>[\.\w\d-]+):(?P<owner>[\w\d\-]+)\/(?P<repo>[\w\d\-\_\.]+)(@(?P<branch>.+))?(\/)?$"
+    r'git@(?P<server>[\.\w\d-]+):(?P<owner>[\w\d\-]+)\/(?P<repo>[\w\d\-\_\.]+)(@(?P<branch>.+))?(\/)?$'
 )
@@ -156,27 +148,27 @@ def parse_repo_url(repo_url: str, repo_name: typing.Optional[str] = None) -> Rep
     """
     match = GIT_HTTP_RE.match(repo_url)
     if match:
-        schema = match.group("schema")
+        schema = match.group('schema')
     else:
         match = GIT_SSH_RE.match(repo_url)
         if not match:
-            raise ValueError(f"Invalid git repo url: {repo_url}")
+            raise ValueError(f'Invalid git repo url: {repo_url}')
         schema = None

-    if match.group("branch") is not None:
-        repo_url = repo_url[: match.start("branch") - 1]
+    if match.group('branch') is not None:
+        repo_url = repo_url[: match.start('branch') - 1]

-    server = match.group("server")
-    owner = match.group("owner")
-    repo = match.group("repo")
-    if repo.endswith(".git"):
+    server = match.group('server')
+    owner = match.group('owner')
+    repo = match.group('repo')
+    if repo.endswith('.git'):
         repo = repo[:-4]
-    branch = match.group("branch") or "main"
+    branch = match.group('branch') or 'main'
     if schema is not None:
-        repo_url = f"{schema}://{server}/{owner}/{repo}"
+        repo_url = f'{schema}://{server}/{owner}/{repo}'
     else:
-        repo_url = f"git@{server}:{owner}/{repo}"
+        repo_url = f'git@{server}:{owner}/{repo}'

     path = REPO_DIR / server / owner / repo / branch
     return RepoInfo(
@@ -190,40 +182,35 @@ def parse_repo_url(repo_url: str, repo_name: typing.Optional[str] = None) -> Rep
     )


-@app.command(help="add new repo")
+@app.command(help='add new repo')
 def add(name: str, repo: str):
     name = name.lower()
     if not name.isidentifier():
-        output(
-            f"Invalid repo name: {name}, should only contain letters, numbers and underscores",
-            style="red",
-        )
+        output(f'Invalid repo name: {name}, should only contain letters, numbers and underscores', style='red')
         return

     try:
         parse_repo_url(repo)
     except ValueError:
-        output(f"Invalid repo url: {repo}", style="red")
+        output(f'Invalid repo url: {repo}', style='red')
         return

     config = load_config()
     if name in config.repos:
-        override = questionary.confirm(
-            f"Repo {name} already exists({config.repos[name]}), override?"
-        ).ask()
+        override = questionary.confirm(f'Repo {name} already exists({config.repos[name]}), override?').ask()
         if not override:
             return
     config.repos[name] = repo
     save_config(config)
-    output(f"Repo {name} added", style="green")
+    output(f'Repo {name} added', style='green')


-@app.command(help="get default repo path")
+@app.command(help='get default repo path')
 def default():
-    output((info := parse_repo_url(load_config().repos["default"], "default")).path)
+    output((info := parse_repo_url(load_config().repos['default'], 'default')).path)
     return info.path


-if __name__ == "__main__":
+if __name__ == '__main__':
     app()
diff --git a/src/openllm/venv.py b/src/openllm/venv.py
index f68e08f21..084b6555d 100644
--- a/src/openllm/venv.py
+++ b/src/openllm/venv.py
@@ -7,97 +7,67 @@
 import typer
 import yaml

-from openllm.common import (VENV_DIR, VERBOSE_LEVEL, BentoInfo, EnvVars,
-                            VenvSpec, output, run_command)
+from openllm.common import VENV_DIR, VERBOSE_LEVEL, BentoInfo, EnvVars, VenvSpec, output, run_command


 @functools.lru_cache
-def _resolve_bento_venv_spec(
-    bento: BentoInfo, runtime_envs: Optional[EnvVars] = None,
-) -> VenvSpec:
-    ver_file = bento.path / "env" / "python" / "version.txt"
-    assert ver_file.exists(), f"cannot find version file in {bento.path}"
+def _resolve_bento_venv_spec(bento: BentoInfo, runtime_envs: Optional[EnvVars] = None) -> VenvSpec:
+    ver_file = bento.path / 'env' / 'python' / 'version.txt'
+    assert ver_file.exists(), f'cannot find version file in {bento.path}'

-    lock_file = bento.path / "env" / "python" / "requirements.lock.txt"
+    lock_file = bento.path / 'env' / 'python' / 'requirements.lock.txt'
     if not lock_file.exists():
-        lock_file = bento.path / "env" / "python" / "requirements.txt"
+        lock_file = bento.path / 'env' / 'python' / 'requirements.txt'

     ver = ver_file.read_text().strip()
     reqs = lock_file.read_text().strip()

-    bentofile = bento.path / "bento.yaml"
-    bento_env_list = yaml.safe_load(bentofile.read_text()).get("envs", [])
-    bento_envs = {e["name"]: e.get("value") for e in bento_env_list}
-    envs = (
-        {k: runtime_envs.get(k, v) for k, v in bento_envs.items()}
-        if runtime_envs
-        else {}
-    )
+    bentofile = bento.path / 'bento.yaml'
+    bento_env_list = yaml.safe_load(bentofile.read_text()).get('envs', [])
+    bento_envs = {e['name']: e.get('value') for e in bento_env_list}
+    envs = {k: runtime_envs.get(k, v) for k, v in bento_envs.items()} if runtime_envs else {}

     return VenvSpec(
-        python_version=ver,
-        requirements_txt=reqs,
-        name_prefix=f"{bento.tag.replace(':', '_')}-1-",
-        envs=EnvVars(envs),
+        python_version=ver, requirements_txt=reqs, name_prefix=f"{bento.tag.replace(':', '_')}-1-", envs=EnvVars(envs)
     )


 def _ensure_venv(venv_spec: VenvSpec) -> pathlib.Path:
     venv = VENV_DIR / str(hash(venv_spec))
-    if venv.exists() and not (venv / "DONE").exists():
+    if venv.exists() and not (venv / 'DONE').exists():
         shutil.rmtree(venv, ignore_errors=True)
     if not venv.exists():
-        output(f"Installing model dependencies({venv})...", style="green")
+        output(f'Installing model dependencies({venv})...', style='green')

-        venv_py = (
-            venv / "Scripts" / "python.exe"
-            if os.name == "nt"
-            else venv / "bin" / "python"
-        )
+        venv_py = venv / 'Scripts' / 'python.exe' if os.name == 'nt' else venv / 'bin' / 'python'
         try:
+            run_command(['python', '-m', 'uv', 'venv', venv], silent=VERBOSE_LEVEL.get() < 10)
             run_command(
-                ["python", "-m", "uv", "venv", venv], silent=VERBOSE_LEVEL.get() < 10
-            )
-            run_command(
-                ["python", "-m", "uv", "pip", "install", "-p", str(venv_py), "bentoml"],
+                ['python', '-m', 'uv', 'pip', 'install', '-p', str(venv_py), 'bentoml'],
                 silent=VERBOSE_LEVEL.get() < 10,
                 env=venv_spec.envs,
             )
-            with open(venv / "requirements.txt", "w") as f:
+            with open(venv / 'requirements.txt', 'w') as f:
                 f.write(venv_spec.normalized_requirements_txt)
             run_command(
-                [
-                    "python",
-                    "-m",
-                    "uv",
-                    "pip",
-                    "install",
-                    "-p",
-                    str(venv_py),
-                    "-r",
-                    venv / "requirements.txt",
-                ],
+                ['python', '-m', 'uv', 'pip', 'install', '-p', str(venv_py), '-r', venv / 'requirements.txt'],
                 silent=VERBOSE_LEVEL.get() < 10,
                 env=venv_spec.envs,
             )
-            with open(venv / "DONE", "w") as f:
-                f.write("DONE")
+            with open(venv / 'DONE', 'w') as f:
+                f.write('DONE')
         except Exception as e:
             shutil.rmtree(venv, ignore_errors=True)
             if VERBOSE_LEVEL.get() >= 10:
-                output(e, style="red")
-            output(
-                f"Failed to install dependencies to {venv}. Cleaned up.", style="red"
-            )
+                output(e, style='red')
+            output(f'Failed to install dependencies to {venv}. Cleaned up.', style='red')
             raise typer.Exit(1)
-        output(f"Successfully installed dependencies to {venv}.", style="green")
+        output(f'Successfully installed dependencies to {venv}.', style='green')
         return venv
     else:
         return venv


-def ensure_venv(
-    bento: BentoInfo, runtime_envs: Optional[EnvVars] = None
-) -> pathlib.Path:
+def ensure_venv(bento: BentoInfo, runtime_envs: Optional[EnvVars] = None) -> pathlib.Path:
     venv_spec = _resolve_bento_venv_spec(bento, runtime_envs=EnvVars(runtime_envs))
     venv = _ensure_venv(venv_spec)
     assert venv is not None
@@ -109,6 +79,6 @@ def check_venv(bento: BentoInfo) -> bool:
     venv = VENV_DIR / str(hash(venv_spec))
     if not venv.exists():
         return False
-    if venv.exists() and not (venv / "DONE").exists():
+    if venv.exists() and not (venv / 'DONE').exists():
         return False
     return True