Add Optimum Intel (#2609)
NoushNabi authored May 13, 2024
1 parent 6bd30be commit f868c99
Showing 5 changed files with 74 additions and 11 deletions.
14 changes: 14 additions & 0 deletions docs/huggingface_models.md
@@ -29,3 +29,17 @@ helm-run \
--suite v1 \
--max-eval-instances 10
```

To use Optimum Intel, add the `--openvino` flag to `helm-run`. Optimum Intel provides a simple interface for optimizing Transformer models and converting them to the OpenVINO™ Intermediate Representation (IR) format, which accelerates end-to-end pipelines on Intel® architectures using the OpenVINO™ runtime. The model runs on the CPU.

Examples:

```bash
# Run boolq on stanford-crfm/BioMedLM optimized by Optimum Intel OpenVINO
helm-run \
--run-entries boolq:model=stanford-crfm/BioMedLM \
--enable-huggingface-models stanford-crfm/BioMedLM \
--suite v1 \
--max-eval-instances 10 \
--openvino
```
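
The `--openvino` flag applies equally to local checkpoints registered with `--enable-local-huggingface-models` (see `run.py` below). A hypothetical variant, where `/path/to/BioMedLM` is a placeholder for a local copy of the model (HELM registers it under `huggingface/<directory name>`):

```bash
# Run boolq on a local checkpoint optimized by Optimum Intel OpenVINO
helm-run \
    --run-entries boolq:model=huggingface/BioMedLM \
    --enable-local-huggingface-models /path/to/BioMedLM \
    --suite v1 \
    --max-eval-instances 10 \
    --openvino
```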
3 changes: 3 additions & 0 deletions setup.cfg
@@ -114,6 +114,8 @@ unitxt =
aleph-alpha =
aleph-alpha-client~=2.14.0
tokenizers>=0.13.3
openvino =
optimum[openvino]~=1.19

allenai =
ai2-olmo~=0.2
@@ -158,6 +160,7 @@ models =
crfm-helm[together]
crfm-helm[tsinghua]
crfm-helm[yandex]
crfm-helm[openvino]

vlm =
crfm-helm[openai]
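The `openvino` extra keeps the Optimum Intel dependency opt-in. A minimal install sketch, assuming the usual extras syntax for the published `crfm-helm` package:

```bash
# Pulls in optimum[openvino]~=1.19 alongside HELM
pip install "crfm-helm[openvino]"
```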
17 changes: 12 additions & 5 deletions src/helm/benchmark/huggingface_registration.py
@@ -1,5 +1,5 @@
import os
from typing import Optional
from typing import Optional, Dict, Union

from helm.benchmark.model_deployment_registry import (
ClientSpec,
@@ -17,11 +17,16 @@


def register_huggingface_model(
helm_model_name: str, pretrained_model_name_or_path: str, revision: Optional[str] = None
helm_model_name: str,
pretrained_model_name_or_path: str,
revision: Optional[str] = None,
openvino: Optional[bool] = False,
) -> None:
object_spec_args = {"pretrained_model_name_or_path": pretrained_model_name_or_path}
object_spec_args: Dict[str, Union[str, bool]] = {"pretrained_model_name_or_path": pretrained_model_name_or_path}
if revision:
object_spec_args["revision"] = revision
if openvino:
object_spec_args["openvino"] = openvino

# Auto-infer model properties from the tokenizer.
with HuggingFaceTokenizer.create_tokenizer(**object_spec_args) as tokenizer:
@@ -71,7 +76,7 @@ def register_huggingface_model(
register_tokenizer_config(tokenizer_config)


def register_huggingface_hub_model_from_flag_value(raw_model_string: str) -> None:
def register_huggingface_hub_model_from_flag_value(raw_model_string: str, openvino=False) -> None:
raw_model_string_parts = raw_model_string.split("@")
pretrained_model_name_or_path: str
revision: Optional[str]
@@ -88,15 +93,17 @@ def register_huggingface_hub_model_from_flag_value(raw_model_string: str) -> None:
helm_model_name=raw_model_string,
pretrained_model_name_or_path=pretrained_model_name_or_path,
revision=revision,
openvino=openvino,
)


def register_huggingface_local_model_from_flag_value(path: str) -> None:
def register_huggingface_local_model_from_flag_value(path: str, openvino=False) -> None:
if not path:
raise ValueError("Path to Hugging Face model must be non-empty")
path_parts = os.path.split(path)
helm_model_name = f"huggingface/{path_parts[-1]}"
register_huggingface_model(
helm_model_name=helm_model_name,
pretrained_model_name_or_path=path,
openvino=openvino,
)
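
For reference, a short sketch of how these registration helpers are invoked (the model name and path are illustrative; the `@revision` syntax and the `openvino` keyword are taken from the diff above):

```python
from helm.benchmark.huggingface_registration import (
    register_huggingface_hub_model_from_flag_value,
    register_huggingface_local_model_from_flag_value,
)

# Hub model pinned to a revision; openvino=True is recorded in
# object_spec_args so the client later loads an OpenVINO model.
register_huggingface_hub_model_from_flag_value("stanford-crfm/BioMedLM@main", openvino=True)

# Local checkpoint; registered as huggingface/<last path component>.
register_huggingface_local_model_from_flag_value("/path/to/BioMedLM", openvino=True)
```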
18 changes: 16 additions & 2 deletions src/helm/benchmark/run.py
@@ -264,6 +264,13 @@ def main():
default=None,
help="Full class name of the Runner class to use. If unset, uses the default Runner.",
)
parser.add_argument(
"--openvino",
action="store_true",
default=False,
help="Experimental: Apply openvino optimization to Hugging Face AutoModelForCausalLM models "
"specified with the --enable-huggingface-models and --enable-local-huggingface-models flags.",
)
add_run_args(parser)
args = parser.parse_args()
validate_args(args)
@@ -275,12 +282,19 @@ def main():
from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value

for huggingface_model_name in args.enable_huggingface_models:
register_huggingface_hub_model_from_flag_value(huggingface_model_name)
if args.openvino:
register_huggingface_hub_model_from_flag_value(huggingface_model_name, args.openvino)
else:
register_huggingface_hub_model_from_flag_value(huggingface_model_name)

if args.enable_local_huggingface_models:
from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value

for huggingface_model_path in args.enable_local_huggingface_models:
register_huggingface_local_model_from_flag_value(huggingface_model_path)
if args.openvino:
register_huggingface_local_model_from_flag_value(huggingface_model_path, args.openvino)
else:
register_huggingface_local_model_from_flag_value(huggingface_model_path)

run_entries: List[RunEntry] = []
if args.conf_paths:
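Since `openvino` defaults to `False` in both helpers, the `if args.openvino:` branches are equivalent to a single unconditional call; a branch-free sketch of the same logic:

```python
for huggingface_model_name in args.enable_huggingface_models:
    # args.openvino is False unless --openvino was passed, matching the default.
    register_huggingface_hub_model_from_flag_value(huggingface_model_name, openvino=args.openvino)
```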
33 changes: 29 additions & 4 deletions src/helm/clients/huggingface_client.py
@@ -53,17 +53,42 @@ class HuggingFaceRequest(TypedDict):
class HuggingFaceServer:
"""A thin wrapper around a Hugging Face AutoModelForCausalLM for HuggingFaceClient to call."""

def __init__(self, pretrained_model_name_or_path: str, **kwargs):
def __init__(self, pretrained_model_name_or_path: str, openvino=False, **kwargs):
if torch.cuda.is_available():
hlog("CUDA is available, initializing with a GPU...")
self.device: str = "cuda:0"
else:
self.device = "cpu"
with htrack_block(f"Loading Hugging Face model {pretrained_model_name_or_path}"):
# WARNING this may fail if your GPU does not have enough memory
self.model = AutoModelForCausalLM.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=True, **kwargs
).to(self.device)
if openvino:
"""
Optimum Intel provides a simple interface to optimize Transformer models and convert them to \
OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \
Intel® architectures using OpenVINO™ runtime.
"""
from pathlib import Path
from helm.common.optional_dependencies import handle_module_not_found_error

try:
from optimum.intel.openvino import OVModelForCausalLM
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["openvino"])

model_file = Path(pretrained_model_name_or_path) / "openvino_model.xml"
if model_file.exists():
export = False
else:
export = True

self.device = "cpu"
self.model = OVModelForCausalLM.from_pretrained(
pretrained_model_name_or_path, export=export, trust_remote_code=True, **kwargs
).to(self.device)
else:
self.model = AutoModelForCausalLM.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=True, **kwargs
).to(self.device)
with htrack_block(f"Loading Hugging Face tokenizer for model {pretrained_model_name_or_path}"):
self.wrapped_tokenizer: WrappedPreTrainedTokenizer = HuggingFaceTokenizer.create_tokenizer(
pretrained_model_name_or_path, **kwargs
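
Outside HELM, the OpenVINO branch reduces to Optimum Intel's drop-in replacement for `AutoModelForCausalLM`. A minimal standalone sketch (the model name is illustrative; `export=True` converts the checkpoint to OpenVINO IR on the fly, mirroring the `openvino_model.xml` check above):

```python
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer

# export=True converts the Hugging Face weights to OpenVINO IR at load time;
# use export=False when openvino_model.xml already exists in the directory.
model = OVModelForCausalLM.from_pretrained("gpt2", export=True)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

inputs = tokenizer("OpenVINO runs this prompt on the CPU:", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```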
