
[OpenVINO-EP] Enable EP config options for VPU hardware #5119

Merged (11 commits) on Sep 14, 2020
68 changes: 61 additions & 7 deletions docs/execution_providers/OpenVINO-ExecutionProvider.md
@@ -2,37 +2,91 @@

OpenVINO Execution Provider enables deep learning inference on Intel CPUs, Intel integrated GPUs and Intel<sup>®</sup> Movidius<sup>TM</sup> Vision Processing Units (VPUs). Please refer to [this](https://software.intel.com/en-us/openvino-toolkit/hardware) page for details on the Intel hardware supported.

## Build
### Build
For build instructions, please see the [BUILD page](../../BUILD.md#openvino).

## Runtime configuration options
---

## ONNX Runtime Graph Optimization Level
The OpenVINO backend performs both hardware-dependent and hardware-independent optimizations on the graph so that it is inferred on the target hardware with the best possible performance. In most cases it has been observed that passing in the graph from the input model as-is leads to the best possible optimizations by OpenVINO. For this reason, it is advised to turn off the high-level optimizations performed by ONNX Runtime before handing the graph over to the OpenVINO backend. This can be done using session options as shown below:

1. Python API
### Python API
```
options = onnxruntime.SessionOptions()
options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
sess = onnxruntime.InferenceSession(<path_to_model_file>, options)
```

2. C++ API
### C/C++ API
```
Ort::SessionOptions session_options;
session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_DISABLE_ALL);
```

## Dynamic device selection
## Dynamic device type selection
When ONNX Runtime is built with the OpenVINO Execution Provider, a target hardware option needs to be provided. This build-time option becomes the default target hardware the EP schedules inference on. However, this target may be overridden at runtime to schedule inference on different hardware, as shown below.

Note: This dynamic hardware selection is optional. The EP falls back to the build-time default selection if no dynamic hardware option value is specified.
1. Python API

### Python API
```
import onnxruntime
onnxruntime.capi._pybind_state.set_openvino_device("<hardware_option>")
# Create session after this
```
2. C/C++ API
*This property persists and gets applied to new sessions until it is explicitly unset. To unset, assign an empty string ("").*
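
For example, a minimal sketch of this override; the "MYRIAD" device type and the model path are illustrative placeholders and must match your build and hardware:
```
import onnxruntime

# Override the build-time default target device type.
# "MYRIAD" is illustrative; it must correspond to hardware present on the host.
onnxruntime.capi._pybind_state.set_openvino_device("MYRIAD")

sess = onnxruntime.InferenceSession("model.onnx")  # placeholder model path

# The override persists for subsequent sessions; unset it when done.
onnxruntime.capi._pybind_state.set_openvino_device("")
```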

### C/C++ API

Pass the device type string as the **second** argument of the call to append the OpenVINO EP.
```
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_OpenVINO(sf, "<hardware_option>", ... ));
```

## Enabling VPU Fast-compile
When scheduling inference on a VPU, Fast-compile is an option that speeds up the model's compilation into the VPU device-specific format, which reduces model initialization time. However, enabling this option may slow down inference because some optimizations are not fully applied, so exercise caution when enabling it.

### Python API
```
import onnxruntime
onnxruntime.capi._pybind_state.set_vpu_fast_compile(True)
# Create session after this
```
*This property persists and gets applied to new sessions until it is explicitly unset. To unset, assign the value False to the property.*
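
To see the trade-off described above, the hedged sketch below times session initialization with Fast-compile on and off; it assumes a MYRIAD target has already been selected and uses a placeholder model path:
```
import time
import onnxruntime

def init_seconds(fast_compile):
    # Toggle Fast-compile before creating the session; the setting persists
    # until it is changed again.
    onnxruntime.capi._pybind_state.set_vpu_fast_compile(fast_compile)
    start = time.perf_counter()
    onnxruntime.InferenceSession("model.onnx")  # placeholder model path
    return time.perf_counter() - start

print("fast-compile on :", init_seconds(True))
print("fast-compile off:", init_seconds(False))
```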

### C/C++ API

Pass the boolean value as the **third** argument of the call to append the OpenVINO EP.
```
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_OpenVINO(sf, "<hardware_option>", true, ""));
```


## Dynamic device id selection
After a device type is selected for inference (either at build time or at runtime), a specific physical hardware device of that type available on the host may optionally be specified for inference.

Note: This dynamic device ID selection is optional. The EP falls back to the default device of the selected type if no device ID is specified.

### Python API

Get the list of available OpenVINO-compatible device IDs
```
import onnxruntime
onnxruntime.capi._pybind_state.get_available_openvino_device_ids()
```

Select a device ID from the list obtained above
```
onnxruntime.capi._pybind_state.set_openvino_device_id("<device_id>")
# Create session after this
```
*This property persists and gets applied to new sessions until it is explicitly unset. To unset, assign an empty string ("").*
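
Putting the Python options together, here is a hedged end-to-end sketch; the model path and the "MYRIAD" device type are placeholders, and the device-ID step is skipped if OpenVINO reports no devices:
```
import onnxruntime

pybind = onnxruntime.capi._pybind_state

# Select a target device type ("MYRIAD" is illustrative) and, optionally,
# a specific physical device from the IDs reported by OpenVINO.
pybind.set_openvino_device("MYRIAD")
device_ids = pybind.get_available_openvino_device_ids()
if device_ids:
    pybind.set_openvino_device_id(device_ids[0])

# Optionally trade some inference speed for faster VPU compilation.
pybind.set_vpu_fast_compile(True)

# Let the OpenVINO backend see the graph as-is.
options = onnxruntime.SessionOptions()
options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL

sess = onnxruntime.InferenceSession("model.onnx", options)  # placeholder path
print(sess.get_providers())

# These settings persist across sessions; reset them when done.
pybind.set_openvino_device_id("")
pybind.set_openvino_device("")
pybind.set_vpu_fast_compile(False)
```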

### C/C++ API

Pass the device ID string as the **fourth** argument of the call to append the OpenVINO EP.
```
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_OpenVINO(sf, "<hardware_option>"));
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_OpenVINO(sf, "<hardware_option>", <vpu_fast_compile>, <device_id> ));
```

## ONNX Layers supported using OpenVINO
@@ -5,13 +5,16 @@

#ifdef __cplusplus
extern "C" {
#else
#include <stdbool.h>
#endif

/**
* \param device_id openvino device id, starts from zero.
*/
ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_OpenVINO,
_In_ OrtSessionOptions* options, const char* device_id);
_In_ OrtSessionOptions* options, const char* device_type,
bool enable_vpu_fast_compile, const char* device_id);

#ifdef __cplusplus
}
18 changes: 8 additions & 10 deletions onnxruntime/core/providers/openvino/backend_manager.cc
@@ -21,10 +21,8 @@ GlobalContext& BackendManager::GetGlobalContext() {
return global_context;
}

BackendManager::BackendManager(const onnxruntime::Node* fused_node, const logging::Logger& logger,
std::string dev_id, std::string prec_str) {
subgraph_context_.device_id = dev_id;
subgraph_context_.precision_str = prec_str;
BackendManager::BackendManager(const onnxruntime::Node* fused_node, const logging::Logger& logger) {
auto prec_str = GetGlobalContext().precision_str;
if (prec_str == "FP32") {
subgraph_context_.precision = InferenceEngine::Precision::FP32;
} else if (prec_str == "FP16") {
@@ -51,7 +49,7 @@ BackendManager::BackendManager(const onnxruntime::Node* fused_node, const loggin

auto graph_inputs = fused_node->GetFunctionBody()->Body().GetInputs();
for (auto input : graph_inputs) {
if(subgraph_context_.device_id == "MYRIAD"){
if(GetGlobalContext().device_type == "MYRIAD"){
auto shape = input->Shape();
if(shape != nullptr){
if(shape->dim_size() != 4){
@@ -81,7 +79,7 @@ BackendManager::BackendManager(const onnxruntime::Node* fused_node, const loggin

if (ModelHasBatchedInputs(model_proto_) &&
GetGlobalContext().is_wholly_supported_graph &&
subgraph_context_.device_id == "HDDL") {
GetGlobalContext().device_type == "HDDL") {
subgraph_context_.enable_batching = true;
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model can be Batch inferenced \n";
auto model_copy = ReWriteBatchDimWithOne(model_proto_);
@@ -212,9 +210,9 @@ std::vector<std::vector<int64_t>> GetInputTensorShapes(Ort::CustomOpApi& api,
}

std::string MakeMapKeyString(std::vector<std::vector<int64_t>>& shapes,
std::string& device_id) {
std::string& device_type) {
std::string key;
key += device_id;
key += device_type;
key += "|"; //separator
for (auto shape : shapes) {
for (auto dim : shape) {
@@ -267,9 +265,9 @@ BackendManager::ReWriteBatchDimWithOne(const ONNX_NAMESPACE::ModelProto& model_p
void BackendManager::Compute(Ort::CustomOpApi api, OrtKernelContext* context) {
if (subgraph_context_.has_dynamic_input_shape) {
std::vector<std::vector<int64_t>> tensor_shapes = GetInputTensorShapes(api, context);
auto key = MakeMapKeyString(tensor_shapes, subgraph_context_.device_id);
auto key = MakeMapKeyString(tensor_shapes, GetGlobalContext().device_type);

if(subgraph_context_.device_id == "MYRIAD"){
if(GetGlobalContext().device_type == "MYRIAD"){

#if (defined OPENVINO_2020_2) || (defined OPENVINO_2020_3)
for(size_t i = 0; i < subgraph_context_.input_indexes.size(); i++){
3 changes: 1 addition & 2 deletions onnxruntime/core/providers/openvino/backend_manager.h
@@ -20,8 +20,7 @@ namespace openvino_ep {
// Singleton class that manages all the backends
class BackendManager {
public:
BackendManager(const onnxruntime::Node* fused_node, const logging::Logger& logger,
std::string dev_id, std::string prec_str);
BackendManager(const onnxruntime::Node* fused_node, const logging::Logger& logger);
void Compute(Ort::CustomOpApi api, OrtKernelContext* context);
void ShutdownBackendManager();
static GlobalContext& GetGlobalContext();
8 changes: 3 additions & 5 deletions onnxruntime/core/providers/openvino/backend_utils.cc
@@ -42,16 +42,13 @@ void DumpOnnxModelProto(const ONNX_NAMESPACE::ModelProto& model_proto, std::stri
#endif

std::shared_ptr<InferenceEngine::CNNNetwork>
CreateCNNNetwork(const ONNX_NAMESPACE::ModelProto& model_proto, const SubGraphContext& subgraph_context, std::map<std::string, std::shared_ptr<ngraph::Node>>& const_outputs_map) {
CreateCNNNetwork(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context, const SubGraphContext& subgraph_context, std::map<std::string, std::shared_ptr<ngraph::Node>>& const_outputs_map) {


#if (defined OPENVINO_2020_2) || (defined OPENVINO_2020_3)
ORT_UNUSED_PARAMETER(const_outputs_map);
#endif

InferenceEngine::Precision precision = subgraph_context.precision;
std::string device_id = subgraph_context.device_id;

std::istringstream model_stream{model_proto.SerializeAsString()};
std::shared_ptr<ngraph::Function> ng_function;

@@ -70,7 +67,8 @@ CreateCNNNetwork(const ONNX_NAMESPACE::ModelProto& model_proto, const SubGraphCo
ORT_THROW(log_tag + "[OpenVINO-EP] Unknown exception while importing model to nGraph Func");
}

if (device_id == "GPU" && precision == InferenceEngine::Precision::FP16) {
if (global_context.device_type == "GPU" &&
subgraph_context.precision == InferenceEngine::Precision::FP16) {
//FP16 transformations
ngraph::pass::ConvertFP32ToFP16().run_on_function(ng_function);
ng_function->validate_nodes_and_infer_types();
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/openvino/backend_utils.h
@@ -23,7 +23,7 @@ void SetIODefs(const ONNX_NAMESPACE::ModelProto& model_proto,
std::map<std::string, std::shared_ptr<ngraph::Node>>& const_outputs_map);

std::shared_ptr<InferenceEngine::CNNNetwork>
CreateCNNNetwork(const ONNX_NAMESPACE::ModelProto& model_proto, const SubGraphContext& subgraph_context, std::map<std::string,
CreateCNNNetwork(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context, const SubGraphContext& subgraph_context, std::map<std::string,
std::shared_ptr<ngraph::Node>>& const_outputs_map);

int GetFirstAvailableDevice(GlobalContext& global_context);
@@ -16,7 +16,7 @@ std::shared_ptr<IBackend>
BackendFactory::MakeBackend(const ONNX_NAMESPACE::ModelProto& model_proto,
GlobalContext& global_context,
const SubGraphContext& subgraph_context) {
std::string type = subgraph_context.device_id;
std::string type = global_context.device_type;
if (type == "CPU" || type == "GPU" || type == "MYRIAD" || type == "HETERO:FPGA,CPU") {
return std::make_shared<BasicBackend>(model_proto, global_context, subgraph_context);
} else if (type == "HDDL") {
19 changes: 14 additions & 5 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -36,7 +36,7 @@ BasicBackend::BasicBackend(const ONNX_NAMESPACE::ModelProto& model_proto,
const SubGraphContext& subgraph_context)
: global_context_(global_context), subgraph_context_(subgraph_context) {

ie_cnn_network_ = CreateCNNNetwork(model_proto, subgraph_context_, const_outputs_map_);
ie_cnn_network_ = CreateCNNNetwork(model_proto, global_context_, subgraph_context_, const_outputs_map_);
SetIODefs(model_proto, ie_cnn_network_, subgraph_context_.output_names, const_outputs_map_);
InferenceEngine::ExecutableNetwork exe_network;

@@ -49,11 +49,20 @@ BasicBackend::BasicBackend(const ONNX_NAMESPACE::ModelProto& model_proto,
if(subgraph_context_.is_constant)
return;
std::map<std::string, std::string> config;
if(subgraph_context_.device_id == "MYRIAD" && subgraph_context_.set_vpu_config){
config["VPU_DETECT_NETWORK_BATCH"] = CONFIG_VALUE(NO);
if(global_context_.device_type == "MYRIAD"){

if(subgraph_context_.set_vpu_config) {
config["VPU_DETECT_NETWORK_BATCH"] = CONFIG_VALUE(NO);
}

if(global_context_.enable_vpu_fast_compile) {
config["VPU_HW_INJECT_STAGES"] = CONFIG_VALUE(NO);
config["VPU_COPY_OPTIMIZATION"] = CONFIG_VALUE(NO);
}
}
std::string& hw_target = (global_context_.device_id != "") ? global_context_.device_id : global_context_.device_type;
try {
exe_network = global_context_.ie_core.LoadNetwork(*ie_cnn_network_, subgraph_context_.device_id, config);
exe_network = global_context_.ie_core.LoadNetwork(*ie_cnn_network_, hw_target, config);
} catch (InferenceEngine::details::InferenceEngineException e) {
ORT_THROW(log_tag + " Exception while Loading Network for graph: " + subgraph_context_.subgraph_name + ": " + e.what());
} catch (...) {
@@ -228,4 +237,4 @@ void BasicBackend::Infer(Ort::CustomOpApi& ort, OrtKernelContext* context) {
}

} // namespace openvino_ep
} // namespace onnxruntime
} // namespace onnxruntime
@@ -47,7 +47,7 @@ VADMBackend::VADMBackend(const ONNX_NAMESPACE::ModelProto& model_proto,
// sets number of maximum parallel inferences
num_inf_reqs_ = 8;

ie_cnn_network_ = CreateCNNNetwork(model_proto, subgraph_context_, const_outputs_map_);
ie_cnn_network_ = CreateCNNNetwork(model_proto, global_context_, subgraph_context_, const_outputs_map_);

SetIODefs(model_proto, ie_cnn_network_, subgraph_context_.output_names, const_outputs_map_);
std::map<std::string, std::string> config;
6 changes: 4 additions & 2 deletions onnxruntime/core/providers/openvino/contexts.h
@@ -12,6 +12,10 @@ namespace openvino_ep {
struct GlobalContext {
InferenceEngine::Core ie_core;
bool is_wholly_supported_graph = false;
bool enable_vpu_fast_compile = false;
std::string device_type;
std::string precision_str;
std::string device_id;
std::vector<bool> deviceAvailableList = {true, true, true, true, true, true, true, true};
std::vector<std::string> deviceTags = {"0", "1", "2", "3", "4", "5", "6", "7"};
};
@@ -29,9 +33,7 @@ struct SubGraphContext {
std::unordered_map<std::string, int> input_names;
#endif
std::unordered_map<std::string, int> output_names;
std::string device_id;
InferenceEngine::Precision precision;
std::string precision_str;
};

} // namespace openvino_ep
33 changes: 29 additions & 4 deletions onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -19,7 +19,30 @@ namespace onnxruntime {
constexpr const char* OpenVINO = "OpenVINO";

OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProviderInfo& info)
: IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider}, info_(info) {
: IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider} {

openvino_ep::BackendManager::GetGlobalContext().device_type = info.device_type_;
openvino_ep::BackendManager::GetGlobalContext().precision_str = info.precision_;
openvino_ep::BackendManager::GetGlobalContext().enable_vpu_fast_compile = info.enable_vpu_fast_compile_;
if(info.device_id_ != "") {
bool device_found = false;
auto available_devices = openvino_ep::BackendManager::GetGlobalContext().ie_core.GetAvailableDevices();
for(auto device : available_devices) {
if(device == info.device_id_) {
device_found = true;
break;
}
}
if(!device_found) {
std::string err_msg = std::string("Device not found : ") + info.device_id_ + "\nChoose one of:\n";
for(auto device : available_devices) {
err_msg = err_msg + device + "\n";
}
ORT_THROW(err_msg);
}
}
openvino_ep::BackendManager::GetGlobalContext().device_id = info.device_id_;

AllocatorCreationInfo device_info(
[](int) {
return std::make_unique<CPUAllocator>(OrtMemoryInfo(OpenVINO, OrtDeviceAllocator));
@@ -36,9 +59,11 @@ OpenVINOExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_v
std::vector<std::unique_ptr<ComputeCapability>> result;

#if (defined OPENVINO_2020_2) || (defined OPENVINO_2020_3)
result = openvino_ep::GetCapability_2020_2(graph_viewer, info_.device_id_);
result = openvino_ep::GetCapability_2020_2(graph_viewer,
openvino_ep::BackendManager::GetGlobalContext().device_type);
#elif defined OPENVINO_2020_4
result = openvino_ep::GetCapability_2020_4(graph_viewer, info_.device_id_);
result = openvino_ep::GetCapability_2020_4(graph_viewer,
openvino_ep::BackendManager::GetGlobalContext().device_type);
#endif

return result;
@@ -49,7 +74,7 @@ common::Status OpenVINOExecutionProvider::Compile(
std::vector<NodeComputeInfo>& node_compute_funcs) {
for (const auto& fused_node : fused_nodes) {
NodeComputeInfo compute_info;
std::shared_ptr<openvino_ep::BackendManager> backend_manager = std::make_shared<openvino_ep::BackendManager>(fused_node, *GetLogger(), info_.device_id_, info_.precision_);
std::shared_ptr<openvino_ep::BackendManager> backend_manager = std::make_shared<openvino_ep::BackendManager>(fused_node, *GetLogger());

compute_info.create_state_func =
[backend_manager](ComputeContext* context, FunctionState* state) {