diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt index 5159b0da45170..0845ff2d4c8a2 100644 --- a/ThirdPartyNotices.txt +++ b/ThirdPartyNotices.txt @@ -4804,3 +4804,20 @@ INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +----- + +MurmurHash3 + +MIT license + +https://github.com/aappleby/smhasher + +SMHasher is a test suite designed to test the distribution, collision, and +performance properties of non-cryptographic hash functions. +This is the home for the MurmurHash family of hash functions along with the +SMHasher test suite used to verify them. +SMHasher is released under the MIT license. +All MurmurHash versions are public domain software, and the author disclaims all copyright to their code. + +----- \ No newline at end of file diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json index 382a9fdd5b750..4162e55119fbe 100644 --- a/cgmanifests/cgmanifest.json +++ b/cgmanifests/cgmanifest.json @@ -457,6 +457,16 @@ }, "comments": "Installed in the training docker image" } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "92cf3702fcfaadc84eb7bef59825a23e0cd84f56", + "repositoryUrl": "https://github.com/aappleby/smhasher" + }, + "comments": "MurmurHash3" + } } ], "Version": 1 diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index c93b3c4645182..489394acea548 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -84,9 +84,6 @@ option(onnxruntime_CROSS_COMPILING "Cross compiling onnx runtime" OFF) #It's preferred to turn it OFF when onnxruntime is dynamically linked to PROTOBUF option(onnxruntime_USE_FULL_PROTOBUF "Link to libprotobuf instead of libprotobuf-lite when this option is ON" OFF) -option(onnxruntime_DISABLE_CONTRIB_OPS "Disable contrib ops" OFF) -option(onnxruntime_DISABLE_ML_OPS "Disable traditional ML ops" OFF) -option(onnxruntime_DISABLE_RTTI "Disable RTTI" OFF) option(tensorflow_C_PACKAGE_PATH "Path to tensorflow C package installation dir") option(onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS "Enable operator implemented in language other than cpp" OFF) option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS "Dump debug information about node inputs and outputs when executing the model." OFF) @@ -109,11 +106,14 @@ option(onnxruntime_USE_TELEMETRY "Build with Telemetry" OFF) #It's default OFF because it's experimental now. option(onnxruntime_PREFER_SYSTEM_LIB "Experimental: Build with the preinstalled libraries in your system" OFF) -# ORT mininal build settings +# Options to reduce build size +option(onnxruntime_DISABLE_CONTRIB_OPS "Disable contrib ops" OFF) +option(onnxruntime_DISABLE_ML_OPS "Disable traditional ML ops" OFF) +option(onnxruntime_DISABLE_RTTI "Disable RTTI" OFF) +# For now onnxruntime_DISABLE_EXCEPTIONS will only work with onnxruntime_MINIMAL_BUILD, more changes (ONNX, non-CPU EP, ...) are required to run this standalone +option(onnxruntime_DISABLE_EXCEPTIONS "Disable exception handling. Requires onnxruntime_MINIMAL_BUILD currently." OFF) option(onnxruntime_MINIMAL_BUILD "Exclude as much as possible from the build. Support ORT format models. No support for ONNX format models." OFF) option(onnxruntime_REDUCED_OPS_BUILD "Reduced set of kernels are registered in build via modification of the kernel registration source files." 
OFF) -# For now onnxruntime_NO_EXCEPTIONS will only work with onnxruntime_MINIMAL_BUILD, more changes (ONNX, non-CPU EP, ...) are required to run this standalone -option(onnxruntime_NO_EXCEPTIONS "Disable exception handling." OFF) # training options option(onnxruntime_ENABLE_NVTX_PROFILE "Enable NVTX profile." OFF) @@ -214,15 +214,18 @@ endif() # Will expose option in build.py when all pieces are available if(onnxruntime_MINIMAL_BUILD) add_compile_definitions(ORT_MINIMAL_BUILD) - set(onnxruntime_REDUCED_OPS_BUILD ON) # TODO Defaulting to ON. TBD if we should always do that. + set(onnxruntime_REDUCED_OPS_BUILD ON) set(onnxruntime_DISABLE_RTTI ON) if (MSVC) - # add MSVC specific flags to reduce build size here + # add MSVC specific flags to reduce build size here if needed else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffunction-sections -fdata-sections") - # TODO: May need to use -dead_strip instead of --gc-sections for iOS(XCode) - add_link_options(-Wl,--gc-sections) + if (CMAKE_HOST_SYSTEM MATCHES "Darwin") + add_link_options(-Wl,-dead_strip) + else() + add_link_options(-Wl,--gc-sections) + endif() endif() endif() @@ -243,9 +246,9 @@ endif() # If this is only enabled in an onnxruntime_ORT_MODEL_FORMAT_ONLY build we don't need ONNX changes # as we (currently) only pull in data_type_utils.cc/h which doesn't throw -if(onnxruntime_NO_EXCEPTIONS) +if(onnxruntime_DISABLE_EXCEPTIONS) if(NOT onnxruntime_MINIMAL_BUILD) - message(FATAL_ERROR "onnxruntime_MINIMAL_BUILD required for onnxruntime_NO_EXCEPTIONS") + message(FATAL_ERROR "onnxruntime_MINIMAL_BUILD required for onnxruntime_DISABLE_EXCEPTIONS") endif() add_compile_definitions("ORT_NO_EXCEPTIONS") @@ -259,7 +262,7 @@ if(onnxruntime_NO_EXCEPTIONS) string(APPEND CMAKE_CXX_FLAGS " /wd4834 /wd4702") add_compile_definitions("_HAS_EXCEPTIONS=0") else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -fno-unwind-tables -fno-asynchronous-unwind-tables") endif() endif() @@ -319,13 +322,13 @@ if (MSVC) set(protobuf_MSVC_STATIC_RUNTIME OFF CACHE BOOL "Link protobuf to static runtime libraries" FORCE) set(gtest_force_shared_crt ON CACHE BOOL "Use shared (DLL) run-time lib for gtest" FORCE) endif() + #Always enable exception handling, even for Windows ARM - if(NOT onnxruntime_NO_EXCEPTIONS) + if(NOT onnxruntime_DISABLE_EXCEPTIONS) string(APPEND CMAKE_CXX_FLAGS " /EHsc /wd26812") string(APPEND CMAKE_C_FLAGS " /EHsc /wd26812") endif() - string(APPEND CMAKE_CXX_FLAGS " /EHsc /wd26812") - string(APPEND CMAKE_C_FLAGS " /EHsc /wd26812") + if(onnxruntime_USE_AVX) string(APPEND CMAKE_CXX_FLAGS " /arch:AVX") string(APPEND CMAKE_C_FLAGS " /arch:AVX") @@ -336,10 +339,12 @@ if (MSVC) string(APPEND CMAKE_CXX_FLAGS " /arch:AVX512") string(APPEND CMAKE_C_FLAGS " /arch:AVX512") endif() + if (onnxruntime_ENABLE_LTO AND NOT onnxruntime_USE_CUDA) SET (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Gw /GL") SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") endif() + # The WinML build tool chain builds ARM/ARM64, and the internal tool chain does not have folders for spectre mitigation libs. # WinML performs spectre mitigation differently. 
if (NOT DEFINED onnxruntime_DISABLE_QSPECTRE_CHECK) @@ -387,9 +392,9 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "iOSCross") #For ios compliance message("Adding flags for ios builds") if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -target arm64-apple-darwin-macho") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -target arm64-apple-darwin-macho") elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "arm") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -target armv7a-apple-darwin-macho") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -target armv7a-apple-darwin-macho") endif() endif() @@ -509,6 +514,7 @@ if(UNIX AND onnxruntime_ENABLE_LTO AND NOT onnxruntime_PREFER_SYSTEM_LIB) #https://github.com/protocolbuffers/protobuf/issues/5923 target_link_options(protoc PRIVATE "-Wl,--no-as-needed") endif() + include(protobuf_function.cmake) #protobuf end @@ -571,8 +577,8 @@ endif() # we make it through via a handler so CUDA does not complain # The following -DGSL macros are recognized by gsl-lite along with -Dgsl macros # no bounds checking in release build so no perf cost -# if we enable onnxruntime_NO_EXCEPTIONS, gsl will terminate -if (onnxruntime_NO_EXCEPTIONS) +# if we enable onnxruntime_DISABLE_EXCEPTIONS, gsl will terminate +if (onnxruntime_DISABLE_EXCEPTIONS) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DGSL_TERMINATE_ON_CONTRACT_VIOLATION") else() set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DGSL_THROW_ON_CONTRACT_VIOLATION") @@ -703,7 +709,13 @@ if(NOT onnxruntime_USE_FULL_PROTOBUF) else() set(ONNX_USE_LITE_PROTO OFF CACHE BOOL "" FORCE) endif() -add_subdirectory(external/onnx) + +if (NOT onnxruntime_MINIMAL_BUILD) + add_subdirectory(external/onnx) +else() + include(onnx_minimal) +endif() + target_compile_definitions(onnx PUBLIC $ PRIVATE "__ONNX_DISABLE_STATIC_REGISTRATION") if (NOT onnxruntime_USE_FULL_PROTOBUF) target_compile_definitions(onnx PUBLIC "__ONNX_NO_DOC_STRINGS") diff --git a/cmake/external/onnx_minimal.cmake b/cmake/external/onnx_minimal.cmake new file mode 100644 index 0000000000000..d99ec9a884a4b --- /dev/null +++ b/cmake/external/onnx_minimal.cmake @@ -0,0 +1,100 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +# +# Setup onnx and onnx_protobuf for a build with onnxruntime_MINIMAL_BUILD enabled. +# We exclude everything but the essentials from the onnx library. +# + +if(NOT onnxruntime_MINIMAL_BUILD) + message(FATAL_ERROR "This file should only be included in a minimal build") +endif() + +#TODO: if protobuf is a shared lib and onnxruntime_USE_FULL_PROTOBUF is ON, then onnx_proto should be built as a shared lib instead of a static lib. Otherwise any code outside onnxruntime.dll can't use onnx protobuf definitions if they share the protobuf.dll with onnxruntime. For example, if protobuf is a shared lib and onnx_proto is a static lib then onnxruntime_perf_test won't work. 
+ +set(ONNX_SOURCE_ROOT ${PROJECT_SOURCE_DIR}/external/onnx) + +add_library(onnx_proto ${ONNX_SOURCE_ROOT}/onnx/onnx-ml.proto ${ONNX_SOURCE_ROOT}/onnx/onnx-operators-ml.proto) + +target_include_directories(onnx_proto PUBLIC $ "${CMAKE_CURRENT_BINARY_DIR}") +target_compile_definitions(onnx_proto PUBLIC $) + +set(_src_prefix "onnx/") +onnxruntime_protobuf_generate(NO_SRC_INCLUDES GEN_SRC_PREFIX ${_src_prefix} IMPORT_DIRS ${ONNX_SOURCE_ROOT} TARGET onnx_proto) + +if (WIN32) + target_compile_options(onnx_proto PRIVATE "/wd4146" "/wd4125" "/wd4456" "/wd4267" "/wd4309") +else() + if(HAS_UNUSED_VARIABLE) + target_compile_options(onnx_proto PRIVATE "-Wno-unused-variable") + endif() + + if(HAS_UNUSED_BUT_SET_VARIABLE) + target_compile_options(onnx_proto PRIVATE "-Wno-unused-but-set-variable") + endif() +endif() + +# For reference, this would be the full ONNX source include. We only need data_type_utils.* in this build. +# file(GLOB_RECURSE onnx_src CONFIGURE_DEPENDS +# "${ONNX_SOURCE_ROOT}/onnx/*.h" +# "${ONNX_SOURCE_ROOT}/onnx/*.cc" +# ) +# file(GLOB_RECURSE onnx_exclude_src CONFIGURE_DEPENDS +# "${ONNX_SOURCE_ROOT}/onnx/py_utils.h" +# "${ONNX_SOURCE_ROOT}/onnx/proto_utils.h" +# "${ONNX_SOURCE_ROOT}/onnx/backend/test/cpp/*" +# "${ONNX_SOURCE_ROOT}/onnx/test/*" +# "${ONNX_SOURCE_ROOT}/onnx/cpp2py_export.cc" +# ) +# list(REMOVE_ITEM onnx_src ${onnx_exclude_src}) +file(GLOB onnx_src CONFIGURE_DEPENDS +"${ONNX_SOURCE_ROOT}/onnx/defs/data_type_utils.*" +) + +if (MSVC) + SET (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Gw /GL") + SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") +endif() + +add_library(onnx ${onnx_src}) +add_dependencies(onnx onnx_proto) +target_include_directories(onnx PUBLIC "${ONNX_SOURCE_ROOT}") +target_include_directories(onnx PUBLIC $) +if (onnxruntime_USE_FULL_PROTOBUF) + target_compile_definitions(onnx PUBLIC "ONNX_ML" "ONNX_NAMESPACE=onnx") +else() + target_compile_definitions(onnx PUBLIC "ONNX_ML" "ONNX_NAMESPACE=onnx" "ONNX_USE_LITE_PROTO") +endif() + +if (WIN32) + target_compile_options(onnx PRIVATE + /wd4800 # 'type' : forcing value to bool 'true' or 'false' (performance warning) + /wd4125 # decimal digit terminates octal escape sequence + /wd4100 # 'param' : unreferenced formal parameter + /wd4244 # 'argument' conversion from 'google::protobuf::int64' to 'int', possible loss of data + /wd4996 # 'argument' Using double parameter version instead of single parameter version of SetTotalBytesLimit(). The second parameter is ignored. 
+ ) + if (NOT onnxruntime_DISABLE_EXCEPTIONS) + target_compile_options(onnx PRIVATE + /EHsc # exception handling - C++ may throw, extern "C" will not + ) + endif() + + target_compile_options(onnx_proto PRIVATE + /wd4244 # 'argument' conversion from 'google::protobuf::int64' to 'int', possible loss of data + ) + + set(onnx_static_library_flags + -IGNORE:4221 # LNK4221: This object file does not define any previously undefined public symbols, so it will not be used by any link operation that consumes this library + ) + set_target_properties(onnx PROPERTIES + STATIC_LIBRARY_FLAGS "${onnx_static_library_flags}") +else() + if(HAS_UNUSED_PARAMETER) + target_compile_options(onnx PRIVATE "-Wno-unused-parameter") + endif() + if(HAS_UNUSED_BUT_SET_VARIABLE) + target_compile_options(onnx PRIVATE "-Wno-unused-but-set-variable") + endif() +endif() + diff --git a/cmake/onnxruntime_graph.cmake b/cmake/onnxruntime_graph.cmake index efa613788a2ae..9cbb7c59287fa 100644 --- a/cmake/onnxruntime_graph.cmake +++ b/cmake/onnxruntime_graph.cmake @@ -105,17 +105,19 @@ if (onnxruntime_ENABLE_TRAINING) endif() if (WIN32) - set(onnxruntime_graph_static_library_flags - -IGNORE:4221 # LNK4221: This object file does not define any previously undefined public symbols, so it will not be used by any link operation that consumes this library - ) + set(onnxruntime_graph_static_library_flags + -IGNORE:4221 # LNK4221: This object file does not define any previously undefined public symbols, so it will not be used by any link operation that consumes this library + ) - set_target_properties(onnxruntime_graph PROPERTIES - STATIC_LIBRARY_FLAGS "${onnxruntime_graph_static_library_flags}") + set_target_properties(onnxruntime_graph PROPERTIES + STATIC_LIBRARY_FLAGS "${onnxruntime_graph_static_library_flags}") + if (NOT onnxruntime_DISABLE_EXCEPTIONS) target_compile_options(onnxruntime_graph PRIVATE /EHsc # exception handling - C++ may throw, extern "C" will not ) + endif() - # Add Code Analysis properties to enable C++ Core checks. Have to do it via a props file include. - set_target_properties(onnxruntime_graph PROPERTIES VS_USER_PROPS ${PROJECT_SOURCE_DIR}/EnableVisualStudioCodeAnalysis.props) + # Add Code Analysis properties to enable C++ Core checks. Have to do it via a props file include. 
+ set_target_properties(onnxruntime_graph PROPERTIES VS_USER_PROPS ${PROJECT_SOURCE_DIR}/EnableVisualStudioCodeAnalysis.props) endif() diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 923f1c1ab3955..23f5f576b2de0 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -130,8 +130,11 @@ else() # minimal and/or reduced ops build ) if (onnxruntime_MINIMAL_BUILD) - # TODO: Add tests that can be used in a minimal build - else() + list(APPEND onnxruntime_test_framework_src_patterns + "${TEST_SRC_DIR}/framework/ort_model_only_test.cc" + ) + + else() # reduced ops build file(GLOB onnxruntime_test_ir_src CONFIGURE_DEPENDS "${TEST_SRC_DIR}/ir/*.cc" "${TEST_SRC_DIR}/ir/*.h" @@ -261,12 +264,16 @@ set (ONNXRUNTIME_API_TESTS_WITHOUT_ENV_SRC_DIR "${ONNXRUNTIME_ROOT}/test/api_tes set (onnxruntime_shared_lib_test_SRC ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_fixture.h - ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_inference.cc ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_session_options.cc ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_run_options.cc ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_allocator.cc ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_nontensor_types.cc ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_model_loading.cc) + +if (NOT onnxruntime_MINIMAL_BUILD) + list(APPEND onnxruntime_shared_lib_test_SRC ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_inference.cc) +endif() + if(onnxruntime_RUN_ONNX_TESTS) list(APPEND onnxruntime_shared_lib_test_SRC ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_io_types.cc) endif() @@ -837,7 +844,7 @@ if (onnxruntime_BUILD_SHARED_LIB) ) # test inference using global threadpools - if (NOT CMAKE_SYSTEM_NAME STREQUAL "Android") + if (NOT CMAKE_SYSTEM_NAME STREQUAL "Android" AND NOT onnxruntime_MINIMAL_BUILD) AddTest(DYN TARGET onnxruntime_global_thread_pools_test SOURCES ${onnxruntime_global_thread_pools_test_SRC} diff --git a/cmake/protobuf_function.cmake b/cmake/protobuf_function.cmake index d3c0b6e688c66..3cca9577efd65 100644 --- a/cmake/protobuf_function.cmake +++ b/cmake/protobuf_function.cmake @@ -42,8 +42,8 @@ function(onnxruntime_protobuf_generate) set(PROTOC_EXECUTABLE $) set(PROTOC_DEPS protobuf::protoc) endif() - set(_options APPEND_PATH) - set(_singleargs LANGUAGE OUT_VAR EXPORT_MACRO) + set(_options APPEND_PATH NO_SRC_INCLUDES) + set(_singleargs LANGUAGE OUT_VAR EXPORT_MACRO GEN_SRC_PREFIX) if(COMMAND target_sources) list(APPEND _singleargs TARGET) endif() @@ -95,18 +95,20 @@ function(onnxruntime_protobuf_generate) return() endif() - if(onnxruntime_protobuf_generate_APPEND_PATH) - # Create an include path for each file specified - foreach(_file ${onnxruntime_protobuf_generate_PROTOS}) - get_filename_component(_abs_file ${_file} ABSOLUTE) - get_filename_component(_abs_path ${_abs_file} PATH) - list(FIND _protobuf_include_path ${_abs_path} _contains_already) - if(${_contains_already} EQUAL -1) - list(APPEND _protobuf_include_path -I ${_abs_path}) - endif() - endforeach() - else() - set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + if (NOT onnxruntime_protobuf_generate_NO_SRC_INCLUDES) + if(onnxruntime_protobuf_generate_APPEND_PATH) + # Create an include path for each file specified + foreach(_file ${onnxruntime_protobuf_generate_PROTOS}) + get_filename_component(_abs_file ${_file} ABSOLUTE) + get_filename_component(_abs_path ${_abs_file} PATH) + list(FIND _protobuf_include_path ${_abs_path} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I 
${_abs_path})
+      endif()
+    endforeach()
+  else()
+    set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
+  endif()
 endif()

 foreach(DIR ${onnxruntime_protobuf_generate_IMPORT_DIRS})
@@ -118,13 +120,20 @@ function(onnxruntime_protobuf_generate)
   endforeach()

   set(_generated_srcs_all)
+
+  if (onnxruntime_protobuf_generate_GEN_SRC_PREFIX)
+    set(_src_prefix "${onnxruntime_protobuf_generate_GEN_SRC_PREFIX}")
+  else()
+    set(_src_prefix "")
+  endif()
+
   foreach(_proto ${onnxruntime_protobuf_generate_PROTOS})
     get_filename_component(_abs_file ${_proto} ABSOLUTE)
     get_filename_component(_basename ${_proto} NAME_WE)

     set(_generated_srcs)
     foreach(_ext ${onnxruntime_protobuf_generate_EXTENSIONS})
-      list(APPEND _generated_srcs "${CMAKE_CURRENT_BINARY_DIR}/${_basename}${_ext}")
+      list(APPEND _generated_srcs "${CMAKE_CURRENT_BINARY_DIR}/${_src_prefix}${_basename}${_ext}")
     endforeach()

     list(APPEND _generated_srcs_all ${_generated_srcs})
diff --git a/docs/ONNX_Runtime_for_Mobile_Platforms.md b/docs/ONNX_Runtime_for_Mobile_Platforms.md
new file mode 100644
index 0000000000000..abd58ea49cba2
--- /dev/null
+++ b/docs/ONNX_Runtime_for_Mobile_Platforms.md
@@ -0,0 +1,135 @@
+# ONNX Runtime for Mobile Platforms
+
+## Overview
+
+ONNX Runtime now supports an internal model format to minimize the build size for usage in mobile and embedded scenarios. An ONNX model can be converted to an internal ONNX Runtime format ('ORT format model') using the instructions below.
+
+The minimal build can be used with any ORT format model, provided that the kernels for the operators used in the model were included in the build.
+  i.e. the custom build provides a set of kernels, and if that set satisfies a given ORT format model's needs, the model can be loaded and executed.
+
+## Steps to create model and minimal build
+
+You will need a script from the ONNX Runtime repository, and to perform a custom build, so you will need to clone the repository locally. See [here](https://github.com/microsoft/onnxruntime/blob/master/BUILD.md#prerequisites) for initial steps.
+
+Perform the following steps to create a minimal build of ONNX Runtime that is model specific.
+
+### 1. Create ORT format model
+
+We will use a helper Python script to convert an existing ONNX format model into an ORT format model.
+This requires the ORT Python package to be installed, and the ONNX Runtime repository to have been cloned.
+The directory the ONNX Runtime repository is cloned into is referred to as `` in this documentation.
+A single model is converted at a time by this script (see the batch conversion sketch at the end of this section).
+
+  - Install the ONNX Runtime nightly python package from https://test.pypi.org/project/ort-nightly/
+    - e.g. `pip install -i https://test.pypi.org/simple/ ort-nightly`
+  - Convert the ONNX model to ORT format
+    - `python /tools/python/convert_onnx_model_to_ort.py `
+    - This script will first optimize the ONNX model and save it with a '.optimized.onnx' file extension
+      - *IMPORTANT* this optimized ONNX model should be used as the input to the minimal build. Do NOT use the original ONNX model for that step.
+    - It will next convert the optimized ONNX model to ORT format and save the file using '.ort' as the file extension.
+
+Example:
+
+Running `python /tools/python/convert_onnx_model_to_ort.py /models/ssd_mobilenet.onnx`
+  - Will create `/models/ssd_mobilenet.optimized.onnx`, which is an ONNX format model that ONNX Runtime has optimized
+    - e.g. constant folding will have run
+  - Will use `/models/ssd_mobilenet.optimized.onnx` to create `/models/ssd_mobilenet.ort`
+    - ssd_mobilenet.ort is the ORT format version of the optimized model.
+
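+Since the script converts a single model per invocation, converting a directory of models needs a small wrapper. A minimal sketch of such a wrapper (illustrative only, not a script in the repository; the repository and model paths are assumptions):
+
+```python
+import pathlib
+import subprocess
+import sys
+
+# Assumed locations - adjust to your clone of the repository and your model directory.
+ORT_REPO = pathlib.Path("~/onnxruntime").expanduser()
+MODEL_DIR = pathlib.Path("~/models").expanduser()
+
+converter = ORT_REPO / "tools" / "python" / "convert_onnx_model_to_ort.py"
+
+for model in MODEL_DIR.rglob("*.onnx"):
+    # Skip output from previous runs so already optimized models aren't converted again.
+    if model.name.endswith(".optimized.onnx"):
+        continue
+    subprocess.run([sys.executable, str(converter), str(model)], check=True)
+```
+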
+### 2. Setup information to reduce build to minimum set of operator kernels required
+
+In order to reduce the operator kernels included in the build, the required set must be either inferred from one or more ONNX models, or explicitly specified via configuration.
+
+To infer, put one or more optimized ONNX models in a directory. The directory will be recursively searched for '.onnx' files (a sketch of what this scan gathers is at the end of this section).
+If taking this approach (vs creating a configuration file), you should only include the optimized ONNX models and not both the original and optimized models, as there may be kernels that were required in the original model that are not required in the optimized model.
+
+Alternatively, a configuration file can be created to specify the set of kernels to include.
+
+See the documentation on the [Reduced Operator Kernel build](Reduced_Operator_Kernel_build.md) for more information.
+
+This step can be run prior to building, or as part of the minimal build.
+
+#### Example usage:
+
+##### Pre-build
+
+Place the optimized ONNX model/s (files with '.optimized.onnx' from the 'Create ORT format model' step above) in a directory.
+
+Run the script to exclude unused kernels, pointing it at this directory.
+
+`python /tools/ci_build/exclude_unused_ops.py --model_path `
+
+##### When building
+
+When building as per the instructions below, add `--include_ops_by_model ` to the build command.
+
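+To get a feel for what this scan aggregates, the required operator set can be approximated with the `onnx` Python package. A rough sketch (illustrative only, not the logic in exclude_unused_ops.py; the model directory is an assumption, and subgraphs inside control flow nodes are ignored for brevity):
+
+```python
+import collections
+import pathlib
+
+import onnx
+
+model_dir = pathlib.Path("models")  # assumed directory of optimized ONNX models
+required = collections.defaultdict(set)  # (domain, opset) -> set of op types
+
+for path in model_dir.rglob("*.onnx"):
+    model = onnx.load(str(path))
+    # Map each domain to the opset version the model imports. '' is the default ONNX domain.
+    opsets = {entry.domain: entry.version for entry in model.opset_import}
+    for node in model.graph.node:
+        domain = node.domain or "ai.onnx"
+        required[(domain, opsets.get(node.domain, 1))].add(node.op_type)
+
+for (domain, opset), ops in sorted(required.items()):
+    print(f"{domain};{opset};{','.join(sorted(ops))}")
+```
+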
+### 3. Create the minimal build
+
+You will need to build ONNX Runtime from source to reduce the included operator kernels and other aspects of the binary.
+
+See [here](https://github.com/microsoft/onnxruntime/blob/master/BUILD.md#start-baseline-cpu) for build instructions.
+
+Binary size reduction options:
+  - Enable minimal build (`--minimal_build`)
+    - A minimal build will ONLY support loading and executing ORT format models. RTTI is disabled by default in this build.
+
+  - Disable exceptions (`--disable_exceptions`)
+    - Disables exceptions in the build. Any locations that would have thrown an exception will instead log the error message and call abort().
+    - Requires `--minimal_build`
+    - Is not a valid option if you need the python bindings (`--build_wheel`) as python/pybind cannot be built with exceptions disabled.
+    - Exceptions are only used in ORT for exceptional cases. If you have validated the input to be used, and validated that the model can be loaded, it is unlikely that ORT would throw an exception unless there's a system level issue (e.g. out of memory).
+
+  - ML op support (`--disable_ml_ops`)
+    - Whilst the operator kernel reduction script will disable all unused ML operator kernels, additional savings can be achieved by removing support for ML specific types. If you know your model has no ML ops, or no ML ops that use the Map type, this flag can be provided.
+    - See the specs for the [ONNX ML Operators](https://github.com/onnx/onnx/blob/master/docs/Operators-ml.md) if unsure.
+
+#### Example build commands
+
+##### Windows
+
+`\build.bat --config=MinSizeRel --cmake_generator="Visual Studio 16 2019" --build_shared_lib --minimal_build --disable_ml_ops --disable_exceptions`
+
+##### Linux
+
+`/build.sh --config=MinSizeRel --build_shared_lib --minimal_build --disable_ml_ops --disable_exceptions`
+
+## Executing ORT format models
+
+The API for executing ORT format models is the same as for ONNX models. See the [ORT API documentation](https://github.com/Microsoft/onnxruntime/#api-documentation).
+
+If you provide a filename for the ORT format model, a file extension of '.ort' will cause it to be inferred to be an ORT format model.
+If you provide in-memory bytes for the ORT format model, a marker in those bytes will be checked to infer if it's an ORT format model.
+
+If you wish to explicitly say that the InferenceSession input is an ORT format model, you can do so via SessionOptions.
+
+C++ API
+```C++
+Ort::SessionOptions session_options;
+session_options.AddConfigEntry("session.load_model_format", "ORT");
+```
+
+Python
+```python
+so = onnxruntime.SessionOptions()
+so.add_session_config_entry('session.load_model_format', 'ORT')
+session = onnxruntime.InferenceSession(, so)
+```
+
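+For illustration, a complete minimal Python sketch combining the above (the model path, input name, and input shape are assumptions; substitute your own):
+
+```python
+import numpy as np
+import onnxruntime
+
+so = onnxruntime.SessionOptions()
+so.add_session_config_entry('session.load_model_format', 'ORT')
+
+# Assumed model and input details for illustration.
+session = onnxruntime.InferenceSession('ssd_mobilenet.ort', so)
+input_name = session.get_inputs()[0].name
+data = np.zeros((1, 3, 224, 224), dtype=np.float32)
+
+outputs = session.run(None, {input_name: data})
+print([output.shape for output in outputs])
+```
+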
+## Limitations
+
+A minimal build has the following limitations:
+  - No support for ONNX format models
+    - Model must be converted to ORT format
+  - No support for runtime optimizations
+    - Optimizations should be performed prior to conversion to ORT format
+  - No support for runtime partitioning
+    - Execution providers that will be used at runtime must be enabled when creating the ORT format model
+  - Only supports execution providers that have statically registered kernels
+    - e.g. ORT CPU and CUDA execution providers
+    - Execution providers that dynamically compile nodes in the graph into custom kernels at runtime are not supported
+  - No support for custom operators
+
diff --git a/docs/ReduceBinarySize.md b/docs/ReduceBinarySize.md
deleted file mode 100644
index 341641a70c71f..0000000000000
--- a/docs/ReduceBinarySize.md
+++ /dev/null
@@ -1,27 +0,0 @@
-# Reduce binary size
-To reduce compiled binary size, two options are available:
-
-- --include_ops_by_model=
-- --include_ops_by_file=
-
-The options empower building to comment out operators listed in execution provider(s), thereby downsizing the output.
-Note that it is a MUST to build with --skip_tests in case excluded ops cause test failures.
-
-## include_ops_by_model
-The argument enables the compile binary of including only operators consumed by models in the specified directory.
-
-## include_ops_by_file
-The argument enables the compiled binary of including only operators referred. The file has format like:
-```
-#domain;opset;op1,op2...
-ai.onnx;1;MemcpyToHost,MemcpyFromHost
-ai.onnx;11;Gemm
-```
-
-## Usage tips
-- By default, the trimming happens only on cpu execution provider, with --use_cuda it will also be applied to cuda;
-- If both are specified, operators referred from either argument will be kept active;
-- The script is located under toos/ci_build/, and could go solo to apply to cpu and cuda providers as:
-```
-python exclude_unused_ops.py --model_path d:\ReduceSize\models --file_path d:\ReduceSize\ops.txt --ort_root d:\onnxruntime
-```
\ No newline at end of file
diff --git a/docs/Reduced_Operator_Kernel_build.md b/docs/Reduced_Operator_Kernel_build.md
new file mode 100644
index 0000000000000..fe1fc7a90e08d
--- /dev/null
+++ b/docs/Reduced_Operator_Kernel_build.md
@@ -0,0 +1,71 @@
+# ONNX Runtime Reduced Operator Kernel build
+
+In order to reduce the compiled binary size of ONNX Runtime (ORT), the operator kernels included in the build can be reduced to just the kernels required for your scenario.
+
+The kernels to include must first be identified, and secondly the ORT kernel registration source files must be updated to exclude the unused kernels.
+
+Finally, ORT must be built from source.
+
+When building ORT with a reduced set of kernel registrations, `--skip_tests` *MUST* be specified as the kernel reduction will render many of the unit tests invalid.
+
+## Selecting Required Kernels
+
+Two options are available for selecting the required operator kernels. These options can be combined.
+
+### Selection via ONNX models
+
+Put the ONNX model/s you wish to be able to execute with a reduced version of ORT in a directory. The selection script will recursively look for all '.onnx' models in this directory, and aggregate information on the kernels required.
+
+### Selection via configuration file
+
+A configuration file can also be used to specify the required kernels.
+The format is `;;[,op]...` (see the parsing sketch at the end of this document).
+
+The opset should match the opset import for each model. It does not need to match the initial ONNX opset that the operator was available in.
+e.g. if a model imports opset 12 of ONNX, all ONNX operators in that model should be listed under opset 12 for the 'ai.onnx' domain.
+
+Example config that could be used for a scenario with 2 simplistic models: one targeting ONNX opset 10 with an Add and Concat node, the other targeting ONNX opset 12 with an Add and Split node.
+
+```
+#domain;opset;op1,op2...
+ai.onnx;10;Add,Concat
+ai.onnx;12;Add,Split
+```
+
+## Reducing Build to Required Kernels
+
+There are two ways to reduce the kernels included in the build to the required ones.
+  - via build script arguments when building ORT
+    - the exclusion script will be run as part of the build process
+  - via directly running the exclusion script prior to building ORT
+
+NOTE: The exclusion script will only disable kernel registrations each time it runs. It will NOT re-enable previously disabled kernels. If you wish to change the list of kernels to include, it is best to revert the repository to a clean state (`git reset --hard`) before running either the ORT build script or the exclusion script each time.
+
+### Build time reduction
+
+When running the ORT build script there are two arguments that can be used. These may be combined.
+
+  - `--include_ops_by_model=`
+  - `--include_ops_by_config=`
+
+`--skip_tests` MUST also be specified.
+
+See the ORT [build instructions](https://github.com/microsoft/onnxruntime/blob/master/BUILD.md#build-instructions) for more details.
+
+Most likely the `--config` value should be Release or MinSizeRel.
+
+### Pre-build reduction
+
+The script to reduce the kernel registrations can be found in `/tools/ci_build/exclude_unused_ops.py`.
+
+It can be run directly with the following arguments:
+`--model_path` is a path to a directory containing one or more ONNX models. The directory is recursively searched.
+`--config_path` is a path to a configuration file for the required operators.
+`--ort_root` is the path to the ORT repository root that the kernel registration exclusions should be done in. If not provided it will default to the repository containing the exclude_unused_ops.py script.
+
+```
+python exclude_unused_ops.py --model_path d:\ReduceSize\models --config_path d:\ReduceSize\ops_config.txt --ort_root d:\onnxruntime
+```
+
+After running the script, build ORT as per the build instructions. Remember to specify `--skip_tests`.
+
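+For clarity on how a line in the configuration file is interpreted, a rough sketch of a parser for the `domain;opset;ops` format (illustrative only, not the parsing code in exclude_unused_ops.py; the config filename is an assumption):
+
+```python
+def parse_config_line(line):
+    """Parse 'domain;opset;op1,op2...' into (domain, opset, {ops})."""
+    line = line.strip()
+    if not line or line.startswith('#'):  # skip blank lines and comments
+        return None
+    domain, opset, ops = line.split(';')
+    return domain, int(opset), {op.strip() for op in ops.split(',')}
+
+with open('ops_config.txt') as config:
+    for entry in filter(None, map(parse_config_line, config)):
+        print(entry)
+```
+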
+ diff --git a/include/onnxruntime/core/common/common.h b/include/onnxruntime/core/common/common.h index cd376a97993d9..96924eb847abb 100644 --- a/include/onnxruntime/core/common/common.h +++ b/include/onnxruntime/core/common/common.h @@ -149,10 +149,10 @@ void LogRuntimeError(uint32_t session_id, const common::Status& status, const ch } \ } while (false) -#define ORT_THROW_EX(ex, ...) \ - do { \ - std::cerr << #ex << ::onnxruntime::MakeString(__VA_ARGS__) << ")" << std::endl; \ - abort(); \ +#define ORT_THROW_EX(ex, ...) \ + do { \ + std::cerr << #ex << "(" << ::onnxruntime::MakeString(__VA_ARGS__) << ")" << std::endl; \ + abort(); \ } while (false) #else @@ -182,7 +182,7 @@ void LogRuntimeError(uint32_t session_id, const common::Status& status, const ch ::onnxruntime::MakeString(__VA_ARGS__)) #define ORT_THROW_EX(ex, ...) \ - throw ex(##__VA_ARGS__) + throw ex(__VA_ARGS__) #endif @@ -315,11 +315,4 @@ inline std::wstring ToWideString(const std::wstring& s) { return s; } inline std::string ToWideString(const std::string& s) { return s; } #endif -// from http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n3876.pdf -template -inline void HashCombine(std::uint64_t& seed, const T& v) { - std::hash hasher; - seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); -} - } // namespace onnxruntime diff --git a/include/onnxruntime/core/framework/kernel_def_builder.h b/include/onnxruntime/core/framework/kernel_def_builder.h index 44f2c59a6c536..4ce1d01af28e9 100644 --- a/include/onnxruntime/core/framework/kernel_def_builder.h +++ b/include/onnxruntime/core/framework/kernel_def_builder.h @@ -52,7 +52,7 @@ class KernelDef { return provider_type_; } - const std::unordered_map>& TypeConstraints() const { + const std::map>& TypeConstraints() const { return type_constraints_; } @@ -98,31 +98,8 @@ class KernelDef { private: friend class KernelDefBuilder; - // call once the KernelDef has been built - void CalculateHash() { - // use name, start/end, domain, provider and the type constraints. - // we wouldn't have two kernels that only differed by the inplace or alias info or memory types. - // currently nothing sets exec_queue_id either (and would assumably be a runtime thing and not part of the base - // kernel definition) - hash_ = 0; // reset in case this is called multiple times - HashCombine(hash_, op_name_); - HashCombine(hash_, op_since_version_start_); - // If we include op_since_version_end_ the hash of an existing op changes when it's superseded. - // e.g. Unsqueeze 11 had no end version until Unsqueeze 13, at which point the existing op is changed to have - // an end version of 12. That would result in a new ORT build having a different hash for Unsqueeze 11 and a - // previously serialized ORT format model wouldn't find the kernel. In order to select the kernel to include - // in the ORT model the full OpSchema info is used, so it's safe to exclude op_since_version_end_ from the hash. - // HashCombine(hash_, op_since_version_end_); - HashCombine(hash_, op_domain_); - HashCombine(hash_, provider_type_); - for (const auto& key_value : type_constraints_) { - HashCombine(hash_, key_value.first); - for (const auto& data_type : key_value.second) { - // need to construct a std::string so it doesn't hash the address of a const char* - HashCombine(hash_, std::string(DataTypeImpl::ToString(data_type))); - } - } - } + // called once by KernelDefBuilder::Build + void CalculateHash(); // The operator name supported by <*this> kernel.. 
std::string op_name_; @@ -143,7 +120,8 @@ class KernelDef { // The supported data types for inputs/outputs. // Key is input/output name defined in op schema, Value are supported types. - std::unordered_map> type_constraints_; + // note: std::map as we need the order to be deterministic for the hash + std::map> type_constraints_; // An element means that output j reuses the memory of input i. std::vector> inplace_map_; diff --git a/onnxruntime/core/framework/kernel_def_builder.cc b/onnxruntime/core/framework/kernel_def_builder.cc index 6496b94ac5793..8e2662388b72e 100644 --- a/onnxruntime/core/framework/kernel_def_builder.cc +++ b/onnxruntime/core/framework/kernel_def_builder.cc @@ -2,6 +2,9 @@ // Licensed under the MIT License. #include "core/framework/kernel_def_builder.h" +#include "core/framework/murmurhash3.h" +#include "gsl/gsl" + #include #include @@ -22,8 +25,44 @@ inline bool AreVectorsOverlap(const std::vector& v1, const std::vector& v2 } return false; } + } // namespace +void KernelDef::CalculateHash() { + uint32_t hash[4] = {0, 0, 0, 0}; + + auto hash_int = [&hash](int i) { MurmurHash3::x86_128(&i, sizeof(i), hash[0], &hash); }; + auto hash_str = [&hash](const std::string& str) { + MurmurHash3::x86_128(str.data(), gsl::narrow_cast(str.size()), hash[0], &hash); + }; + + // use name, start/end, domain, provider and the type constraints. + // we wouldn't have two kernels that only differed by the inplace or alias info or memory types. + // currently nothing sets exec_queue_id either (and would assumably be a runtime thing and not part of the base + // kernel definition) + + hash_str(op_name_); + hash_int(op_since_version_start_); + + // If we include op_since_version_end_ the hash of an existing op changes when it's superseded. + // e.g. Unsqueeze 11 had no end version until Unsqueeze 13, at which point the existing op is changed to have + // an end version of 12. That would result in a new ORT build having a different hash for Unsqueeze 11 and a + // previously serialized ORT format model wouldn't find the kernel. In order to select the kernel to include + // in the ORT model the full OpSchema info is used, so it's safe to exclude op_since_version_end_ from the hash. + + hash_str(op_domain_); + hash_str(provider_type_); + for (const auto& key_value : type_constraints_) { + hash_str(key_value.first); + for (const auto& data_type : key_value.second) { + hash_str(std::string(DataTypeImpl::ToString(data_type))); + } + } + + hash_ = hash[0] & 0xfffffff8; // save low 3 bits for hash version info in case we need it in the future + hash_ |= uint64_t(hash[1]) << 32; +} + // TODO: Tell user why it has conflicts // TODO: Investigate why IsConflict() was not triggered when there were duplicate Tile CUDA // kernels registered. Removing `InputMemoryType(1)` in the kernel definition diff --git a/onnxruntime/core/framework/murmurhash3.cc b/onnxruntime/core/framework/murmurhash3.cc new file mode 100644 index 0000000000000..2b3ff6510ea04 --- /dev/null +++ b/onnxruntime/core/framework/murmurhash3.cc @@ -0,0 +1,311 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/framework/murmurhash3.h" + +// Original source: https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. 
+ +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. + +/* Modifications Copyright (c) Microsoft. */ + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x, y) _rotl(x, y) +#define ROTL64(x, y) _rotl64(x, y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#define FORCE_INLINE inline __attribute__((always_inline)) + +inline uint32_t rotl32(uint32_t x, int8_t r) { + return (x << r) | (x >> (32 - r)); +} + +inline uint64_t rotl64(uint64_t x, int8_t r) { + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x, y) rotl32(x, y) +#define ROTL64(x, y) rotl64(x, y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +FORCE_INLINE uint32_t getblock32(const uint32_t* p, int i) { + return p[i]; +} + +FORCE_INLINE uint64_t getblock64(const uint64_t* p, int i) { + return p[i]; +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +FORCE_INLINE uint32_t fmix32(uint32_t h) { + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +FORCE_INLINE uint64_t fmix64(uint64_t k) { + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +namespace onnxruntime { +void MurmurHash3::x86_32(const void* key, int len, + uint32_t seed, void* out) { + const uint8_t* data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t* blocks = (const uint32_t*)(data + nblocks * 4); + + for (int i = -nblocks; i; i++) { + uint32_t k1 = getblock32(blocks, i); + + k1 *= c1; + k1 = ROTL32(k1, 15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1, 13); + h1 = h1 * 5 + 0xe6546b64; + } + + //---------- + // tail + + const uint8_t* tail = (const uint8_t*)(data + nblocks * 4); + + uint32_t k1 = 0; + + switch (len & 3) { + case 3: + k1 ^= tail[2] << 16; // Fallthrough. + case 2: + k1 ^= tail[1] << 8; // Fallthrough. 
+ case 1: + k1 ^= tail[0]; + k1 *= c1; + k1 = ROTL32(k1, 15); + k1 *= c2; + h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix32(h1); + + *(uint32_t*)out = h1; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3::x86_128(const void* key, int len, uint32_t seed, void* out) { + const uint8_t* data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + const uint32_t c1 = 0x239b961b; + const uint32_t c2 = 0xab0e9789; + const uint32_t c3 = 0x38b34ae5; + const uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t* blocks = (const uint32_t*)(data + nblocks * 16); + + for (int i = -nblocks; i; i++) { + uint32_t k1 = getblock32(blocks, i * 4 + 0); + uint32_t k2 = getblock32(blocks, i * 4 + 1); + uint32_t k3 = getblock32(blocks, i * 4 + 2); + uint32_t k4 = getblock32(blocks, i * 4 + 3); + + k1 *= c1; + k1 = ROTL32(k1, 15); + k1 *= c2; + h1 ^= k1; + + h1 = ROTL32(h1, 19); + h1 += h2; + h1 = h1 * 5 + 0x561ccd1b; + + k2 *= c2; + k2 = ROTL32(k2, 16); + k2 *= c3; + h2 ^= k2; + + h2 = ROTL32(h2, 17); + h2 += h3; + h2 = h2 * 5 + 0x0bcaa747; + + k3 *= c3; + k3 = ROTL32(k3, 17); + k3 *= c4; + h3 ^= k3; + + h3 = ROTL32(h3, 15); + h3 += h4; + h3 = h3 * 5 + 0x96cd1c35; + + k4 *= c4; + k4 = ROTL32(k4, 18); + k4 *= c1; + h4 ^= k4; + + h4 = ROTL32(h4, 13); + h4 += h1; + h4 = h4 * 5 + 0x32ac3b17; + } + + //---------- + // tail + + const uint8_t* tail = (const uint8_t*)(data + nblocks * 16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch (len & 15) { + case 15: + k4 ^= tail[14] << 16; // Fallthrough. + case 14: + k4 ^= tail[13] << 8; // Fallthrough. + case 13: + k4 ^= tail[12] << 0; + k4 *= c4; + k4 = ROTL32(k4, 18); + k4 *= c1; + h4 ^= k4; // Fallthrough. + + case 12: + k3 ^= tail[11] << 24; // Fallthrough. + case 11: + k3 ^= tail[10] << 16; // Fallthrough. + case 10: + k3 ^= tail[9] << 8; // Fallthrough. + case 9: + k3 ^= tail[8] << 0; + k3 *= c3; + k3 = ROTL32(k3, 17); + k3 *= c4; + h3 ^= k3; // Fallthrough. + + case 8: + k2 ^= tail[7] << 24; // Fallthrough. + case 7: + k2 ^= tail[6] << 16; // Fallthrough. + case 6: + k2 ^= tail[5] << 8; // Fallthrough. + case 5: + k2 ^= tail[4] << 0; + k2 *= c2; + k2 = ROTL32(k2, 16); + k2 *= c3; + h2 ^= k2; // Fallthrough. + + case 4: + k1 ^= tail[3] << 24; // Fallthrough. + case 3: + k1 ^= tail[2] << 16; // Fallthrough. + case 2: + k1 ^= tail[1] << 8; // Fallthrough. + case 1: + k1 ^= tail[0] << 0; + k1 *= c1; + k1 = ROTL32(k1, 15); + k1 *= c2; + h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + h2 ^= len; + h3 ^= len; + h4 ^= len; + + h1 += h2; + h1 += h3; + h1 += h4; + h2 += h1; + h3 += h1; + h4 += h1; + + h1 = fmix32(h1); + h2 = fmix32(h2); + h3 = fmix32(h3); + h4 = fmix32(h4); + + h1 += h2; + h1 += h3; + h1 += h4; + h2 += h1; + h3 += h1; + h4 += h1; + + ((uint32_t*)out)[0] = h1; + ((uint32_t*)out)[1] = h2; + ((uint32_t*)out)[2] = h3; + ((uint32_t*)out)[3] = h4; +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/framework/murmurhash3.h b/onnxruntime/core/framework/murmurhash3.h new file mode 100644 index 0000000000000..ab86a3e591adf --- /dev/null +++ b/onnxruntime/core/framework/murmurhash3.h @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include + +namespace onnxruntime { +struct MurmurHash3 { + // generate 32-bit hash from input and write to 'out' + static void x86_32(const void* key, int len, uint32_t seed, void* out); + + // generate 128-bit hash from input and write to 'out'. + static void x86_128(const void* key, int len, uint32_t seed, void* out); +}; +} // namespace onnxruntime diff --git a/onnxruntime/test/common/logging/logging_test.cc b/onnxruntime/test/common/logging/logging_test.cc index 4c73846ca7b2b..ade19dc27d95c 100644 --- a/onnxruntime/test/common/logging/logging_test.cc +++ b/onnxruntime/test/common/logging/logging_test.cc @@ -145,6 +145,7 @@ TEST_F(LoggingTestsFixture, TestLoggerFiltering) { /// /// Tests that the logging manager constructor validates its usage correctly. /// +#if !defined(ORT_NO_EXCEPTIONS) TEST_F(LoggingTestsFixture, TestLoggingManagerCtor) { // throw if sink is null EXPECT_THROW((LoggingManager{std::unique_ptr{nullptr}, Severity::kINFO, false, @@ -157,6 +158,7 @@ TEST_F(LoggingTestsFixture, TestLoggingManagerCtor) { InstanceType::Default}), ::onnxruntime::OnnxRuntimeException); } +#endif /// /// Tests that the conditional logging macros work correctly. diff --git a/onnxruntime/test/common/path_test.cc b/onnxruntime/test/common/path_test.cc index dbd9990c0d878..955c83a87da22 100644 --- a/onnxruntime/test/common/path_test.cc +++ b/onnxruntime/test/common/path_test.cc @@ -55,33 +55,33 @@ TEST(PathTest, Parse) { } TEST(PathTest, ParseFailure) { - auto check_parse_failure = - [](const std::string& path_string) { + auto check_parse_failure = + [](const std::string& path_string) { Path p{}; EXPECT_FALSE(Path::Parse(ToPathString(path_string), p).IsOK()); }; #ifdef _WIN32 - check_parse_failure(R"(\\server_name_no_separator)"); - check_parse_failure(R"(\\server_name_no_share_name\)"); - check_parse_failure(R"(\\server_name\share_name_no_root_dir)"); + check_parse_failure(R"(\\server_name_no_separator)"); + check_parse_failure(R"(\\server_name_no_share_name\)"); + check_parse_failure(R"(\\server_name\share_name_no_root_dir)"); #else // POSIX - check_parse_failure("//root_name_no_root_dir"); + check_parse_failure("//root_name_no_root_dir"); #endif } TEST(PathTest, IsEmpty) { - auto check_empty = - [](const std::string& path_string, bool is_empty) { + auto check_empty = + [](const std::string& path_string, bool is_empty) { Path p{}; ASSERT_STATUS_OK(Path::Parse(ToPathString(path_string), p)); EXPECT_EQ(p.IsEmpty(), is_empty); }; - check_empty("", true); - check_empty(".", false); - check_empty("/", false); + check_empty("", true); + check_empty(".", false); + check_empty("/", false); } TEST(PathTest, IsAbsoluteOrRelative) { @@ -223,6 +223,7 @@ TEST(PathTest, RelativePathFailure) { #endif } +#if !defined(ORT_NO_EXCEPTIONS) TEST(PathTest, Concat) { auto check_concat = [](const optional& a, const std::string& b, const std::string& expected_a, bool expect_throw = false) { @@ -249,6 +250,7 @@ TEST(PathTest, Concat) { check_concat({"a/b"}, "c/d", "", true /* expect_throw */); #endif } +#endif } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/framework/ort_model_only_test.cc b/onnxruntime/test/framework/ort_model_only_test.cc index 40bebce0243f4..4c53808bba5ee 100644 --- a/onnxruntime/test/framework/ort_model_only_test.cc +++ b/onnxruntime/test/framework/ort_model_only_test.cc @@ -208,11 +208,7 @@ TEST(OrtModelOnlyTests, SerializeToOrtFormat) { #endif // test that we can deserialize and run a previously saved ORT format model -// TEMPORARY -// This works 
locally when loading the model produced by SerializeToOrtFormat but fails to find the kernel -// for Loop if using the pre-saved model in testdata despite there being no binary difference between the two. -// The hash for Loop is correct (14070537928877630320) according the the CI failure error message. -TEST(OrtModelOnlyTests, DISABLED_LoadOrtFormatModel) { +TEST(OrtModelOnlyTests, LoadOrtFormatModel) { const auto model_filename = ORT_TSTR("testdata/ort_github_issue_4031.onnx.ort"); SessionOptions so; so.session_logid = "LoadOrtFormatModel"; diff --git a/onnxruntime/test/opaque_api/test_opaque_api.cc b/onnxruntime/test/opaque_api/test_opaque_api.cc index 7a8ccca76d0d9..d6c495a0c8b6b 100644 --- a/onnxruntime/test/opaque_api/test_opaque_api.cc +++ b/onnxruntime/test/opaque_api/test_opaque_api.cc @@ -13,7 +13,7 @@ #include "core/providers/cpu/cpu_execution_provider.h" #include "core/session/onnxruntime_cxx_api.h" #include "gtest/gtest.h" -#include "onnx/defs/schema.h" +#include "core/graph/onnx_protobuf.h" #include "test/providers/provider_test_utils.h" #include "test/framework/test_utils.h" diff --git a/onnxruntime/test/platform/windows/stacktrace_test.cc b/onnxruntime/test/platform/windows/stacktrace_test.cc index 2ee24a5ca239d..a3492644a2ed5 100644 --- a/onnxruntime/test/platform/windows/stacktrace_test.cc +++ b/onnxruntime/test/platform/windows/stacktrace_test.cc @@ -15,7 +15,7 @@ namespace test { using namespace ::testing; //TVM is not working with StackTrace now. -#if !(defined USE_TVM || (defined USE_NGRAPH && defined _WIN32)) +#if !(defined USE_TVM || (defined USE_NGRAPH && defined _WIN32)) && !defined(ORT_NO_EXCEPTIONS) TEST(StacktraceTests, BasicTests) { auto result = ::onnxruntime::GetStackTrace(); diff --git a/onnxruntime/test/shared_lib/test_model_loading.cc b/onnxruntime/test/shared_lib/test_model_loading.cc index cac62b534e4eb..634e67dba61aa 100644 --- a/onnxruntime/test/shared_lib/test_model_loading.cc +++ b/onnxruntime/test/shared_lib/test_model_loading.cc @@ -8,11 +8,16 @@ #include #include "test_fixture.h" #include "file_util.h" + +#include "gmock/gmock.h" + extern std::unique_ptr ort_env; namespace onnxruntime { namespace test { +// disable for minimal build with no exceptions as it will always attempt to throw in that scenario +#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_NO_EXCEPTIONS) TEST(CApiTest, model_from_array) { const char* model_path = "testdata/matmul_1.onnx"; std::vector buffer; @@ -26,14 +31,31 @@ TEST(CApiTest, model_from_array) { ORT_THROW("Error reading model"); } +#if (!ORT_MINIMAL_BUILD) + bool should_throw = false; +#else + bool should_throw = true; +#endif + + auto create_session = [&](Ort::SessionOptions& so) { + try { + Ort::Session session(*ort_env.get(), buffer.data(), buffer.size(), so); + ASSERT_FALSE(should_throw) << "Creation of session should have thrown"; + } catch (const std::exception& ex) { + ASSERT_TRUE(should_throw) << "Creation of session should not have thrown. 
Exception:" << ex.what(); + ASSERT_THAT(ex.what(), testing::HasSubstr("ONNX format model is not supported in this build.")); + } + }; + Ort::SessionOptions so; - Ort::Session session(*ort_env.get(), buffer.data(), buffer.size(), so); + create_session(so); #ifdef USE_CUDA // test with CUDA provider when using onnxruntime as dll Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(so, 0)); - Ort::Session session_cuda(*ort_env.get(), buffer.data(), buffer.size(), so); + create_session(so); #endif } +#endif } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/shared_lib/test_nontensor_types.cc b/onnxruntime/test/shared_lib/test_nontensor_types.cc index af9c920a1f70d..a1dd73e589bed 100644 --- a/onnxruntime/test/shared_lib/test_nontensor_types.cc +++ b/onnxruntime/test/shared_lib/test_nontensor_types.cc @@ -195,6 +195,8 @@ TEST(CApiTest, TypeInfoMap) { map_value_type_info.release(); map_type_info.release(); #else + +#if !defined(ORT_NO_EXCEPTIONS) // until https://github.com/google/googletest/pull/2904/ makes it into a release, // check an exception is thrown with the expected message the ugly way try { @@ -204,6 +206,7 @@ TEST(CApiTest, TypeInfoMap) { ASSERT_THAT(ex.what(), testing::HasSubstr("Map type is not supported in this build")); } #endif +#endif } TEST(CApiTest, CreateGetSeqTensors) { diff --git a/onnxruntime/test/testdata/ort_github_issue_4031.onnx.ort b/onnxruntime/test/testdata/ort_github_issue_4031.onnx.ort index 80cad60eb299f..cc8cb16f6ac12 100644 Binary files a/onnxruntime/test/testdata/ort_github_issue_4031.onnx.ort and b/onnxruntime/test/testdata/ort_github_issue_4031.onnx.ort differ diff --git a/onnxruntime/test/util/file_util.cc b/onnxruntime/test/util/file_util.cc index f3f757761c1ce..a4507ac0d1db2 100644 --- a/onnxruntime/test/util/file_util.cc +++ b/onnxruntime/test/util/file_util.cc @@ -20,24 +20,28 @@ void DeleteFileFromDisk(const ORTCHAR_T* path) { #endif } void CreateTestFile(int& out, std::basic_string& filename_template) { - if (filename_template.empty()) throw std::runtime_error("file name template can't be empty"); + if (filename_template.empty()) + ORT_THROW_EX(std::runtime_error, "file name template can't be empty"); + ORTCHAR_T* filename = const_cast(filename_template.c_str()); #ifdef _WIN32 ASSERT_EQ(0, _wmktemp_s(filename, filename_template.length() + 1)); int fd; int err = _wsopen_s(&fd, filename, _O_CREAT | _O_EXCL | _O_SEQUENTIAL | _O_BINARY | _O_WRONLY, _SH_DENYRW, _S_IREAD | _S_IWRITE); if (err != 0) - throw std::runtime_error("open temp file failed"); + ORT_THROW_EX(std::runtime_error, "open temp file failed"); #else int fd = mkstemp(filename); if (fd < 0) { - throw std::runtime_error("open temp file failed"); + ORT_THROW_EX(std::runtime_error, "open temp file failed"); } #endif out = fd; } void CreateTestFile(FILE*& out, std::basic_string& filename_template) { - if (filename_template.empty()) throw std::runtime_error("file name template can't be empty"); + if (filename_template.empty()) + ORT_THROW_EX(std::runtime_error, "file name template can't be empty"); + ORTCHAR_T* filename = const_cast(filename_template.c_str()); #ifdef _WIN32 ASSERT_EQ(0, _wmktemp_s(filename, filename_template.length() + 1)); @@ -46,11 +50,11 @@ void CreateTestFile(FILE*& out, std::basic_string& filename_template) #else int fd = mkstemp(filename); if (fd < 0) { - throw std::runtime_error("open temp file failed"); + ORT_THROW_EX(std::runtime_error, "open temp file failed"); } FILE* fp = fdopen(fd, "w"); #endif out = fp; } } // namespace test -} 
// namespace onnxruntime
\ No newline at end of file
+} // namespace onnxruntime
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index c3cbc00420201..61fe500f35465 100755
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -287,15 +287,7 @@ def parse_arguments():
     parser.add_argument(
         "--use_full_protobuf", action='store_true',
         help="Use the full protobuf library")
-    parser.add_argument(
-        "--disable_contrib_ops", action='store_true',
-        help="Disable contrib ops (reduces binary size)")
-    parser.add_argument(
-        "--disable_ml_ops", action='store_true',
-        help="Disable traditional ML ops (reduces binary size)")
-    parser.add_argument(
-        "--disable_rtti", action='store_true',
-        help="Disable RTTI (reduces binary size)")
+
     parser.add_argument(
         "--skip_onnx_tests", action='store_true', help="Explicitly disable "
         "all onnx related tests. Note: Use --skip_tests to skip all tests.")
@@ -355,12 +347,25 @@ def parse_arguments():
     parser.add_argument(
         "--build_micro_benchmarks", action='store_true',
         help="Build ONNXRuntime micro-benchmarks.")
-    parser.add_argument(
-        "--include_ops_by_model", type=str,
-        help="include ops from model(s) under designated path.")
-    parser.add_argument(
-        "--include_ops_by_file", type=str,
-        help="include ops from csv file.")
+
+    # options to reduce binary size
+    parser.add_argument("--minimal_build", action='store_true',
+                        help="Create a build that only supports ORT format models. "
+                        "See /docs/ONNX_Runtime_for_Mobile_Platforms.md for more information. "
+                        "RTTI is automatically disabled in a minimal build.")
+    parser.add_argument("--include_ops_by_model", type=str, help="include ops from model(s) under designated path.")
+    parser.add_argument("--include_ops_by_config", type=str,
+                        help="include ops from config file. "
+                        "See /docs/Reduced_Operator_Kernel_build.md for more information.")
+
+    parser.add_argument("--disable_contrib_ops", action='store_true',
+                        help="Disable contrib ops (reduces binary size)")
+    parser.add_argument("--disable_ml_ops", action='store_true',
+                        help="Disable traditional ML ops (reduces binary size)")
+    parser.add_argument("--disable_rtti", action='store_true', help="Disable RTTI (reduces binary size)")
+    parser.add_argument("--disable_exceptions", action='store_true',
+                        help="Disable exceptions to reduce binary size. Requires --minimal_build.")
+
     return parser.parse_args()
@@ -617,12 +622,13 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home
                 # script).
"-Donnxruntime_CROSS_COMPILING=" + ( "ON" if args.arm64 or args.arm else "OFF"), - "-Donnxruntime_DISABLE_CONTRIB_OPS=" + ( - "ON" if args.disable_contrib_ops else "OFF"), - "-Donnxruntime_DISABLE_ML_OPS=" + ( - "ON" if args.disable_ml_ops else "OFF"), - "-Donnxruntime_DISABLE_RTTI=" + ( - "ON" if args.disable_rtti else "OFF"), + "-Donnxruntime_DISABLE_CONTRIB_OPS=" + ("ON" if args.disable_contrib_ops else "OFF"), + "-Donnxruntime_DISABLE_ML_OPS=" + ("ON" if args.disable_ml_ops else "OFF"), + "-Donnxruntime_DISABLE_RTTI=" + ("ON" if args.disable_rtti else "OFF"), + "-Donnxruntime_DISABLE_EXCEPTIONS=" + ("ON" if args.disable_exceptions else "OFF"), + "-Donnxruntime_MINIMAL_BUILD=" + ("ON" if args.minimal_build else "OFF"), + "-Donnxruntime_REDUCED_OPS_BUILD=" + ( + "ON" if args.include_ops_by_config or args.include_ops_by_model else "OFF"), "-Donnxruntime_MSVC_STATIC_RUNTIME=" + ( "ON" if args.enable_msvc_static_runtime else "OFF"), # enable pyop if it is nightly build @@ -1224,20 +1230,21 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): ctest_cmd = [ctest_path, "--build-config", config, "--verbose"] run_subprocess(ctest_cmd, cwd=cwd, dll_path=dll_path) - if args.enable_pybind and not\ - args.include_ops_by_model and not\ - args.include_ops_by_file: - + if args.enable_pybind: # Disable python tests for TensorRT because many tests are # not supported yet. if args.use_tensorrt: return + + # Disable python tests in a reduced build as we don't know which ops have been included and which + # models can run + if args.include_ops_by_model or args.include_ops_by_config or args.minimal_build: + return + if is_windows(): cwd = os.path.join(cwd, config) - run_subprocess( - [sys.executable, 'onnxruntime_test_python.py'], - cwd=cwd, dll_path=dll_path) + run_subprocess([sys.executable, 'onnxruntime_test_python.py'], cwd=cwd, dll_path=dll_path) # For CUDA enabled builds test IOBinding feature # Limit testing to Windows non-ARM builds for now @@ -1277,34 +1284,24 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): onnx_test = True except ImportError as error: log.exception(error) - log.warning( - "onnx is not installed. " - "The ONNX tests will be skipped.") + log.warning("onnx is not installed. 
The ONNX tests will be skipped.") onnx_test = False if onnx_test: - run_subprocess( - [sys.executable, 'onnxruntime_test_python_backend.py'], - cwd=cwd, dll_path=dll_path) + run_subprocess([sys.executable, 'onnxruntime_test_python_backend.py'], cwd=cwd, dll_path=dll_path) if not args.disable_ml_ops: - run_subprocess( - [sys.executable, 'onnxruntime_test_python_backend_mlops.py'], - cwd=cwd, dll_path=dll_path) + run_subprocess([sys.executable, 'onnxruntime_test_python_backend_mlops.py'], + cwd=cwd, dll_path=dll_path) + + run_subprocess([sys.executable, + os.path.join(source_dir, 'onnxruntime', 'test', 'onnx', 'gen_test_models.py'), + '--output_dir', 'test_models'], cwd=cwd) - run_subprocess( - [sys.executable, - os.path.join(source_dir, 'onnxruntime', 'test', 'onnx', - 'gen_test_models.py'), - '--output_dir', 'test_models'], cwd=cwd) if not args.skip_onnx_tests: - run_subprocess( - [os.path.join(cwd, 'onnx_test_runner'), 'test_models'], - cwd=cwd) + run_subprocess([os.path.join(cwd, 'onnx_test_runner'), 'test_models'], cwd=cwd) if config != 'Debug': - run_subprocess( - [sys.executable, 'onnx_backend_test_series.py'], - cwd=cwd, dll_path=dll_path) + run_subprocess([sys.executable, 'onnx_backend_test_series.py'], cwd=cwd, dll_path=dll_path) if not args.skip_keras_test: try: @@ -1617,13 +1614,11 @@ def main(): if args.skip_tests: args.test = False - if args.include_ops_by_model or args.include_ops_by_file: - + if args.include_ops_by_model or args.include_ops_by_config: from exclude_unused_ops import exclude_unused_ops, get_provider_path include_ops_by_model = args.include_ops_by_model if args.include_ops_by_model else '' - include_ops_by_file = args.include_ops_by_file if args.include_ops_by_file else '' - exclude_unused_ops(include_ops_by_model, include_ops_by_file, get_provider_path(use_cuda=args.use_cuda)) - cmake_extra_defines.append('onnxruntime_REDUCED_OPS_BUILD=ON') + include_ops_by_config = args.include_ops_by_config if args.include_ops_by_config else '' + exclude_unused_ops(include_ops_by_model, include_ops_by_config, get_provider_path(use_cuda=args.use_cuda)) if args.use_tensorrt: args.use_cuda = True @@ -1635,8 +1630,10 @@ def main(): args.build_shared_lib = True if args.build_nuget and cross_compiling: - raise BuildError( - 'Currently nuget package creation is not supported while cross-compiling') + raise BuildError('Currently nuget package creation is not supported while cross-compiling') + + if args.enable_pybind and args.disable_exceptions: + raise BuildError('Python bindings require exceptions to be enabled.') # Disabling unit tests for VAD-F as FPGA only supports # models with NCHW layout diff --git a/tools/ci_build/exclude_unused_ops.py b/tools/ci_build/exclude_unused_ops.py index b36fba40c317a..31a045477e749 100644 --- a/tools/ci_build/exclude_unused_ops.py +++ b/tools/ci_build/exclude_unused_ops.py @@ -3,6 +3,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +import sys import os import argparse import shutil @@ -30,8 +31,8 @@ def map_domain(domain): return 'UnknownDomain' -def extract_ops_from_file(file_path, referred_ops): - '''extract ops from file of format: domain;opset;op1,op2...''' +def extract_ops_from_config(file_path, referred_ops): + '''extract ops from config file of format: domain;opset;op1,op2...''' if not file_path: return referred_ops @@ -123,15 +124,13 @@ def extract_ops_from_graph(graph, operators, domain_opset_map): return referred_ops # end of extract_ops_from_model(...) 
@@ -123,15 +124,13 @@ def extract_ops_from_graph(graph, operators, domain_opset_map):
     return referred_ops
     # end of extract_ops_from_model(...)
 
 
-def exclude_unused_ops(model_path, file_path, provider_paths):
+def exclude_unused_ops(model_path, config_path, provider_paths):
     '''rewrite multiple provider files'''
 
-    operators = extract_ops_from_file(file_path, extract_ops_from_model(model_path, {}))
+    operators = extract_ops_from_config(config_path, extract_ops_from_model(model_path, {}))
 
     for provider_path in provider_paths:
         exclude_unused_ops_in_provider(operators, provider_path)
 
-    # end of disable_ops_in_providers(...)
-
 
 def exclude_unused_ops_in_provider(operators, provider_path):
     '''rewrite provider file to exclude unused ops'''
@@ -280,7 +279,7 @@ def process_lines(lines, offset):
     # end of rewrite_cpu_provider(...)
 
 
-def get_provider_path(ort_root='', use_cuda=False):
+def get_provider_path(ort_root=None, use_cuda=False):
     '''return paths to cpu and cuda providers'''
 
     if not ort_root:
@@ -300,34 +299,34 @@ def get_provider_path(ort_root=None, use_cuda=False):
 
 if __name__ == "__main__":
-    PARSER = argparse.ArgumentParser(
-        description="provider rewriter",
-        usage="""
-        --model_path
-        --file_path
-        --ort_root
-        """)
+    parser = argparse.ArgumentParser(
+        description="Script to exclude unused operator kernels by disabling their registration in ONNXRuntime. "
+        "See /docs/Reduced_Operator_Kernel_build.md for more information.",
+        usage="Provide either model_path or config_path, or both.")
 
-    PARSER.add_argument(
-        "--model_path", type=str, help="path to model(s) folder")
+    parser.add_argument(
+        "--model_path", type=str, help="path to folder containing one or more ONNX models")
 
-    PARSER.add_argument(
-        "--file_path", type=str, help="path to file of ops")
+    parser.add_argument(
+        "--config_path", type=str, help="path to configuration file with format of 'domain;opset;op1,op2...'")
 
-    PARSER.add_argument(
-        "--ort_root", type=str, help="path to ort root with current as default")
+    parser.add_argument(
+        "--ort_root", type=str, help="path to ONNXRuntime repository root. "
+        "Inferred from the location of this script if not provided.")
 
-    ARGS = PARSER.parse_args()
+    args = parser.parse_args()
 
-    model_path = os.path.abspath(ARGS.model_path) if ARGS.model_path else ''
-    file_path = os.path.abspath(ARGS.file_path) if ARGS.file_path else ''
-    ort_root = os.path.abspath(ARGS.ort_root) if ARGS.ort_root else ''
+    model_path = os.path.abspath(args.model_path) if args.model_path else ''
+    config_path = os.path.abspath(args.config_path) if args.config_path else ''
+    ort_root = os.path.abspath(args.ort_root) if args.ort_root else ''
 
-    if not model_path and not file_path:
-        log.warning('Please specify at least either model path or file path.')
+    if not model_path and not config_path:
+        log.error('Please specify a model path and/or a config path.')
+        parser.print_help()
+        sys.exit(-1)
 
     if not ort_root:
-        log.info('ort root not specified, taking current as root')
+        log.info('ort_root was not specified. Inferring ORT root from the location of this script.')
 
-    exclude_unused_ops(model_path, file_path,
+    exclude_unused_ops(model_path, config_path,
                        get_provider_path(ort_root, use_cuda=True))
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml
index 44e3d31e8cdc5..ba731861d8d02 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml
@@ -51,7 +51,7 @@ jobs:
     displayName: 'Build and test'
     inputs:
       scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-      arguments: '--config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "Visual Studio 16 2019" --build_wheel --use_cuda --cuda_version=10.1 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1" --cudnn_home="C:\local\cudnn-10.1-windows10-x64-v7.6.5.32\cuda" --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 --include_ops_by_model="$(Build.SourcesDirectory)\onnxruntime\test\testdata" --include_ops_by_file="$(Build.SourcesDirectory)\onnxruntime\test\testdata\reduced_ops_via_config.config"'
+      arguments: '--config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "Visual Studio 16 2019" --build_wheel --use_cuda --cuda_version=10.1 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1" --cudnn_home="C:\local\cudnn-10.1-windows10-x64-v7.6.5.32\cuda" --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 --include_ops_by_model="$(Build.SourcesDirectory)\onnxruntime\test\testdata" --include_ops_by_config="$(Build.SourcesDirectory)\onnxruntime\test\testdata\reduced_ops_via_config.config"'
       workingDirectory: '$(Build.BinariesDirectory)'
 
   - template: templates/component-governance-component-detection-steps.yml
diff --git a/tools/python/convert_onnx_model_to_ort.py b/tools/python/convert_onnx_model_to_ort.py
new file mode 100644
index 0000000000000..ca2c0477cdf38
--- /dev/null
+++ b/tools/python/convert_onnx_model_to_ort.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import argparse
+import onnxruntime as ort
+import os
+import re
+
+
+def convert(model: str):
+
+    if not model.endswith('.onnx'):
+        raise ValueError("Model filename must end in .onnx.")
+
+    onnx_target_path = re.sub(r'\.onnx$', '.optimized.onnx', model)
+    ort_target_path = re.sub(r'\.onnx$', '.ort', model)
+
+    so = ort.SessionOptions()
+    so.optimized_model_filepath = onnx_target_path
+    so.add_session_config_entry('session.save_model_format', 'ONNX')
+    so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED  # Skip NCHWc optimizations
+
+    print("Optimizing ONNX model {} and saving in ONNX format to {}".format(model, onnx_target_path))
+    # creating the session will result in the optimized model being saved
+    _ = ort.InferenceSession(model, sess_options=so)
+
+    # Second, convert optimized ONNX model to ORT format
+    so.optimized_model_filepath = ort_target_path
+    so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL  # Convert model as-is so we don't change the kernels in this step # noqa
+
+    so.add_session_config_entry('session.save_model_format', 'ORT')
+
+    print("Converting optimized ONNX model {} to ORT format model {}".format(onnx_target_path, ort_target_path))
+    _ = ort.InferenceSession(onnx_target_path, sess_options=so)
+
+    orig_size = os.path.getsize(onnx_target_path)
+    new_size = os.path.getsize(ort_target_path)
+    print("Serialized {} to {}. Sizes: orig={} new={} diff={} new:old={:.4f}:1.0".format(
+        onnx_target_path, ort_target_path, orig_size, new_size, new_size - orig_size, new_size / orig_size))
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(os.path.basename(__file__),
+                                     description='''Convert an onnx model -> optimized onnx model -> ORT format model.
+                                     Expects a .onnx file as input. Optimized onnx model will be saved in the same
+                                     directory with an extension of .optimized.onnx.
+                                     An ORT format model will be created from the optimized onnx model.
+                                     The optimized onnx model should be used as input to a minimal build so that
+                                     any post-optimization kernels are included in the build.'''
+                                     )
+    parser.add_argument('model', help='Provide path to ONNX model to convert. Must have .onnx extension.')
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    convert(args.model)
+
+
+if __name__ == '__main__':
+    main()
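As a usage sketch for the converter above (the model path is a placeholder, and this assumes an installed onnxruntime Python package recent enough to honor the 'session.save_model_format' session config entry):

    # Command line:  python tools/python/convert_onnx_model_to_ort.py model.onnx
    # From Python, assuming the script's directory is on sys.path:
    from convert_onnx_model_to_ort import convert

    convert('model.onnx')  # writes model.optimized.onnx and model.ort next to the input

As the script's help text notes, the .optimized.onnx output should be used as input to a minimal build so that any kernels introduced by optimization are kept, while the .ort file is the model a minimal build loads at runtime.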