forked from openvinotoolkit/openvino.genai
# Add whisper pipeline - Initial commit (openvinotoolkit#789)
This is a work-in-progress PR. Todos:

- [x] use WhisperFeatureExtractor for audio preprocessing
- [x] compute `assets/whisper/mel_filters_data.bin` on initialization
- [x] move wav reader to sample utils
- [ ] Longer audio inputs (>30s): poor quality results at chunking borders. Long audio inputs are split into 30 s chunks, which loses context at the chunk border. This could be partially solved by [chunking with stride](https://huggingface.co/blog/asr-chunking).
- [ ] add perf metrics
- [x] update docstrings
- [ ] update documentation
- [x] add python bindings
- [x] add tests
- [ ] add cpp, python samples tests
- [x] fix win build
- [x] fetch `dr_wav.h` with `FetchContent`
- [ ] support different languages, language autodetection
- [ ] support translation
- [ ] support timestamps
- [x] remove constructor with infer requests
- [x] rename pipeline to WhisperPipeline
- [ ] Whisper pipeline doesn't need a tokenizer; it uses the detokenizer only. Implement detokenizer-only initialization for `ov::genai::Tokenizer`
- [ ] Check discrete GPU. Integrated GPU works as expected.
- [ ] Investigate use of `RemoteTensor` for GPU
- [ ] Add batch
- [ ] Add sampler, inherit WhisperGenerationConfig from GenerationConfig

Current limitations:

- No resampling during preprocessing. Input raw speech should have a 16 kHz sampling rate.
- No normalization during preprocessing. Input raw speech should be normalized to near the [-1, 1] range.

Tickets: CVS-147994, CVS-146010, CVS-152522
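The "chunking with stride" idea referenced in the todo list splits long audio into fixed-size windows that overlap their neighbours, so context lost at a hard 30 s border can be recovered by reconciling overlapping decodes. A minimal sketch of the windowing step (a hypothetical helper, not the pipeline's actual implementation, which currently uses plain non-overlapping 30 s chunks):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Split raw speech into windows of `chunk_len` samples that overlap their
// neighbours by `stride` samples on each side. Requires chunk_len > 2 * stride.
// Hypothetical helper for illustration only; reconciling the overlapping
// decoded text is a separate step not shown here.
std::vector<std::vector<float>> chunk_with_stride(const std::vector<float>& speech,
                                                  std::size_t chunk_len,
                                                  std::size_t stride) {
    std::vector<std::vector<float>> chunks;
    const std::size_t step = chunk_len - 2 * stride;  // distance between window starts
    for (std::size_t start = 0; start < speech.size(); start += step) {
        const std::size_t end = std::min(start + chunk_len, speech.size());
        chunks.emplace_back(speech.begin() + start, speech.begin() + end);
        if (end == speech.size())
            break;  // last window reaches the end of the signal
    }
    return chunks;
}
```

For 16 kHz audio, a 30 s window with a 5 s stride would be `chunk_with_stride(speech, 30 * 16000, 5 * 16000)`.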
1 parent d831e64, commit 7b81bcb. 31 changed files with 2,511 additions and 178 deletions.
New CMake build script for the sample (+38 lines):

```cmake
# Copyright (C) 2023-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

find_package(OpenVINOGenAI REQUIRED PATHS
    "${CMAKE_BINARY_DIR}"  # Reuse the package from the build.
    ${OpenVINO_DIR}  # GenAI may be installed alongside OpenVINO.
)

if(POLICY CMP0135)
    cmake_policy(SET CMP0135 NEW)
endif()

if(POLICY CMP0169)
    cmake_policy(SET CMP0169 OLD)
endif()

include(FetchContent)

if(NOT TARGET dr_libs)
    FetchContent_Declare(dr_libs
        URL https://github.com/mackron/dr_libs/archive/da35f9d6c7374a95353fd1df1d394d44ab66cf01.tar.gz
        URL_HASH SHA256=2704d347f480ca1bc92233fb01747e4550cc8031735b6ea62ca9990ebb8851ae)
    FetchContent_MakeAvailable(dr_libs)
endif()

add_executable(whisper_speech_recognition whisper_speech_recognition.cpp audio_utils.cpp)
target_link_libraries(whisper_speech_recognition PRIVATE openvino::genai)
target_include_directories(whisper_speech_recognition PRIVATE "$<BUILD_INTERFACE:${dr_libs_SOURCE_DIR}>")
set_target_properties(whisper_speech_recognition PROPERTIES
    COMPILE_PDB_NAME whisper_speech_recognition
    # Ensure out of box LC_RPATH on macOS with SIP
    INSTALL_RPATH_USE_LINK_PATH ON)
target_compile_features(whisper_speech_recognition PRIVATE cxx_std_11)

install(TARGETS whisper_speech_recognition
    RUNTIME DESTINATION samples_bin/
    COMPONENT samples_bin
    EXCLUDE_FROM_ALL)
```
New README for the sample (+38 lines):
# Whisper automatic speech recognition sample

This example showcases inference of Whisper speech recognition models. The application doesn't have many configuration options, to encourage the reader to explore and modify the source code; for example, change the device for inference to GPU. The sample features `ov::genai::WhisperPipeline` and uses an audio file in WAV format as an input source.

## Download and convert the model and tokenizers

The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.

It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported.

```sh
pip install --upgrade-strategy eager -r ../../requirements.txt
optimum-cli export openvino --trust-remote-code --model openai/whisper-base whisper-base
```

## Prepare audio file

Prepare an audio file in WAV format with a 16 kHz sampling rate.
## Run

`whisper_speech_recognition whisper-base sample.wav`

Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM; for example, a larger model such as `openai/whisper-large-v3` can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU.

Models can be downloaded from [OpenAI on Hugging Face](https://huggingface.co/openai).

### Troubleshooting

#### Empty or rubbish output

Example output:
```
----------------
```

To resolve this, ensure that the audio data has a 16 kHz sampling rate.
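Since the sample does no resampling of its own, a recording at another rate must be converted to 16 kHz beforehand. A naive linear-interpolation resampler is sketched below for illustration only; a production resampler (e.g. windowed-sinc, as in libsamplerate) gives noticeably better quality:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Naive linear-interpolation resampler: maps each output sample position back
// into the input signal and interpolates between the two nearest input samples.
// A sketch only; prefer a windowed-sinc resampler for real use.
std::vector<float> resample_linear(const std::vector<float>& in,
                                   double src_rate, double dst_rate) {
    if (in.empty() || src_rate <= 0.0 || dst_rate <= 0.0)
        return {};
    const std::size_t out_len =
        static_cast<std::size_t>(in.size() * dst_rate / src_rate);
    std::vector<float> out(out_len);
    for (std::size_t i = 0; i < out_len; ++i) {
        const double pos = i * src_rate / dst_rate;      // position in input signal
        const std::size_t i0 = static_cast<std::size_t>(pos);
        const std::size_t i1 = std::min(i0 + 1, in.size() - 1);
        const double frac = pos - static_cast<double>(i0);
        out[i] = static_cast<float>((1.0 - frac) * in[i0] + frac * in[i1]);
    }
    return out;
}
```

For example, `resample_linear(speech, 44100.0, 16000.0)` would downsample a 44.1 kHz recording to the 16 kHz the sample expects.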