Skip to content

FEAT: Xavier: Share KV cache between VLLM replicas #5323

FEAT: Xavier: Share KV cache between VLLM replicas

FEAT: Xavier: Share KV cache between VLLM replicas #5323

Workflow file for this run

name: Python CI
on:
push:
branches:
- '*'
pull_request:
types: ['opened', 'reopened', 'synchronize']
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
lint:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ "ubuntu-latest" ]
python-version: [ "3.10" ]
steps:
- name: Check out code
uses: actions/checkout@v3
with:
fetch-depth: 0
submodules: recursive
- name: Set up Python environment
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: flake8 Lint
uses: py-actions/flake8@v2
with:
path: "xinference"
args: "--config setup.cfg"
- name: black
uses: psf/black@stable
with:
src: "xinference"
options: "--check"
version: "23.12.0"
- uses: isort/isort-action@master
with:
sortPaths: "xinference"
configuration: "--check-only --diff --sp setup.cfg"
- name: mypy
run: pip install mypy && mypy --install-types --non-interactive xinference
- name: codespell
run: pip install codespell && codespell --ignore-words-list thirdparty xinference
- name: Set up Node.js
uses: actions/setup-node@v1
with:
node-version: 16
# ESLint and Prettier must be in `package.json`
- name: Install Node.js dependencies
run: cd xinference/web/ui && npm ci
- name: ESLint Check
run: cd xinference/web/ui && npx eslint .
- name: Prettier Check
run: cd xinference/web/ui && ./node_modules/.bin/prettier --check .
build_test_job:
runs-on: ${{ matrix.os }}
needs: lint
env:
CONDA_ENV: test
SELF_HOST_PYTHON: /root/miniconda3/envs/inference_test/bin/python
SELF_HOST_CONDA: /root/miniconda3/condabin/conda
defaults:
run:
shell: bash -l {0}
strategy:
fail-fast: false
matrix:
os: [ "ubuntu-latest", "macos-13", "windows-latest" ]
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
module: [ "xinference" ]
exclude:
- { os: macos-13, python-version: 3.10 }
- { os: macos-13, python-version: 3.11 }
- { os: windows-latest, python-version: 3.10 }
- { os: windows-latest, python-version: 3.11 }
include:
- { os: self-hosted, module: gpu, python-version: 3.9}
- { os: macos-latest, module: metal, python-version: "3.10" }
steps:
- name: Check out code
uses: actions/checkout@v3
with:
fetch-depth: 0
submodules: recursive
- name: Set up conda ${{ matrix.python-version }}
uses: conda-incubator/setup-miniconda@v3
if: ${{ matrix.module != 'gpu' }}
with:
python-version: ${{ matrix.python-version }}
activate-environment: ${{ env.CONDA_ENV }}
# Important for python == 3.12
- name: Update pip and setuptools
if: ${{ matrix.python-version == '3.12' }}
run: |
python -m pip install -U pip setuptools
- name: Install numpy
if: |
startsWith(matrix.os, 'macos') && matrix.python-version == '3.12'
run: |
python -m pip install "numpy<2"
- name: Install dependencies
env:
MODULE: ${{ matrix.module }}
OS: ${{ matrix.os }}
if: ${{ matrix.module != 'gpu' }}
run: |
if [ "$OS" == "ubuntu-latest" ]; then
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
fi
pip install -e ".[dev]"
if [ "$MODULE" == "metal" ]; then
conda install -c conda-forge "ffmpeg<7"
pip install mlx-lm
pip install mlx-vlm
pip install mlx-whisper
pip install f5-tts-mlx
pip install qwen-vl-utils
pip install tomli
else
pip install "llama-cpp-python==0.2.77" --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
pip install transformers
pip install attrdict
pip install "timm>=0.9.16"
pip install torch torchvision
pip install accelerate
pip install sentencepiece
pip install transformers_stream_generator
pip install bitsandbytes
pip install "sentence-transformers>=2.3.1"
pip install modelscope
pip install diffusers
pip install protobuf
pip install FlagEmbedding
pip install "tenacity>=8.2.0,<8.4.0"
pip install "jinja2==3.1.2"
pip install tensorizer
pip install jj-pytorchvideo
pip install qwen-vl-utils
pip install datamodel_code_generator
pip install jsonschema
fi
working-directory: .
- name: Test with pytest
env:
MODULE: ${{ matrix.module }}
run: |
if [ "$MODULE" == "gpu" ]; then
${{ env.SELF_HOST_PYTHON }} -m pip install -U "openai>1"
${{ env.SELF_HOST_PYTHON }} -m pip install -U modelscope
${{ env.SELF_HOST_PYTHON }} -m pip install -U sse_starlette
${{ env.SELF_HOST_PYTHON }} -m pip install -U xoscar
${{ env.SELF_HOST_PYTHON }} -m pip install -U "python-jose[cryptography]"
${{ env.SELF_HOST_PYTHON }} -m pip install -U "passlib[bcrypt]"
${{ env.SELF_HOST_PYTHON }} -m pip install -U "aioprometheus[starlette]"
${{ env.SELF_HOST_PYTHON }} -m pip install -U "pynvml"
${{ env.SELF_HOST_PYTHON }} -m pip install -U "transformers"
${{ env.SELF_HOST_CONDA }} install -c conda-forge pynini=2.1.5
${{ env.SELF_HOST_CONDA }} install -c conda-forge "ffmpeg<7"
${{ env.SELF_HOST_PYTHON }} -m pip install "funasr<1.1.17"
${{ env.SELF_HOST_PYTHON }} -m pip install -U nemo_text_processing<1.1.0
${{ env.SELF_HOST_PYTHON }} -m pip install -U omegaconf~=2.3.0
${{ env.SELF_HOST_PYTHON }} -m pip install -U WeTextProcessing<1.0.4
${{ env.SELF_HOST_PYTHON }} -m pip install -U librosa
${{ env.SELF_HOST_PYTHON }} -m pip install -U xxhash
${{ env.SELF_HOST_PYTHON }} -m pip install -U "ChatTTS>=0.2.1"
${{ env.SELF_HOST_PYTHON }} -m pip install -U HyperPyYAML
${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y matcha-tts
${{ env.SELF_HOST_PYTHON }} -m pip install -U onnxruntime-gpu==1.16.0; sys_platform == 'linux'
${{ env.SELF_HOST_PYTHON }} -m pip install -U openai-whisper
${{ env.SELF_HOST_PYTHON }} -m pip install -U "torch==2.3.1" "torchaudio==2.3.1"
${{ env.SELF_HOST_PYTHON }} -m pip install -U "loguru"
${{ env.SELF_HOST_PYTHON }} -m pip install -U "natsort"
${{ env.SELF_HOST_PYTHON }} -m pip install -U "loralib"
${{ env.SELF_HOST_PYTHON }} -m pip install -U "ormsgpack"
${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y opencc
${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y "faster_whisper"
${{ env.SELF_HOST_PYTHON }} -m pip install -U accelerate
${{ env.SELF_HOST_PYTHON }} -m pip install -U verovio
${{ env.SELF_HOST_PYTHON }} -m pip install -U cachetools
${{ env.SELF_HOST_PYTHON }} -m pip install -U silero-vad
${{ env.SELF_HOST_PYTHON }} -m pip install -U pydantic
${{ env.SELF_HOST_PYTHON }} -m pip install -U diffusers
${{ env.SELF_HOST_PYTHON }} -m pip install -U onnx
${{ env.SELF_HOST_PYTHON }} -m pip install -U onnxconverter_common
${{ env.SELF_HOST_PYTHON }} -m pip install -U torchdiffeq
${{ env.SELF_HOST_PYTHON }} -m pip install -U "x_transformers>=1.31.14"
${{ env.SELF_HOST_PYTHON }} -m pip install -U pypinyin
${{ env.SELF_HOST_PYTHON }} -m pip install -U tomli
${{ env.SELF_HOST_PYTHON }} -m pip install -U vocos
${{ env.SELF_HOST_PYTHON }} -m pip install -U jieba
${{ env.SELF_HOST_PYTHON }} -m pip install -U soundfile
${{ env.SELF_HOST_PYTHON }} -m pip install -U sentence-transformers
${{ env.SELF_HOST_PYTHON }} -m pip install -U FlagEmbedding
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
--disable-warnings \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/core/tests/test_continuous_batching.py && \
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/image/tests/test_stable_diffusion.py && \
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/image/tests/test_got_ocr2.py && \
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_whisper.py && \
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_funasr.py && \
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_chattts.py && \
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_cosyvoice.py && \
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_f5tts.py && \
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_f5tts.py && \
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_fish_speech.py && \
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/embedding/tests/test_integrated_embedding.py
elif [ "$MODULE" == "metal" ]; then
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/llm/mlx/tests/test_mlx.py && \
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_whisper_mlx.py && \
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_f5tts_mlx.py
else
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/client/tests/test_client.py
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference --ignore xinference/core/tests/test_continuous_batching.py --ignore xinference/client/tests/test_client.py --ignore xinference/model/image/tests/test_stable_diffusion.py --ignore xinference/model/image/tests/test_got_ocr2.py --ignore xinference/model/audio/tests --ignore xinference/model/embedding/tests/test_integrated_embedding.py xinference
fi
working-directory: .