Commit
MOD: Mod readme.
zhaochaochao committed Oct 28, 2024
1 parent 2014a9c commit ac58d92
Showing 3 changed files with 16 additions and 2 deletions.
2 changes: 1 addition & 1 deletion conf/inference_internvl2-8B.yml
@@ -109,7 +109,7 @@ models:
  max_attention_window_size: # use default (i.e. max_sequence_length) if not set.
  sink_token_length: # use default if not set.
  batch_scheduler_policy: guaranteed_no_evict # must be `max_utilization` or `guaranteed_no_evict`.
- kv_cache_free_gpu_mem_fraction: 0.6 # will be set to 0.9 or `max_tokens_in_paged_kv_cache` if not set.
+ kv_cache_free_gpu_mem_fraction: 0.7 # will be set to 0.9 or `max_tokens_in_paged_kv_cache` if not set.
  kv_cache_host_memory_bytes: # will be set to 0 if not set.
  kv_cache_onboard_blocks: # will be set to true if not set.
  exclude_input_in_output: true # will be set to false if not set.
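The config change above raises `kv_cache_free_gpu_mem_fraction` from 0.6 to 0.7, i.e. a larger share of the GPU memory left free after engine load is handed to the paged KV cache. An illustrative sketch of that arithmetic (the helper name and the 40 GiB free-memory figure are assumptions for the example, not from the repo):

```python
def kv_cache_budget_bytes(free_gpu_mem_bytes: int, fraction: float) -> int:
    """Bytes of currently free GPU memory reserved for the paged KV cache."""
    return int(free_gpu_mem_bytes * fraction)

free_bytes = 40 * 1024**3  # assume ~40 GiB free after the engine loads
old_budget = kv_cache_budget_bytes(free_bytes, 0.6)
new_budget = kv_cache_budget_bytes(free_bytes, 0.7)
extra = new_budget - old_budget  # ~4 GiB more cache under this assumption
```

A larger cache budget admits more concurrent sequences (or longer contexts) before the `guaranteed_no_evict` scheduler starts queuing requests.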
15 changes: 14 additions & 1 deletion docs/internvl2.md
@@ -42,11 +42,12 @@ python3 tools/internvl2/convert_internlm2_ckpt.py --model_dir /tmp/InternVL2-8B/
# Set the maximum batch_size to 2, i.e. up to 2 concurrent inference requests; any beyond two are queued.
# Allow each request at most 26 image patches (in InternVL2, each image yields up to 13 patches depending on its size).
# That is: max_multimodal_len = 2 (max_batch_size) * 26 (max patches per request) * 256 (tokens per patch) = 13312
# Set max_input_len to 32k and max_seq_len to 36k (i.e. a maximum output of 4k).
rm -rf /tmp/InternVL2-8B/trt_engines/
trtllm-build --checkpoint_dir /tmp/InternVL2-8B/tllm_checkpoint/ \
--output_dir /tmp/InternVL2-8B/trt_engines/ \
--gemm_plugin bfloat16 --max_batch_size 2 --paged_kv_cache enable \
- --max_input_len 32768 --max_seq_len 60416 --max_num_tokens 32768 --max_multimodal_len 13312
+ --max_input_len 32768 --max_seq_len 36960 --max_num_tokens 32768 --max_multimodal_len 13312

# Build the ViT engine; --maxBS 26 lets it process 26 image patches at once (in InternVL2, each image yields up to 13 patches depending on its size).
python3 tools/internvl2/build_vit_engine.py --pretrainedModelPath /tmp/InternVL2-8B \
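The sizing arithmetic behind the `trtllm-build` flags above can be checked directly (values copied from the command; the variable names are illustrative only):

```python
# max_multimodal_len: worst case of image tokens across a full batch.
max_batch_size = 2
max_patches = 26          # per request; each image yields up to 13 patches
tokens_per_patch = 256    # each ViT patch maps to 256 LLM tokens
max_multimodal_len = max_batch_size * max_patches * tokens_per_patch  # 13312

# max_seq_len bounds input + output tokens per sequence.
max_input_len = 32768     # ~32k of input
max_seq_len = 36960
max_output_len = max_seq_len - max_input_len  # ~4k of output
```

The earlier value of 60416 left an oversized output budget; shrinking `max_seq_len` to roughly input + 4k reduces the activation and KV-cache memory the engine must provision for.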
@@ -467,6 +468,18 @@ ChatCompletion(id='chatcmpl-9', choices=[Choice(finish_reason='stop', index=0, l
'
```

## Start the gradio service

![gradio.png](gradio.png)

```bash
# Install gradio
pip install -r tools/gradio/requirements.txt

# Start the multimodal chat UI with the internvl2 multimodal model; 0.0.0.0:9997 is the address of the LLM backend service
python3 tools/gradio/llm_app.py internvl2 0.0.0.0:9997
```
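The gradio UI talks to the OpenAI-compatible backend shown earlier in this doc. A sketch of the kind of multimodal chat payload such a UI would send (field names follow the OpenAI chat-completions convention; the exact schema the backend expects may differ, and `build_chat_request` is a hypothetical helper, not repo code):

```python
def build_chat_request(text: str, image_url: str, model: str = "internvl2") -> dict:
    """Assemble an OpenAI-style chat request mixing text and one image."""
    return {
        "model": model,
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": text},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }],
    }
```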

## Stop the service

```bash
1 change: 1 addition & 0 deletions src/utils.cc
@@ -500,6 +500,7 @@ std::tuple<bool, std::string, executor::Request> CreateRequestFromOpenAiHttpBody
rapidjson::Document json_body;
json_body.Parse(http_body.c_str());
if (json_body.HasParseError()) {
CLOG4(ERROR, "Parse http json body failed: " << json_body.GetParseError() << ", body: " << http_body);
throw std::invalid_argument("Parse http json body failed.");
}
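The added `CLOG4` line logs the rapidjson parse error code and the raw body before throwing, so malformed requests can be diagnosed from the server log instead of failing silently. The same log-then-raise pattern, sketched in Python for illustration (not the repo's code):

```python
import json
import logging

def parse_http_body(http_body: str) -> dict:
    """Parse a JSON HTTP body; log the error detail and body on failure."""
    try:
        return json.loads(http_body)
    except json.JSONDecodeError as e:
        # Log what failed and the offending payload before propagating.
        logging.error("Parse http json body failed: %s, body: %s", e, http_body)
        raise ValueError("Parse http json body failed.") from e
```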

Expand Down
