兼容千问

CosmosShadow · Jul 10, 2024 · 999530a · 999530a
1 parent 66c7580
commit 999530a
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 20 deletions.
diff --git a/README.md b/README.md
@@ -91,7 +91,7 @@ Parses a PDF file into a Markdown file and returns the Markdown content along wi
 
 - **model**: *str*, default: 'gpt-4o'  
   OpenAI API formatted multimodal large model. If you need to use other models, such as:
-  - [qwen-vl-max](https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start) 
+  - [qwen-vl-max](https://help.aliyun.com/zh/dashscope/developer-reference/compatibility-of-openai-with-dashscope) 
   - [GLM-4V](https://open.bigmodel.cn/dev/api#glm-4v)
   - [Yi-Vision](https://platform.lingyiwanwu.com/docs) 
   - Azure OpenAI, by setting the `base_url` to `https://xxxx.openai.azure.com/` to use Azure OpenAI, where `api_key` is the Azure API key, and the model is similar to `azure_xxxx`, where `xxxx` is the deployed model name (tested).
@@ -125,11 +125,7 @@ Parses a PDF file into a Markdown file and returns the Markdown content along wi
       verbose=False,
   )
 
-## Version
-
-- 0.0.9~0.0.10: Optimize pdf parsing process, better parsing effect
-- 0.0.2 - 0.0.8: Add gpt_worker parameter；Add GLM-4V, Azure OpenAI support; Fix some bugs
-- 0.0.1: First version
+- **args**: Other LLM parameters, such as `temperature`, `top_p`, `max_tokens`, `presence_penalty`, `frequency_penalty`, etc.
 
 ## Join Us 👏🏻
 

diff --git a/README_CN.md b/README_CN.md
@@ -61,7 +61,8 @@ def parse_pdf(
         base_url: Optional[str] = None,
         model: str = 'gpt-4o',
         verbose: bool = False,
-        gpt_worker: int = 1
+        gpt_worker: int = 1,
+        **args
 ) -> Tuple[str, List[str]]:
 ```
 
@@ -83,7 +84,7 @@ def parse_pdf(
   类接口的其他大模型服务，例如`GLM-4V`。
 
 - **model**：*str*，默认值：'gpt-4o'。OpenAI API 格式的多模态大模型。如果需要使用其他模型，例如
-    - [qwen-vl-max](https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)
+    - [qwen-vl-max](https://help.aliyun.com/zh/dashscope/developer-reference/compatibility-of-openai-with-dashscope)
     - [GLM-4V](https://open.bigmodel.cn/dev/api#glm-4v)
     - [Yi-Vision](https://platform.lingyiwanwu.com/docs)
     - Azure OpenAI，通过将 `base_url` 指定为 `https://xxxx.openai.azure.com/` 来使用 Azure OpenAI，`api_key` 是 Azure API
@@ -119,11 +120,8 @@ def parse_pdf(
   ```
   您不需要替换所有的提示词，如果您没有传入自定义提示词，仓库会自动使用默认的提示词。默认提示词使用的是中文，如果您的PDF文档是英文的，或者您的模型不支持中文，建议您自定义提示词。
 
-## 版本
+- **args**：LLM 的其他参数，例如 `temperature`、`max_tokens`、`top_p`、`frequency_penalty`、`presence_penalty` 等。
 
-- 0.0.9~0.0.10: 优化 PDF 解析流程，解析效果更好
-- 0.0.2 - 0.0.8: 添加 gpt_worker 参数；添加 GLM-4V、Azure OpenAI 支持；修复一些 bug
-- 0.0.1: 第一个版本
 
 ## 加入我们👏🏻
 

diff --git a/gptpdf/parse.py b/gptpdf/parse.py
@@ -181,7 +181,8 @@ def _gpt_parse_images(
         base_url: Optional[str] = None,
         model: str = 'gpt-4o',
         verbose: bool = False,
-        gpt_worker: int = 1
+        gpt_worker: int = 1,
+        **args
 ) -> str:
     """
     Parse images to markdown content.
@@ -209,12 +210,12 @@ def _gpt_parse_images(
 
     def _process_page(index: int, image_info: Tuple[str, List[str]]) -> Tuple[int, str]:
         logging.info(f'gpt parse page: {index}')
-        agent = Agent(role=role_prompt, api_key=api_key, base_url=base_url, model=model, disable_python_run=True)
+        agent = Agent(role=role_prompt, api_key=api_key, base_url=base_url, model=model, **args)
         page_image, rect_images = image_info
         local_prompt = prompt
         if rect_images:
             local_prompt += rect_prompt + ', '.join(rect_images)
-        content = agent.run([local_prompt, {'image': page_image}], show_stream=verbose)
+        content = agent.run([local_prompt, {'image': page_image}], display=verbose)
         return index, content
 
     contents = [None] * len(image_infos)
@@ -247,7 +248,8 @@ def parse_pdf(
         base_url: Optional[str] = None,
         model: str = 'gpt-4o',
         verbose: bool = False,
-        gpt_worker: int = 1
+        gpt_worker: int = 1,
+        **args
 ) -> Tuple[str, List[str]]:
     """
     Parse a PDF file to a markdown file.
@@ -264,7 +266,8 @@ def parse_pdf(
         base_url=base_url,
         model=model,
         verbose=verbose,
-        gpt_worker=gpt_worker
+        gpt_worker=gpt_worker,
+        **args
     )
 
     all_rect_images = []

diff --git a/test/test.py b/test/test.py
@@ -43,11 +43,20 @@ def test_azure():
     content, image_paths = parse_pdf(pdf_path, output_dir=output_dir, api_key=api_key, base_url=base_url, model=model, verbose=True)
     print(content)
     print(image_paths)
-
 
+def test_qwen_vl_max():
+    from gptpdf import parse_pdf
+    api_key = 'sk-xxxx'
+    base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
+    # Refer to: https://help.aliyun.com/zh/dashscope/developer-reference/compatibility-of-openai-with-dashscope
+    model  = 'qwen-vl-max'
+    content, image_paths = parse_pdf(pdf_path, output_dir=output_dir, api_key=api_key, base_url=base_url, model=model, verbose=True, temperature=0.5, max_tokens=1000, top_p=0.9, frequency_penalty=1)
+    print(content)
+    print(image_paths)
 
 
 if __name__ == '__main__':
-    test_use_api_key()
+    # test_use_api_key()
     # test_use_env()
-    # test_azure()
+    # test_azure()
+    test_qwen_vl_max()