-
Notifications
You must be signed in to change notification settings - Fork 0
/
calculate_chunk_size.py
26 lines (22 loc) · 1.11 KB
/
calculate_chunk_size.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import (UnstructuredFileLoader,
PyPDFLoader,
UnstructuredPDFLoader,
PDFPlumberLoader,
PDFMinerLoader,
PyMuPDFLoader,
PyPDFium2Loader)
from chinese_text_splitter import ChineseTextSplitter
import sys  # stdlib; imported here because only this script body needs it

# Path of the PDF to analyse. The first command-line argument is used when one
# is given; otherwise the in-file default below applies (edit it to hard-code
# a path).
filepath = sys.argv[1] if len(sys.argv) > 1 else ""
if not filepath:
    # Stop early with a clear message rather than handing an empty path to
    # the PDF loader.
    raise SystemExit(f"usage: python {sys.argv[0]} <path-to-pdf>")

# Load the PDF file.
# Alternative loader that was tried: PyMuPDFLoader(file_path=filepath)
loader = UnstructuredPDFLoader(file_path=filepath)

# Alternative splitter that was tried:
# RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=0, separators=["\n\n", " \n \n", "\n \n",])
splitter = ChineseTextSplitter(chunk_size=200, chunk_overlap=0, pdf=True)

# Load the document and split it into chunks in a single call.
docs = loader.load_and_split(splitter)

# Compute the largest chunk size. The length is measured on each chunk's text
# (`page_content`), not on the chunk object itself; `default=0` keeps `max()`
# from raising when the split produced no chunks.
max_chunk_size = max((len(chunk.page_content) for chunk in docs), default=0)

print(docs)
print(len(docs))
print(f"最大的chunk_size是:{max_chunk_size}")