-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_vectorstore.py
81 lines (65 loc) · 2.48 KB
/
create_vectorstore.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
"""CREATE_VECTORSTORE.PY
This script creates a Faiss vectorstore from the faculty bios.
Dependencies: `dotenv`, `faiss-cpu`, `langchain`, `tqdm`
"""
# Library imports
import os
import time
import json
from tqdm import tqdm
from dotenv import load_dotenv
from langchain.vectorstores.faiss import FAISS
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Load API key
load_dotenv(".env")  # read variables from a local .env file into os.environ
# Fail fast at import time with a KeyError if the HF token is not configured.
KEY = os.environ["HUGGINGFACEHUB_API_TOKEN"]
def get_data(path: str) -> list:
    """Load faculty bios from a JSON file and split them into chunks.

    Args:
        path: Path to a JSON file holding a list of bio records, each with
            "bio", "name", "position", "email", "expertise" and "profile"
            keys (a record's "bio" may be None, in which case it is skipped).

    Returns:
        List of LangChain Document fragments produced by
        RecursiveCharacterTextSplitter (chunk_size=1000, chunk_overlap=100),
        with the remaining record fields carried in each Document's metadata.
    """
    # Load documents
    print("Loading documents...")
    # Context manager so the file handle is closed (the original
    # `json.load(open(path))` leaked it); explicit encoding for portability.
    with open(path, "r", encoding="utf-8") as f:
        bios = json.load(f)
    documents = []
    none_count = 0
    for b in bios:
        if b["bio"] is None:
            # Skip records without a bio; count them for the summary line.
            none_count += 1
            continue
        documents.append(Document(
            page_content=b["bio"],
            metadata={
                "name": b["name"],
                "position": b["position"],
                "email": b["email"],
                "expertise": b["expertise"],
                "profile": b["profile"],
            },
        ))
    print(f"Loaded {len(documents)} documents. {none_count} documents skipped since bio was None")
    print(f"Splitting {len(documents)} documents...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    fragments = splitter.split_documents(documents)
    return fragments
def get_vectorstore(fragments: list, model: HuggingFaceInferenceAPIEmbeddings,
                    save_path: str) -> FAISS:
    """Embed document fragments in batches and build a merged FAISS store.

    Args:
        fragments: LangChain Documents to embed (must be non-empty).
        model: HuggingFace Inference API embeddings model.
        save_path: Path prefix; each partial store is saved as
            "{save_path}_{i}" and the merged store as "{save_path}_merged".

    Returns:
        The merged FAISS vectorstore.

    Raises:
        ValueError: If `fragments` is empty (nothing to embed).
    """
    if not fragments:
        # Clearer than the IndexError the code below would otherwise raise.
        raise ValueError("fragments is empty; nothing to embed")
    # Create vectorstore (400 docs at a time, else HuggingFace API blocks)
    print(f"Creating vectorstore with {len(fragments)} documents...")
    dbs = []
    for i in tqdm(range(0, len(fragments), 400)):
        db = FAISS.from_documents(fragments[i : i+400], model)
        db.save_local(f"{save_path}_{i}")  # checkpoint each partial store
        dbs.append(db)
        time.sleep(5)  # throttle to avoid HF API rate limiting
    # Merge partial vectorstores into the first one and save final copy.
    for partial in tqdm(dbs[1:]):
        dbs[0].merge_from(partial)
    # BUG FIX: the original saved the loop variable `db` — i.e. the LAST
    # PARTIAL store — as the merged copy. Save the actually-merged dbs[0].
    dbs[0].save_local(f"{save_path}_merged")
    # Return the merged store so the `-> FAISS` annotation is honored
    # (the original returned None despite it).
    return dbs[0]
def main():
    """Entry point: build the embeddings model, then create the vectorstore."""
    # NOTE(review): HF publishes this model as "all-MiniLM-L6-v2" (capital L);
    # confirm the lowercase "l6" id resolves on the Inference API.
    embedder = HuggingFaceInferenceAPIEmbeddings(
        api_key=KEY, model_name="sentence-transformers/all-MiniLM-l6-v2"
    )
    # Load + split the bios, then embed and persist the FAISS index.
    docs = get_data("data_new/json/bios.json")
    get_vectorstore(docs, embedder, "data_new/faiss/bios_faiss")
if __name__ == "__main__":
    main()