""" # Example: Retrieve from pre-built index of SciFact This script shows how to load an index built with BM25.index and saved with BM25.save, and retrieve the top-k results for a set of queries from the SciFact dataset, via the BEIR library. """ import shutil import tempfile import beir.util from beir.datasets.data_loader import GenericDataLoader import Stemmer import bm25s from bm25s.utils.beir import BASE_URL from bm25s.tokenization import Tokenizer, Tokenized def main(data_dir="datasets", dataset="scifact"): # Load the queries from BEIR data_path = beir.util.download_and_unzip(BASE_URL.format(dataset), data_dir) loader = GenericDataLoader(data_folder=data_path) corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split='test') corpus_lst = [doc["title"] + " " + doc["text"] for doc in corpus.values()] queries_lst = list(queries.values()) # Initialize the stemmer stemmer = Stemmer.Stemmer("english") # Initialize the Tokenizer with the stemmer tokenizer = Tokenizer( stemmer=stemmer, lower=True, # lowercase the tokens stopwords="english", # or pass a list of stopwords splitter=r"\w+", # by default r"(?u)\b\w\w+\b", can also be a function ) # Tokenize the corpus corpus_tokenized = tokenizer.tokenize( corpus_lst, update_vocab=True, # update the vocab as we tokenize return_as="ids" ) # stream tokenizing the queries, without updating the vocabulary # note: this cannot return as string due to the streaming nature tokenizer_stream = tokenizer.streaming_tokenize( queries_lst, update_vocab=False ) query_ids = [] for q in tokenizer_stream: # you can do something with the ids here, e.g. retrieve from the index if 1 in q: query_ids.append(q) # you can convert the ids to a Tokenized namedtuple ids and tokens... res = tokenizer.to_tokenized_tuple(query_ids) # ... which is equivalent to: # tokenizer.tokenize(your_query_lst, return_as="tuple", update_vocab=False) # You can verify the results assert res.ids == query_ids assert res.vocab == tokenizer.get_vocab_dict() assert isinstance(res, Tokenized) # You can also get strings query_strs = tokenizer.decode(query_ids) # ... which is equivalent to: # tokenizer.tokenize(your_query_lst, return_as="string", update_vocab=False) # let's verify the results assert isinstance(query_strs, list) assert isinstance(query_strs[0], list) assert isinstance(query_strs[0][0], str) # Let's see how it's all used retriever = bm25s.BM25() retriever.index(corpus_tokenized, leave_progress=False) # all of the above can be passed to index a bm25s model # e.g. using the ids directly results, scores = retriever.retrieve(query_ids, k=3) # or passing the strings results, scores = retriever.retrieve(query_strs, k=3) # or passing the Tokenized namedtuple results, scores = retriever.retrieve(res, k=3) # or passing a tuple of ids and vocab dict vocab_dict = tokenizer.get_vocab_dict() results, scores = retriever.retrieve((query_ids, vocab_dict), k=3) # If you want, you can save the vocab and stopwords, it can be the same dir as your index your_index_dir = tempfile.mkdtemp() tokenizer.save_vocab(save_dir=your_index_dir) # Unhappy with your vocab? 

    # Unhappy with your vocab? You can reset your tokenizer
    tokenizer.reset_vocab()

    # Load the saved vocab into a fresh tokenizer:
    new_tokenizer = Tokenizer(
        stemmer=stemmer,
        lower=True,
        stopwords=[],
        splitter=r"\w+",
    )
    print("Vocabulary size before reloading:", len(new_tokenizer.get_vocab_dict()))
    new_tokenizer.load_vocab(your_index_dir)
    print("Vocabulary size after reloading:", len(new_tokenizer.get_vocab_dict()))

    # The same can be done for stopwords
    print("stopwords before reloading:", new_tokenizer.stopwords)
    tokenizer.save_stopwords(save_dir=your_index_dir)
    new_tokenizer.load_stopwords(your_index_dir)
    print("stopwords after reloading:", new_tokenizer.stopwords)

    # Cleanup
    shutil.rmtree(your_index_dir)


if __name__ == "__main__":
    main()