diff --git a/integrations/dense/test_encode.py b/integrations/dense/test_encode.py index fffc30524..9dbf736ea 100644 --- a/integrations/dense/test_encode.py +++ b/integrations/dense/test_encode.py @@ -16,6 +16,7 @@ """Integration tests for create dense index """ +import faiss import os import shutil import unittest @@ -23,7 +24,6 @@ from pyserini.search.lucene import LuceneImpactSearcher from urllib.request import urlretrieve - class TestSearchIntegration(unittest.TestCase): def setUp(self): curdir = os.getcwd() @@ -33,7 +33,7 @@ def setUp(self): self.pyserini_root = '.' self.temp_folders = [] self.corpus_url = 'https://github.com/castorini/anserini-data/raw/master/CACM/corpus/jsonl/cacm.json' - self.corpus_path = f"{self.pyserini_root}/integrations/dense/temp_cacm/" + self.corpus_path = f'{self.pyserini_root}/integrations/dense/temp_cacm/' os.makedirs(self.corpus_path, exist_ok=True) self.temp_folders.append(self.corpus_path) urlretrieve(self.corpus_url, os.path.join(self.corpus_path, 'cacm.json')) @@ -41,33 +41,79 @@ def setUp(self): def test_dpr_encode_as_faiss(self): index_dir = f'{self.pyserini_root}/temp_index' self.temp_folders.append(index_dir) - cmd1 = f"python -m pyserini.encode input --corpus {self.corpus_path} \ + cmd1 = f'python -m pyserini.encode input --corpus {self.corpus_path} \ --fields text \ output --embeddings {index_dir} --to-faiss \ encoder --encoder facebook/dpr-ctx_encoder-multiset-base \ --fields text \ --batch 4 \ - --device cpu" + --device cpu' _ = os.system(cmd1) searcher = FaissSearcher( index_dir, 'facebook/dpr-question_encoder-multiset-base' ) - q_emb, hit = searcher.search("What is the solution of separable closed queueing networks?", k=1, return_vector=True) + q_emb, hit = searcher.search('What is the solution of separable closed queueing networks?', k=1, return_vector=True) self.assertEqual(hit[0].docid, 'CACM-2445') self.assertAlmostEqual(hit[0].vectors[0], -6.88267112e-01, places=4) self.assertEqual(searcher.num_docs, 3204) + def test_dpr_encode_as_faiss_search_with_partitions(self): + # Create two partitions of the CACM index, search them individually, and merge results to compute top hit + index_dir = f'{self.pyserini_root}/temp_index' + os.makedirs(os.path.join(index_dir, 'partition1'), exist_ok=True) + os.makedirs(os.path.join(index_dir, 'partition2'), exist_ok=True) + self.temp_folders.append(index_dir) + cmd1 = f'python -m pyserini.encode input --corpus {self.corpus_path} \ + --fields text \ + output --embeddings {index_dir} --to-faiss \ + encoder --encoder facebook/dpr-ctx_encoder-multiset-base \ + --fields text \ + --batch 4 \ + --device cpu' + _ = os.system(cmd1) + index = faiss.read_index(os.path.join(index_dir, 'index')) + new_index_partition1 = faiss.IndexFlatIP(index.d) + new_index_partition2 = faiss.IndexFlatIP(index.d) + vectors_partition1 = index.reconstruct_n(0, index.ntotal // 2) + vectors_partition2 = index.reconstruct_n(index.ntotal // 2, index.ntotal - index.ntotal // 2) + new_index_partition1.add(vectors_partition1) + new_index_partition2.add(vectors_partition2) + + faiss.write_index(new_index_partition1, os.path.join(index_dir, 'partition1/index')) + faiss.write_index(new_index_partition2, os.path.join(index_dir, 'partition2/index')) + + with open(os.path.join(index_dir, 'partition1/docid'), 'w') as docid1, open(os.path.join(index_dir, 'partition2/docid'), 'w') as docid2: + with open(os.path.join(index_dir, 'docid'), 'r') as file: + for i in range(index.ntotal): + line = next(file) + if i < (index.ntotal // 2): + docid1.write(line) + else: + docid2.write(line) + + searcher_partition1 = FaissSearcher(index_dir + '/partition1','facebook/dpr-question_encoder-multiset-base') + searcher_partition2 = FaissSearcher(index_dir + '/partition2','facebook/dpr-question_encoder-multiset-base') + q_emb, hit1 = searcher_partition1.search('What is the solution of separable closed queueing networks?', k=2, return_vector=True) + q_emb, hit2 = searcher_partition2.search('What is the solution of separable closed queueing networks?', k=2, return_vector=True) + merged_hits = hit1 + hit2 + merged_hits.sort(key=lambda x: x.score, reverse=True) + + self.assertEqual(merged_hits[0].docid, 'CACM-2445') + self.assertAlmostEqual(merged_hits[0].vectors[0], -6.88267112e-01, places=4) + self.assertEqual(searcher_partition1.num_docs, 1602) + self.assertEqual(searcher_partition2.num_docs, 1602) + def test_unicoil_encode_as_jsonl(self): embedding_dir = f'{self.pyserini_root}/temp_embeddings' self.temp_folders.append(embedding_dir) - cmd1 = f"python -m pyserini.encode input --corpus {self.corpus_path} \ + cmd1 = f'python -m pyserini.encode input --corpus {self.corpus_path} \ --fields text \ output --embeddings {embedding_dir} \ encoder --encoder castorini/unicoil-msmarco-passage \ --fields text \ --batch 4 \ - --device cpu" + --device cpu' _ = os.system(cmd1) index_dir = f'{self.pyserini_root}/temp_lucene' self.temp_folders.append(index_dir) @@ -78,7 +124,7 @@ def test_unicoil_encode_as_jsonl(self): -impact -pretokenized -threads 12 -storeRaw' _ = os.system(cmd2) searcher = LuceneImpactSearcher(index_dir, query_encoder='castorini/unicoil-msmarco-passage') - hits = searcher.search("What is the solution of separable closed queueing networks?", k=1) + hits = searcher.search('What is the solution of separable closed queueing networks?', k=1) hit = hits[0] self.assertEqual(hit.docid, 'CACM-2712') self.assertAlmostEqual(hit.score, 18.401899337768555, places=4)