From 0dd5fa7e94d7c275c5abd3a35acf64fbeb3013fb Mon Sep 17 00:00:00 2001 From: Jimmy Lin Date: Sat, 31 Aug 2024 10:53:20 -0400 Subject: [PATCH] Add bindings to Lucene's flat dense searcher: single thread ONNX on BEIR (#1973) --- docs/prebuilt-indexes.md | 126 +++++- pyserini/prebuilt_index_info.py | 386 +++++------------- ...bge-base-en-v1.5.20240618.6cf601.README.md | 20 + ...bge-base-en-v1.5.20240223.43c9ec.README.md | 2 +- pyserini/search/lucene/__init__.py | 3 +- pyserini/search/lucene/__main__.py | 23 +- pyserini/search/lucene/_hnsw_searcher.py | 105 ++++- pyserini/util.py | 8 +- .../generate_docs_from_prebuilt_indexes.py | 9 +- tests/test_lucene_dense_search.py | 78 +++- tests/test_prebuilt_index.py | 23 +- 11 files changed, 478 insertions(+), 305 deletions(-) create mode 100644 pyserini/resources/index-metadata/lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md diff --git a/docs/prebuilt-indexes.md b/docs/prebuilt-indexes.md index 3e136314b..bbb5db24d 100644 --- a/docs/prebuilt-indexes.md +++ b/docs/prebuilt-indexes.md @@ -793,7 +793,7 @@ Detailed configuration information for the prebuilt indexes are stored in [`pyse -## Lucene Standard Impact Indexes +## Lucene Impact Indexes
MS MARCO
@@ -1125,6 +1125,130 @@ Detailed configuration information for the prebuilt indexes are stored in [`pyse
+## Lucene Flat Indexes +
+BEIR +
+
beir-v1.0.0-trec-covid.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'trec-covid' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-bioasq.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'bioasq' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-nfcorpus.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'nfcorpus' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-nq.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'nq' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-hotpotqa.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'hotpotqa' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-fiqa.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'fiqa' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-signal1m.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'signal1m' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-trec-news.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'trec-news' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-robust04.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'robust04' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-arguana.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'arguana' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'webis-touche2020' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'cqadupstack-android' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'cqadupstack-english' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'cqadupstack-gaming' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'cqadupstack-gis' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'cqadupstack-mathematica' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'cqadupstack-physics' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'cqadupstack-programmers' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'cqadupstack-stats' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'cqadupstack-tex' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'cqadupstack-unix' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'cqadupstack-webmasters' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'cqadupstack-wordpress' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-quora.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'quora' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'dbpedia-entity' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-scidocs.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'scidocs' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-fever.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'fever' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-climate-fever.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'climate-fever' encoded by BGE-base-en-v1.5. +
+
beir-v1.0.0-scifact.bge-base-en-v1.5.flat +[readme] +
Lucene flat index of BEIR collection 'scifact' encoded by BGE-base-en-v1.5. +
+
+
+ + ## Faiss Indexes
MS MARCO diff --git a/pyserini/prebuilt_index_info.py b/pyserini/prebuilt_index_info.py index ed8b84d73..b06964b5c 100644 --- a/pyserini/prebuilt_index_info.py +++ b/pyserini/prebuilt_index_info.py @@ -14,6 +14,25 @@ # limitations under the License. # +from pyserini.pyclass import autoclass + +JIndexInfo = autoclass('io.anserini.index.IndexInfo') + + +def add_lucene_index_info(enum, info, readme): + info[enum.indexName] = { + "description": enum.description, + "filename": enum.filename, + "readme": readme, + "urls": [ + enum.urls[0] + ], + "md5": enum.md5, + "downloaded": False + } + + +# Bindings for Lucene (standard) inverted indexes TF_INDEX_INFO_MSMARCO = { # MS MARCO V1 document corpus, three indexes with different amounts of information (and sizes). "msmarco-v1-doc": { @@ -2637,6 +2656,8 @@ **TF_INDEX_INFO_OTHER, **TF_INDEX_INFO_OTHER_ALIASES} + +# Bindings for Lucene impact indexes IMPACT_INDEX_INFO_MSMARCO = { "msmarco-v1-passage.slimr": { "description": "Lucene impact index of the MS MARCO V1 passage corpus enoded by SLIM trained with BM25 negatives.", @@ -3343,301 +3364,82 @@ **IMPACT_INDEX_INFO_MSMARCO_ALIASES, **IMPACT_INDEX_INFO_BEIR} -LUCENE_HNSW_INDEX_INFO_BEIR = { - "beir-v1.0.0-trec-covid.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'trec-covid' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-trec-covid.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-trec-covid.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "2c8cba8525f8ec6920dbb4f0b4a2e0a6", - "downloaded": False - }, - "beir-v1.0.0-bioasq.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'bioasq' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-bioasq.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": 
"lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-bioasq.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "2f4cde27ef5ec3be1193e06854fdaae6", - "downloaded": False - }, - "beir-v1.0.0-nfcorpus.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'nfcorpus' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "d0aa34bf35b59466e7064c424dd82e2c", - "downloaded": False - }, - "beir-v1.0.0-nq.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'nq' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-nq.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-nq.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "b0bbd85821c734125ffbc0f7ea8f75ae", - "downloaded": False - }, - "beir-v1.0.0-hotpotqa.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'hotpotqa' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "83129157f2138a2240b69f8f5404e579", - "downloaded": False - }, - "beir-v1.0.0-fiqa.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'fiqa' encoded by 
BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-fiqa.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-fiqa.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "f2e3191b9d047b88b4692ec3ac87acd0", - "downloaded": False - }, - "beir-v1.0.0-signal1m.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'signal1m' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-signal1m.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-signal1m.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "86a5dc12806c5e2f5f1e7cf646ef9004", - "downloaded": False - }, - "beir-v1.0.0-trec-news.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'trec-news' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-trec-news.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-trec-news.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "fcb8fae8c46c76931bde0ad51ecb86f8", - "downloaded": False - }, - "beir-v1.0.0-robust04.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'robust04' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-robust04.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-robust04.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "1b975602bf6b87e0a5815a254eb6e945", - 
"downloaded": False - }, - "beir-v1.0.0-arguana.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'arguana' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-arguana.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-arguana.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "468129157636526a3e96bc9427d62808", - "downloaded": False - }, - "beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'webis-touche2020' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "4639db80366f755bb552ce4c736c4aea", - "downloaded": False - }, - "beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'cqadupstack-android' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "f7e1f2e737756a84b0273794dcb1038f", - "downloaded": False - }, - "beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'cqadupstack-english' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": 
"lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "fcdb3fc633b2ca027111536ba422aaed", - "downloaded": False - }, - "beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'cqadupstack-gaming' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "d59b216b3df6eb1b724e2f20ceb14407", - "downloaded": False - }, - "beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'cqadupstack-gis' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "1dd42a28e388b30f42ede02565d445ca", - "downloaded": False - }, - "beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'cqadupstack-mathematica' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": 
"cda37cb1893409c67908cf3aab1467fe", - "downloaded": False - }, - "beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'cqadupstack-physics' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "82f71e086930c7d8c5fe423173b9bc2e", - "downloaded": False - }, - "beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'cqadupstack-programmers' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "a7a8e17dcef7b40fde2492436aab1458", - "downloaded": False - }, - "beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'cqadupstack-stats' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "7a304fa64332256976bed5049392605b", - "downloaded": False - }, - "beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'cqadupstack-tex' encoded by BGE-base-en-v1.5.", - "filename": 
"lucene-hnsw.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "bc5b41b294528611982615c0fcb7ebc7", - "downloaded": False - }, - "beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'cqadupstack-unix' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "e42e7b6f46239211f9e9a3ed521d30eb", - "downloaded": False - }, - "beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'cqadupstack-webmasters' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "21987ab658ba062397095226eb62aaf1", - "downloaded": False - }, - "beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'cqadupstack-wordpress' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - 
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "4e80be8087e8f282c42c2b57e377bb65", - "downloaded": False - }, - "beir-v1.0.0-quora.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'quora' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-quora.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-quora.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "064d785db557b011649d5f8b07237eb4", - "downloaded": False - }, - "beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'dbpedia-entity' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "323d47f84a54894ba5e6ca215999a533", - "downloaded": False - }, - "beir-v1.0.0-scidocs.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'scidocs' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-scidocs.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-scidocs.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "50668564faa9723160b1dba37afbf6d9", - "downloaded": False - }, - "beir-v1.0.0-fever.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'fever' encoded by BGE-base-en-v1.5.", - "filename": 
"lucene-hnsw.beir-v1.0.0-fever.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-fever.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "33f67e73786a41b454bf88ac2a7c21c7", - "downloaded": False - }, - "beir-v1.0.0-climate-fever.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'climate-fever' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-climate-fever.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-climate-fever.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "412337f9f8182e8ec6417bc3cd48288f", - "downloaded": False - }, - "beir-v1.0.0-scifact.bge-base-en-v1.5.hnsw": { - "description": "Lucene HNSW index of BEIR collection 'scifact' encoded by BGE-base-en-v1.5.", - "filename": "lucene-hnsw.beir-v1.0.0-scifact.bge-base-en-v1.5.20240223.43c9ec.tar.gz", - "readme": "lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-hnsw.beir-v1.0.0-scifact.bge-base-en-v1.5.20240223.43c9ec.tar.gz" - ], - "md5": "6de5a41a301575933fa9932f9ecb404d", - "downloaded": False - }, -} + +# Bindings for Lucene HNSW indexes +LUCENE_HNSW_INDEX_INFO_BEIR = {} + +# Metadata have already been defined in Anserini, just copy over into Pyserini. 
+add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_TREC_COVID_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_BIOASQ_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_NFCORPUS_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_NQ_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_HOTPOTQA_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_FIQA_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_SIGNAL1M_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_TREC_NEWS_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_ROBUST04_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_ARGUANA_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_WEBIS_TOUCHE2020_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') 
+add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_ANDROID_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_ENGLISH_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_GAMING_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_GIS_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_MATHEMATICA_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_PHYSICS_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_PROGRAMMERS_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_STATS_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_TEX_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_UNIX_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_WEBMASTERS_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 
'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_WORDPRESS_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_QUORA_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_DBPEDIA_ENTITY_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_SCIDOCS_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_FEVER_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CLIMATE_FEVER_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_SCIFACT_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR, 'lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md') LUCENE_HNSW_INDEX_INFO = {**LUCENE_HNSW_INDEX_INFO_BEIR} + +# Bindings for Lucene flat indexes +LUCENE_FLAT_INDEX_INFO_BEIR = {} + +# Metadata have already been defined in Anserini, just copy over into Pyserini. 
+add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_TREC_COVID_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_BIOASQ_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_NFCORPUS_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_NQ_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_HOTPOTQA_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_FIQA_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_SIGNAL1M_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_TREC_NEWS_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_ROBUST04_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_ARGUANA_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_WEBIS_TOUCHE2020_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') 
+add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_ANDROID_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_ENGLISH_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_GAMING_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_GIS_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_MATHEMATICA_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_PHYSICS_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_PROGRAMMERS_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_STATS_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_TEX_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_UNIX_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_WEBMASTERS_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 
'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CQADUPSTACK_WORDPRESS_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_QUORA_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_DBPEDIA_ENTITY_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_SCIDOCS_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_FEVER_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CLIMATE_FEVER_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') +add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_SCIFACT_BGE_BASE_EN_15_FLAT, LUCENE_FLAT_INDEX_INFO_BEIR, 'lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md') + +LUCENE_FLAT_INDEX_INFO = {**LUCENE_FLAT_INDEX_INFO_BEIR} + + +# Bindings for Faiss indexes FAISS_INDEX_INFO_MSMARCO = { "msmarco-v1-passage.cosdpr-distil": { "description": "Faiss flat index of the MS MARCO passage corpus encoded by cosDPR-distil.", diff --git a/pyserini/resources/index-metadata/lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md b/pyserini/resources/index-metadata/lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md new file mode 100644 index 000000000..dc364de03 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-flat.beir-v1.0.0.bge-base-en-v1.5.20240618.6cf601.README.md @@ -0,0 +1,20 @@ +# beir-v1.0.0.bge-base-en-v1.5 
(Flat) + +Lucene flat dense vector indexes of BEIR corpora using BGE-base-en-v1.5. + +These indexes were built 2024/06/18 on `orca` at Anserini commit [`6cf601`](https://github.com/castorini/anserini/commit/6e9ce8f56d08f9c72746b79f14208e45e3b7a81e) (2024/06/17), with Lucene 9.9.1. + +Here's the indexing command for `arguana`: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /store/collections/beir-v1.0.0/bge-base-en-v1.5/arguana \ + -generator DenseVectorDocumentGenerator \ + -index indexes/lucene-flat.beir-v1.0.0-arguana.bge-base-en-v1.5/ \ + -threads 16 -optimize \ + >& logs/log.flat.beir-v1.0.0-arguana.bge-base-en-v1.5.txt & +``` + +And the same for all the other collections. +Note that the indexes are optimized, i.e., merged down to a single segment. diff --git a/pyserini/resources/index-metadata/lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md b/pyserini/resources/index-metadata/lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md index e0d1897e9..a0b58d09b 100644 --- a/pyserini/resources/index-metadata/lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md +++ b/pyserini/resources/index-metadata/lucene-hnsw.beir-v1.0.0.bge-base-en-v1.5.20240223.43c9ec.README.md @@ -1,4 +1,4 @@ -# beir-v1.0.0.bge-base-en-v1.5 +# beir-v1.0.0.bge-base-en-v1.5 (HNSW) Lucene HNSW indexes of BEIR corpora using BGE-base-en-v1.5. 
diff --git a/pyserini/search/lucene/__init__.py b/pyserini/search/lucene/__init__.py index a7494118d..dd570c557 100644 --- a/pyserini/search/lucene/__init__.py +++ b/pyserini/search/lucene/__init__.py @@ -17,7 +17,7 @@ from ._geo_searcher import LuceneGeoSearcher from ._impact_searcher import JScoredDoc, LuceneImpactSearcher, SlimSearcher from ._searcher import JScoredDoc, LuceneSimilarities, LuceneFusionSearcher, LuceneSearcher -from ._hnsw_searcher import LuceneHnswDenseSearcher +from ._hnsw_searcher import LuceneHnswDenseSearcher, LuceneFlatDenseSearcher __all__ = ['JScoredDoc', 'LuceneFusionSearcher', @@ -25,5 +25,6 @@ 'LuceneImpactSearcher', 'LuceneSearcher', 'LuceneHnswDenseSearcher', + 'LuceneFlatDenseSearcher', 'SlimSearcher', 'LuceneSimilarities'] diff --git a/pyserini/search/lucene/__main__.py b/pyserini/search/lucene/__main__.py index d85cf6bb2..e3655e6a7 100644 --- a/pyserini/search/lucene/__main__.py +++ b/pyserini/search/lucene/__main__.py @@ -22,10 +22,9 @@ from pyserini.analysis import JDefaultEnglishAnalyzer, JWhiteSpaceAnalyzer from pyserini.output_writer import OutputFormat, get_output_writer -from pyserini.pyclass import autoclass from pyserini.query_iterator import get_query_iterator, TopicsFormat from pyserini.search import JDisjunctionMaxQueryGenerator -from . import LuceneImpactSearcher, LuceneSearcher, SlimSearcher, LuceneHnswDenseSearcher +from . import LuceneImpactSearcher, LuceneSearcher, SlimSearcher, LuceneHnswDenseSearcher, LuceneFlatDenseSearcher from .reranker import ClassifierType, PseudoRelevanceClassifierReranker @@ -156,7 +155,23 @@ def define_search_args(parser): topics = query_iterator.topics if args.dense: - searcher = LuceneHnswDenseSearcher(args.index, ef_search=args.ef_search, encoder=args.onnx_encoder) + # Note that it's not actually necessary to check if it's a prebuilt index or an index location; + # The underlying Lucene searcher handles the download of prebuilt indexes transparently.
+ # Nevertheless, we still do this to be explicit and have Python manage all index downloads. + if os.path.exists(args.index): + if args.hnsw: + searcher = LuceneHnswDenseSearcher(args.index, ef_search=args.ef_search, encoder=args.onnx_encoder) + elif args.flat: + searcher = LuceneFlatDenseSearcher(args.index, encoder=args.onnx_encoder) + else: + raise ValueError(f'Unrecognized dense vector index type: must set either --hnsw or --flat') + else: + if args.hnsw: + searcher = LuceneHnswDenseSearcher.from_prebuilt_index(args.index, ef_search=args.ef_search, encoder=args.onnx_encoder) + elif args.flat: + searcher = LuceneFlatDenseSearcher.from_prebuilt_index(args.index, encoder=args.onnx_encoder) + else: + raise ValueError(f'Unrecognized dense vector index type: must set either --hnsw or --flat') elif not args.impact: if os.path.exists(args.index): # create searcher from index directory @@ -296,6 +311,8 @@ def define_search_args(parser): text = text.join(toks) if args.batch_size <= 1 and args.threads <= 1: if args.dense: + # Both LuceneHnswDenseSearcher and LuceneFlatDenseSearcher are called the same way, + # so we don't need to differentiate. 
hits = searcher.search(text, args.hits) elif args.impact: hits = searcher.search(text, args.hits, fields=fields) diff --git a/pyserini/search/lucene/_hnsw_searcher.py b/pyserini/search/lucene/_hnsw_searcher.py index f04bc7948..68be64a1a 100644 --- a/pyserini/search/lucene/_hnsw_searcher.py +++ b/pyserini/search/lucene/_hnsw_searcher.py @@ -25,6 +25,10 @@ # Wrappers around Anserini classes JHnswDenseSearcher = autoclass('io.anserini.search.HnswDenseSearcher') JHnswDenseSearcherArgs = autoclass('io.anserini.search.HnswDenseSearcher$Args') + +JFlatDenseSearcher = autoclass('io.anserini.search.FlatDenseSearcher') +JFlatDenseSearcherArgs = autoclass('io.anserini.search.FlatDenseSearcher$Args') + JScoredDoc = autoclass('io.anserini.search.ScoredDoc') @@ -52,7 +56,7 @@ def __init__(self, index_dir: str, ef_search=100, encoder=None, prebuilt_index_n self.prebuilt_index_name = prebuilt_index_name @classmethod - def from_prebuilt_index(cls, prebuilt_index_name: str, encoder=None, verbose=False): + def from_prebuilt_index(cls, prebuilt_index_name: str, ef_search=100, encoder=None, verbose=False): """Build a searcher from a prebuilt index; download the index if necessary. Parameters @@ -81,6 +85,105 @@ def from_prebuilt_index(cls, prebuilt_index_name: str, encoder=None, verbose=Fal if verbose: print(f'Initializing {prebuilt_index_name}...') + return cls(index_dir, ef_search=ef_search, encoder=encoder, prebuilt_index_name=prebuilt_index_name) + + def search(self, q: str, k: int = 10) -> List[JScoredDoc]: + """Search the collection. + + Parameters + ---------- + q : str + Query string. + k : int + Number of hits to return. + + Returns + ------- + List[JScoredDoc] + List of search results. + """ + + return self.searcher.search(q, k) + + def batch_search(self, queries: List[str], qids: List[str], k: int = 10, threads: int = 1) -> Dict[str, List[JScoredDoc]]: + """Search the collection concurrently for multiple queries, using multiple threads. 
+ + Parameters + ---------- + queries : List[str] + List of query strings. + qids : List[str] + List of corresponding query ids. + k : int + Number of hits to return. + threads : int + Maximum number of threads to use. + + Returns + ------- + Dict[str, List[JScoredDoc]] + Dictionary holding the search results, with the query ids as keys and the corresponding lists of search + results as the values. + """ + pass + + def close(self): + """Close the searcher.""" + self.searcher.close() + + +class LuceneFlatDenseSearcher: + """Wrapper class for ``FlatDenseSearcher`` in Anserini. + + Parameters + ---------- + index_dir : str + Path to Lucene index directory. + """ + + def __init__(self, index_dir: str, encoder=None, prebuilt_index_name=None): + self.index_dir = index_dir + + args = JFlatDenseSearcherArgs() + args.index = index_dir + + if encoder: + args.encoder = encoder + + self.searcher = JFlatDenseSearcher(args) + + # Keep track if self is a known prebuilt index. + self.prebuilt_index_name = prebuilt_index_name + + @classmethod + def from_prebuilt_index(cls, prebuilt_index_name: str, encoder=None, verbose=False): + """Build a searcher from a prebuilt index; download the index if necessary. + + Parameters + ---------- + prebuilt_index_name : str + Prebuilt index name. + encoder : str + Encoder name. + verbose : bool + Print status information. + + Returns + ------- + LuceneFlatDenseSearcher + Searcher initialized from the prebuilt index. 
+ """ + if verbose: + print(f'Attempting to initialize prebuilt index {prebuilt_index_name}.') + try: + index_dir = download_prebuilt_index(prebuilt_index_name, verbose=verbose) + except ValueError as e: + print(str(e)) + return None + + if verbose: + print(f'Initializing {prebuilt_index_name}...') + return cls(index_dir, encoder=encoder, prebuilt_index_name=prebuilt_index_name) def search(self, q: str, k: int = 10) -> List[JScoredDoc]: diff --git a/pyserini/util.py b/pyserini/util.py index 2cd85daaa..1649b3585 100644 --- a/pyserini/util.py +++ b/pyserini/util.py @@ -29,7 +29,8 @@ from pyserini.encoded_query_info import QUERY_INFO from pyserini.encoded_corpus_info import CORPUS_INFO from pyserini.evaluate_script_info import EVALUATION_INFO -from pyserini.prebuilt_index_info import TF_INDEX_INFO, FAISS_INDEX_INFO, IMPACT_INDEX_INFO, LUCENE_HNSW_INDEX_INFO +from pyserini.prebuilt_index_info import TF_INDEX_INFO, IMPACT_INDEX_INFO, \ + LUCENE_HNSW_INDEX_INFO, LUCENE_FLAT_INDEX_INFO, FAISS_INDEX_INFO logger = logging.getLogger(__name__) @@ -222,16 +223,21 @@ def download_prebuilt_index(index_name, force=False, verbose=True, mirror=None): if (index_name not in TF_INDEX_INFO and index_name not in IMPACT_INDEX_INFO and index_name not in LUCENE_HNSW_INDEX_INFO and + index_name not in LUCENE_FLAT_INDEX_INFO and index_name not in FAISS_INDEX_INFO): raise ValueError(f'Unrecognized index name {index_name}') + if index_name in TF_INDEX_INFO: target_index = TF_INDEX_INFO[index_name] elif index_name in IMPACT_INDEX_INFO: target_index = IMPACT_INDEX_INFO[index_name] elif index_name in LUCENE_HNSW_INDEX_INFO: target_index = LUCENE_HNSW_INDEX_INFO[index_name] + elif index_name in LUCENE_FLAT_INDEX_INFO: + target_index = LUCENE_FLAT_INDEX_INFO[index_name] else: target_index = FAISS_INDEX_INFO[index_name] + index_md5 = target_index['md5'] for url in target_index['urls']: local_filename = target_index['filename'] if 'filename' in target_index else None diff --git 
a/scripts/generate_docs_from_prebuilt_indexes.py b/scripts/generate_docs_from_prebuilt_indexes.py index f49953f9f..c3bca9af1 100644 --- a/scripts/generate_docs_from_prebuilt_indexes.py +++ b/scripts/generate_docs_from_prebuilt_indexes.py @@ -124,7 +124,7 @@ def generate_prebuilt(index): generate_prebuilt(TF_INDEX_INFO_OTHER) print('
') - print('\n\n## Lucene Standard Impact Indexes') + print('\n\n## Lucene Impact Indexes') print('
') print('MS MARCO') @@ -143,6 +143,13 @@ def generate_prebuilt(index): generate_prebuilt(LUCENE_HNSW_INDEX_INFO_BEIR) print('
') + print('\n\n## Lucene Flat Indexes') + + print('
') + print('BEIR') + generate_prebuilt(LUCENE_FLAT_INDEX_INFO_BEIR) + print('
') + print('\n\n## Faiss Indexes') print('
') diff --git a/tests/test_lucene_dense_search.py b/tests/test_lucene_dense_search.py index 4e5f93ad8..01bf01017 100644 --- a/tests/test_lucene_dense_search.py +++ b/tests/test_lucene_dense_search.py @@ -13,15 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. # - +import glob +import os import unittest +from pyserini.util import get_cache_home from pyserini.search import get_topics -from pyserini.search.lucene import LuceneHnswDenseSearcher +from pyserini.search.lucene import LuceneHnswDenseSearcher, LuceneFlatDenseSearcher class TestLuceneDenseSearch(unittest.TestCase): - def test_lucene_hnsw_searcher(self): + def test_lucene_hnsw_dense_searcher(self): searcher = LuceneHnswDenseSearcher.from_prebuilt_index( 'beir-v1.0.0-arguana.bge-base-en-v1.5.hnsw', encoder='BgeBaseEn15') topics = get_topics('beir-v1.0.0-arguana-test') @@ -49,6 +51,76 @@ def test_lucene_hnsw_searcher(self): self.assertAlmostEqual(hits[3].score, 0.888884, places=5) self.assertAlmostEqual(hits[4].score, 0.883876, places=5) + # We now run the same test again, but passing in a physical index directory location, as opposed to + # a symbol representing the index. 
+ pattern = os.path.join(get_cache_home(), 'indexes/lucene-hnsw.beir-v1.0.0-arguana.bge-base-en-v1.5*') + directories = [d for d in glob.glob(pattern, recursive=False) if os.path.isdir(d)] + + self.assertEqual(len(directories), 1) + index_dir = directories[0] + + searcher = LuceneHnswDenseSearcher(index_dir, encoder='BgeBaseEn15') + hits = searcher.search(q, k=5) + + self.assertEqual(len(hits), 5) + self.assertEqual(hits[1].docid, 'test-culture-ahrtsdlgra-con03a') + self.assertEqual(hits[2].docid, 'test-culture-ahrtsdlgra-con01b') + self.assertEqual(hits[3].docid, 'test-culture-ahrtsdlgra-pro02b') + self.assertEqual(hits[4].docid, 'test-culture-ahrtsdlgra-pro02a') + self.assertAlmostEqual(hits[1].score, 0.893029, places=5) + self.assertAlmostEqual(hits[2].score, 0.892179, places=5) + self.assertAlmostEqual(hits[3].score, 0.888884, places=5) + self.assertAlmostEqual(hits[4].score, 0.883876, places=5) + + def test_lucene_flat_dense_searcher(self): + searcher = LuceneFlatDenseSearcher.from_prebuilt_index( + 'beir-v1.0.0-arguana.bge-base-en-v1.5.flat', encoder='BgeBaseEn15') + topics = get_topics('beir-v1.0.0-arguana-test') + qid = 'test-culture-ahrtsdlgra-con01a' + q = topics[qid]['title'] + + hits = searcher.search(q, k=5) + + # Ground truth results + # test-culture-ahrtsdlgra-con01a Q0 test-culture-ahrtsdlgra-con03a 1 0.893029 Anserini + # test-culture-ahrtsdlgra-con01a Q0 test-culture-ahrtsdlgra-con01b 2 0.892179 Anserini + # test-culture-ahrtsdlgra-con01a Q0 test-culture-ahrtsdlgra-pro02b 3 0.888884 Anserini + # test-culture-ahrtsdlgra-con01a Q0 test-culture-ahrtsdlgra-pro02a 4 0.883876 Anserini + + self.assertEqual(len(hits), 5) + + # Skip first hit, because it's just the query. 
+ self.assertEqual(hits[1].docid, 'test-culture-ahrtsdlgra-con03a') + self.assertEqual(hits[2].docid, 'test-culture-ahrtsdlgra-con01b') + self.assertEqual(hits[3].docid, 'test-culture-ahrtsdlgra-pro02b') + self.assertEqual(hits[4].docid, 'test-culture-ahrtsdlgra-pro02a') + + self.assertAlmostEqual(hits[1].score, 0.893029, places=5) + self.assertAlmostEqual(hits[2].score, 0.892179, places=5) + self.assertAlmostEqual(hits[3].score, 0.888884, places=5) + self.assertAlmostEqual(hits[4].score, 0.883876, places=5) + + # We now run the same test again, but passing in a physical index directory location, as opposed to + # a symbol representing the index. + pattern = os.path.join(get_cache_home(), 'indexes/lucene-flat.beir-v1.0.0-arguana.bge-base-en-v1.5*') + directories = [d for d in glob.glob(pattern, recursive=False) if os.path.isdir(d)] + + self.assertEqual(len(directories), 1) + index_dir = directories[0] + + searcher = LuceneFlatDenseSearcher(index_dir, encoder='BgeBaseEn15') + hits = searcher.search(q, k=5) + + self.assertEqual(len(hits), 5) + self.assertEqual(hits[1].docid, 'test-culture-ahrtsdlgra-con03a') + self.assertEqual(hits[2].docid, 'test-culture-ahrtsdlgra-con01b') + self.assertEqual(hits[3].docid, 'test-culture-ahrtsdlgra-pro02b') + self.assertEqual(hits[4].docid, 'test-culture-ahrtsdlgra-pro02a') + self.assertAlmostEqual(hits[1].score, 0.893029, places=5) + self.assertAlmostEqual(hits[2].score, 0.892179, places=5) + self.assertAlmostEqual(hits[3].score, 0.888884, places=5) + self.assertAlmostEqual(hits[4].score, 0.883876, places=5) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_prebuilt_index.py b/tests/test_prebuilt_index.py index 53e7e2c1a..96e56517e 100644 --- a/tests/test_prebuilt_index.py +++ b/tests/test_prebuilt_index.py @@ -17,10 +17,19 @@ import requests import unittest -from pyserini.prebuilt_index_info import TF_INDEX_INFO, IMPACT_INDEX_INFO, FAISS_INDEX_INFO, LUCENE_HNSW_INDEX_INFO +from pyserini.pyclass import autoclass 
+from pyserini.prebuilt_index_info import TF_INDEX_INFO, IMPACT_INDEX_INFO, \ + LUCENE_HNSW_INDEX_INFO, LUCENE_FLAT_INDEX_INFO, FAISS_INDEX_INFO class TestPrebuiltIndexes(unittest.TestCase): + def test_index_inf(self): + # Test the accessibility of IndexInfo on the Anserini end to make sure everything is "connected together" + JIndexInfo = autoclass('io.anserini.index.IndexInfo') + + self.assertEqual(JIndexInfo.BEIR_V1_0_0_ARGUANA_BGE_BASE_EN_15_FLAT.indexName, 'beir-v1.0.0-arguana.bge-base-en-v1.5.flat') + self.assertEqual(JIndexInfo.BEIR_V1_0_0_ARGUANA_BGE_BASE_EN_15_FLAT.urls[0], 'https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-flat.beir-v1.0.0-arguana.bge-base-en-v1.5.20240618.6cf601.tar.gz') + def test_lucene_tf_beir(self): urls = [] cnt = 0 @@ -122,6 +131,18 @@ def test_lucene_hnsw_beir(self): self.assertEqual(cnt, 29) self._test_urls(urls) + def test_lucene_flat_beir(self): + urls = [] + cnt = 0 + for key in LUCENE_FLAT_INDEX_INFO: + if 'beir' in key: + cnt += 1 + for url in LUCENE_FLAT_INDEX_INFO[key]['urls']: + urls.append(url) + + self.assertEqual(cnt, 29) + self._test_urls(urls) + def test_faiss_beir(self): urls = [] cnt = 0