From bc139014a6e9248d8d7da337e683c8bad190e5dd Mon Sep 17 00:00:00 2001 From: Jimmy Lin Date: Tue, 24 Sep 2024 21:27:25 -0400 Subject: [PATCH] Add bindings to Lucene HNSW indexes for MS MARCO v1 passage (#1993) --- docs/2cr/msmarco-v1-doc.html | 61 +- docs/2cr/msmarco-v1-passage.html | 767 ++++++++++++++---- docs/2cr/msmarco-v2-doc.html | 18 +- docs/2cr/msmarco-v2-passage.html | 22 +- docs/prebuilt-indexes.md | 21 + pyserini/2cr/msmarco-v1-doc.yaml | 56 +- pyserini/2cr/msmarco-v1-passage.yaml | 310 ++++--- pyserini/2cr/msmarco-v2-doc.yaml | 16 +- pyserini/2cr/msmarco-v2-passage.yaml | 18 +- pyserini/2cr/msmarco.py | 16 +- pyserini/2cr/msmarco_html_v1_doc.template | 5 +- pyserini/2cr/msmarco_html_v1_passage.template | 5 +- pyserini/2cr/msmarco_html_v2_doc.template | 2 +- pyserini/2cr/msmarco_html_v2_passage.template | 2 +- pyserini/prebuilt_index_info.py | 32 +- ...bge-base-en-v1.5.20240117.53514b.README.md | 28 + ...ge.cosdpr-distil.20240108.825148.README.md | 28 + .../generate_docs_from_prebuilt_indexes.py | 5 + 18 files changed, 1042 insertions(+), 370 deletions(-) create mode 100644 pyserini/resources/index-metadata/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5.20240117.53514b.README.md create mode 100644 pyserini/resources/index-metadata/lucene-hnsw.msmarco-v1-passage.cosdpr-distil.20240108.825148.README.md diff --git a/docs/2cr/msmarco-v1-doc.html b/docs/2cr/msmarco-v1-doc.html index 9777abfc3..4da6c2b7f 100644 --- a/docs/2cr/msmarco-v1-doc.html +++ b/docs/2cr/msmarco-v1-doc.html @@ -131,7 +131,7 @@ ">
-

MS MARCO V1 Document

+

  MS MARCO V1 Document Regressions

@@ -284,11 +284,11 @@

MS MARCO V1 Document

- + [1] (1b) -BM25 doc segmented (k1=0.9, b=0.4) +BM25 doc seg (k1=0.9, b=0.4) 0.2449 0.5302 0.6871 @@ -508,11 +508,11 @@

MS MARCO V1 Document

- + [1] (1d) -BM25+RM3 doc segmented (k1=0.9, b=0.4) +BM25+RM3 doc seg (k1=0.9, b=0.4) 0.2892 0.5684 0.7368 @@ -732,11 +732,11 @@

MS MARCO V1 Document

- + -BM25+Rocchio doc segmented (k1=0.9, b=0.4) +BM25+Rocchio doc seg (k1=0.9, b=0.4) 0.2889 0.5570 0.7423 @@ -957,11 +957,11 @@

MS MARCO V1 Document

- + -BM25 doc segmented (k1=2.16, b=0.61) +BM25 doc seg (k1=2.16, b=0.61) 0.2398 0.5389 0.6565 @@ -1181,11 +1181,11 @@

MS MARCO V1 Document

- + -BM25+RM3 doc segmented (k1=2.16, b=0.61) +BM25+RM3 doc seg (k1=2.16, b=0.61) 0.2655 0.5392 0.7037 @@ -1405,11 +1405,11 @@

MS MARCO V1 Document

- + -BM25+Rocchio doc segmented (k1=2.16, b=0.61) +BM25+Rocchio doc seg (k1=2.16, b=0.61) 0.2672 0.5421 0.7115 @@ -1630,11 +1630,11 @@

MS MARCO V1 Document

- + [1] (2b) -BM25 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4) +BM25 w/ doc2query-T5 doc seg (k1=0.9, b=0.4) 0.2798 0.6119 0.7165 @@ -1854,11 +1854,11 @@

MS MARCO V1 Document

- + [1] (2d) -BM25+RM3 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4) +BM25+RM3 w/ doc2query-T5 doc seg (k1=0.9, b=0.4) 0.3030 0.6290 0.7483 @@ -2079,11 +2079,11 @@

MS MARCO V1 Document

- + -BM25 w/ doc2query-T5 doc segmented (k1=2.56, b=0.59) +BM25 w/ doc2query-T5 doc seg (k1=2.56, b=0.59) 0.2658 0.6273 0.6707 @@ -2303,11 +2303,11 @@

MS MARCO V1 Document

- + -BM25+RM3 w/ doc2query-T5 doc segmented (k1=2.56, b=0.59) +BM25+RM3 w/ doc2query-T5 doc seg (k1=2.56, b=0.59) 0.2892 0.6247 0.7069 @@ -2416,11 +2416,11 @@

MS MARCO V1 Document

- + [1] (3a) -uniCOIL (noexp): pre-encoded queries +uniCOIL (noexp): cached queries 0.2665 0.6349 0.6391 @@ -2528,11 +2528,11 @@

MS MARCO V1 Document

- + -uniCOIL (noexp): query inference with PyTorch +uniCOIL (noexp): PyTorch 0.2665 0.6349 0.6391 @@ -2644,11 +2644,11 @@

MS MARCO V1 Document

- + [1] (3b) -uniCOIL (w/ doc2query-T5): pre-encoded queries +uniCOIL (w/ doc2query-T5): cached queries 0.2789 0.6396 0.6652 @@ -2756,11 +2756,11 @@

MS MARCO V1 Document

- + -uniCOIL (w/ doc2query-T5): query inference with PyTorch +uniCOIL (w/ doc2query-T5): PyTorch 0.2789 0.6396 0.6652 @@ -2876,6 +2876,9 @@

MS MARCO V1 Document

+
+

References

+
@@ -173,11 +173,11 @@

MS MARCO V1 Passage

- + [1] (1a) -BM25 (k1=0.9, b=0.4) +BM25 (k1=0.9, b=0.4): Lucene 0.3013 0.5058 0.7501 @@ -285,11 +285,11 @@

MS MARCO V1 Passage

- + [1] (1b) -BM25+RM3 (k1=0.9, b=0.4) +BM25+RM3 (k1=0.9, b=0.4): Lucene 0.3416 0.5216 0.8136 @@ -397,11 +397,11 @@

MS MARCO V1 Passage

- + -BM25+Rocchio (k1=0.9, b=0.4) +BM25+Rocchio (k1=0.9, b=0.4): Lucene 0.3474 0.5275 0.8007 @@ -510,11 +510,11 @@

MS MARCO V1 Passage

- + -BM25 (k1=0.82, b=0.68) +BM25 (k1=0.82, b=0.68): Lucene 0.2903 0.4973 0.7450 @@ -619,11 +619,11 @@

MS MARCO V1 Passage

- + -BM25+RM3 (k1=0.82, b=0.68) +BM25+RM3 (k1=0.82, b=0.68): Lucene 0.3339 0.5147 0.7950 @@ -731,11 +731,11 @@

MS MARCO V1 Passage

- + -BM25+Rocchio (k1=0.82, b=0.68) +BM25+Rocchio (k1=0.82, b=0.68): Lucene 0.3396 0.5275 0.7948 @@ -844,11 +844,11 @@

MS MARCO V1 Passage

- + [1] (2a) -BM25 w/ doc2query-T5 (k1=0.9, b=0.4) +BM25 w/ doc2query-T5 (k1=0.9, b=0.4): Lucene 0.4034 0.6417 0.8310 @@ -956,11 +956,11 @@

MS MARCO V1 Passage

- + [1] (2b) -BM25+RM3 w/ doc2query-T5 (k1=0.9, b=0.4) +BM25+RM3 w/ doc2query-T5 (k1=0.9, b=0.4): Lucene 0.4483 0.6586 0.8863 @@ -1068,11 +1068,11 @@

MS MARCO V1 Passage

- + -BM25+Rocchio w/ doc2query-T5 (k1=0.9, b=0.4) +BM25+Rocchio w/ doc2query-T5 (k1=0.9, b=0.4): Lucene 0.4469 0.6538 0.8855 @@ -1181,11 +1181,11 @@

MS MARCO V1 Passage

- + -BM25 w/ doc2query-T5 (k1=2.18, b=0.86) +BM25 w/ doc2query-T5 (k1=2.18, b=0.86): Lucene 0.4046 0.6336 0.8134 @@ -1293,11 +1293,11 @@

MS MARCO V1 Passage

- + -BM25+RM3 w/ doc2query-T5 (k1=2.18, b=0.86) +BM25+RM3 w/ doc2query-T5 (k1=2.18, b=0.86): Lucene 0.4377 0.6537 0.8443 @@ -1405,11 +1405,11 @@

MS MARCO V1 Passage

- + -BM25+Rocchio w/ doc2query-T5 (k1=2.18, b=0.86) +BM25+Rocchio w/ doc2query-T5 (k1=2.18, b=0.86): Lucene 0.4339 0.6559 0.8465 @@ -1518,11 +1518,11 @@

MS MARCO V1 Passage

- + [1] (3b) -uniCOIL (w/ doc2query-T5): cached queries +uniCOIL (w/ doc2query-T5): Lucene, cached queries 0.4612 0.7024 0.8292 @@ -1630,11 +1630,11 @@

MS MARCO V1 Passage

- + -uniCOIL (w/ doc2query-T5): PyTorch +uniCOIL (w/ doc2query-T5): Lucene, PyTorch 0.4612 0.7024 0.8292 @@ -1745,11 +1745,11 @@

MS MARCO V1 Passage

- + -uniCOIL (w/ doc2query-T5): ONNX +uniCOIL (w/ doc2query-T5): Lucene, ONNX 0.4612 0.7024 0.8292 @@ -1861,11 +1861,11 @@

MS MARCO V1 Passage

- + [1] (3a) -uniCOIL (noexp): cached queries +uniCOIL (noexp): Lucene, cached queries 0.4033 0.6433 0.7752 @@ -1973,11 +1973,11 @@

MS MARCO V1 Passage

- + -uniCOIL (noexp): PyTorch +uniCOIL (noexp): Lucene, PyTorch 0.4033 0.6433 0.7752 @@ -2088,11 +2088,11 @@

MS MARCO V1 Passage

- + -uniCOIL (noexp): ONNX +uniCOIL (noexp): Lucene, ONNX 0.4059 0.6535 0.7811 @@ -2204,11 +2204,11 @@

MS MARCO V1 Passage

- + [2] -SPLADE++ EnsembleDistil: PyTorch +SPLADE++ EnsembleDistil: Lucene, PyTorch 0.5050 0.7308 0.8728 @@ -2319,11 +2319,11 @@

MS MARCO V1 Passage

- + [2] -SPLADE++ EnsembleDistil: ONNX +SPLADE++ EnsembleDistil: Lucene, ONNX 0.5050 0.7308 0.8728 @@ -2434,11 +2434,11 @@

MS MARCO V1 Passage

- + -SPLADE++ EnsembleDistil w/ Rocchio: PyTorch +SPLADE++ EnsembleDistil w/ Rocchio: Lucene, PyTorch 0.5140 0.7119 0.8799 @@ -2549,11 +2549,11 @@

MS MARCO V1 Passage

- + -SPLADE++ EnsembleDistil w/ Rocchio: ONNX +SPLADE++ EnsembleDistil w/ Rocchio: Lucene, ONNX 0.5140 0.7119 0.8799 @@ -2664,11 +2664,11 @@

MS MARCO V1 Passage

- + [2] -SPLADE++ SelfDistil: PyTorch +SPLADE++ SelfDistil: Lucene, PyTorch 0.4998 0.7358 0.8761 @@ -2779,11 +2779,11 @@

MS MARCO V1 Passage

- + [2] -SPLADE++ SelfDistil: ONNX +SPLADE++ SelfDistil: Lucene, ONNX 0.4998 0.7358 0.8761 @@ -2894,11 +2894,11 @@

MS MARCO V1 Passage

- + -SPLADE++ SelfDistil w/ Rocchio: PyTorch +SPLADE++ SelfDistil w/ Rocchio: Lucene, PyTorch 0.5072 0.7156 0.8918 @@ -3009,11 +3009,11 @@

MS MARCO V1 Passage

- + -SPLADE++ SelfDistil w/ Rocchio: ONNX +SPLADE++ SelfDistil w/ Rocchio: Lucene, ONNX 0.5072 0.7156 0.8918 @@ -3125,11 +3125,11 @@

MS MARCO V1 Passage

- + [3] -ANCE: cached queries +ANCE: Faiss flat, cached queries 0.3710 0.6452 0.7554 @@ -3234,11 +3234,11 @@

MS MARCO V1 Passage

- + [3] -ANCE: PyTorch +ANCE: Faiss flat, PyTorch 0.3710 0.6452 0.7554 @@ -3346,11 +3346,11 @@

MS MARCO V1 Passage

- + [9] -ANCE w/ Average PRF: PyTorch +ANCE w/ Average PRF: Faiss flat, PyTorch 0.4247 0.6532 0.7739 @@ -3461,11 +3461,11 @@

MS MARCO V1 Passage

- + [9] -ANCE w/ Rocchio PRF: PyTorch +ANCE w/ Rocchio PRF: Faiss flat, PyTorch 0.4211 0.6539 0.7825 @@ -3577,11 +3577,11 @@

MS MARCO V1 Passage

- + [10] -SBERT: PyTorch +SBERT: Faiss flat, PyTorch 0.4060 0.6930 0.7872 @@ -3689,11 +3689,11 @@

MS MARCO V1 Passage

- + [9] -SBERT w/ Average PRF: PyTorch +SBERT w/ Average PRF: Faiss flat, PyTorch 0.4354 0.7001 0.7937 @@ -3804,11 +3804,11 @@

MS MARCO V1 Passage

- + [9] -SBERT w/ Rocchio PRF: PyTorch +SBERT w/ Rocchio PRF: Faiss flat, PyTorch 0.4371 0.6952 0.7941 @@ -3920,11 +3920,11 @@

MS MARCO V1 Passage

- + [4] -DistilBERT KD: cached queries +DistilBERT KD: Faiss flat, cached queries 0.4053 0.6994 0.7653 @@ -4029,11 +4029,11 @@

MS MARCO V1 Passage

- + [4] -DistilBERT KD: PyTorch +DistilBERT KD: Faiss flat, PyTorch 0.4053 0.6994 0.7653 @@ -4141,11 +4141,11 @@

MS MARCO V1 Passage

- + [5] -DistilBERT KD TASB: cached queries +DistilBERT KD TASB: Faiss flat, cached queries 0.4590 0.7210 0.8406 @@ -4250,11 +4250,11 @@

MS MARCO V1 Passage

- + [5] -DistilBERT KD TASB: PyTorch +DistilBERT KD TASB: Faiss flat, PyTorch 0.4590 0.7210 0.8406 @@ -4362,11 +4362,11 @@

MS MARCO V1 Passage

- + [9] -DistilBERT KD TASB w/ Average PRF: PyTorch +DistilBERT KD TASB w/ Average PRF: Faiss flat, PyTorch 0.4856 0.7190 0.8517 @@ -4477,11 +4477,11 @@

MS MARCO V1 Passage

- + [9] -DistilBERT KD TASB w/ Rocchio PRF: PyTorch +DistilBERT KD TASB w/ Rocchio PRF: Faiss flat, PyTorch 0.4974 0.7231 0.8775 @@ -4593,11 +4593,11 @@

MS MARCO V1 Passage

- + [6] -TCT_ColBERT-V2-HN+: cached queries +TCT_ColBERT-V2-HN+: Faiss flat, cached queries 0.4469 0.7204 0.8261 @@ -4702,11 +4702,11 @@

MS MARCO V1 Passage

- + [6] -TCT_ColBERT-V2-HN+: PyTorch +TCT_ColBERT-V2-HN+: Faiss flat, PyTorch 0.4469 0.7204 0.8261 @@ -4814,11 +4814,11 @@

MS MARCO V1 Passage

- + [9] -TCT_ColBERT-V2-HN+ w/ Average PRF: PyTorch +TCT_ColBERT-V2-HN+ w/ Average PRF: Faiss flat, PyTorch 0.4879 0.7312 0.8586 @@ -4929,11 +4929,11 @@

MS MARCO V1 Passage

- + [9] -TCT_ColBERT-V2-HN+ w/ Rocchio PRF: PyTorch +TCT_ColBERT-V2-HN+ w/ Rocchio PRF: Faiss flat, PyTorch 0.4883 0.7111 0.8694 @@ -5282,11 +5282,11 @@

MS MARCO V1 Passage

- + [7] -SLIM: PyTorch +SLIM: Lucene, PyTorch 0.4509 0.7010 0.8241 @@ -5400,11 +5400,11 @@

MS MARCO V1 Passage

- + [7] -SLIM++: PyTorch +SLIM++: Lucene, PyTorch 0.4687 0.7140 0.8415 @@ -5519,11 +5519,11 @@

MS MARCO V1 Passage

- + [8] -Aggretriever-DistilBERT: PyTorch +Aggretriever-DistilBERT: Faiss flat, PyTorch 0.4301 0.6816 0.8023 @@ -5631,11 +5631,11 @@

MS MARCO V1 Passage

- + [8] -Aggretriever-coCondenser: PyTorch +Aggretriever-coCondenser: Faiss flat, PyTorch 0.4350 0.6837 0.8078 @@ -5744,11 +5744,11 @@

MS MARCO V1 Passage

- + [11] -OpenAI ada2: cached queries +OpenAI ada2: Faiss flat, cached queries 0.4788 0.7035 0.8629 @@ -5853,11 +5853,11 @@

MS MARCO V1 Passage

- + [12] -HyDE-OpenAI ada2: cached queries +HyDE-OpenAI ada2: Faiss flat, cached queries 0.5125 0.7163 0.9002 @@ -5943,11 +5943,11 @@

MS MARCO V1 Passage

- + -OpenAI text-embedding-3-large: cached queries +OpenAI text-embedding-3-large: Faiss flat, cached queries 0.5259 0.7173 0.8991 @@ -6053,11 +6053,11 @@

MS MARCO V1 Passage

- + [13] -cosDPR-distil: PyTorch +cosDPR-distil: Faiss flat, PyTorch 0.4656 0.7250 0.8201 @@ -6096,20 +6096,20 @@

MS MARCO V1 Passage

python -m pyserini.search.faiss \
   --threads 16 --batch-size 512 \
-  --index msmarco-v1-passage.cosdpr-distil  \
+  --index msmarco-v1-passage.cosdpr-distil \
   --topics dl19-passage \
   --encoder castorini/cosdpr-distil \
-  --output run.msmarco-v1-passage.cosdpr-distil-pytorch.dl19.txt
+  --output run.msmarco-v1-passage.cosdpr-distil.faiss-flat.pytorch.dl19.txt
 
Evaluation commands:
python -m pyserini.eval.trec_eval -c -l 2 -m map dl19-passage \
-  run.msmarco-v1-passage.cosdpr-distil-pytorch.dl19.txt
+  run.msmarco-v1-passage.cosdpr-distil.faiss-flat.pytorch.dl19.txt
 python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl19-passage \
-  run.msmarco-v1-passage.cosdpr-distil-pytorch.dl19.txt
+  run.msmarco-v1-passage.cosdpr-distil.faiss-flat.pytorch.dl19.txt
 python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl19-passage \
-  run.msmarco-v1-passage.cosdpr-distil-pytorch.dl19.txt
+  run.msmarco-v1-passage.cosdpr-distil.faiss-flat.pytorch.dl19.txt
 
@@ -6120,20 +6120,20 @@

MS MARCO V1 Passage

python -m pyserini.search.faiss \
   --threads 16 --batch-size 512 \
-  --index msmarco-v1-passage.cosdpr-distil  \
+  --index msmarco-v1-passage.cosdpr-distil \
   --topics dl20 \
   --encoder castorini/cosdpr-distil \
-  --output run.msmarco-v1-passage.cosdpr-distil-pytorch.dl20.txt
+  --output run.msmarco-v1-passage.cosdpr-distil.faiss-flat.pytorch.dl20.txt
 
Evaluation commands:
python -m pyserini.eval.trec_eval -c -l 2 -m map dl20-passage \
-  run.msmarco-v1-passage.cosdpr-distil-pytorch.dl20.txt
+  run.msmarco-v1-passage.cosdpr-distil.faiss-flat.pytorch.dl20.txt
 python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl20-passage \
-  run.msmarco-v1-passage.cosdpr-distil-pytorch.dl20.txt
+  run.msmarco-v1-passage.cosdpr-distil.faiss-flat.pytorch.dl20.txt
 python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl20-passage \
-  run.msmarco-v1-passage.cosdpr-distil-pytorch.dl20.txt
+  run.msmarco-v1-passage.cosdpr-distil.faiss-flat.pytorch.dl20.txt
 
@@ -6144,18 +6144,18 @@

MS MARCO V1 Passage

python -m pyserini.search.faiss \
   --threads 16 --batch-size 512 \
-  --index msmarco-v1-passage.cosdpr-distil  \
+  --index msmarco-v1-passage.cosdpr-distil \
   --topics msmarco-passage-dev-subset \
   --encoder castorini/cosdpr-distil \
-  --output run.msmarco-v1-passage.cosdpr-distil-pytorch.dev.txt
+  --output run.msmarco-v1-passage.cosdpr-distil.faiss-flat.pytorch.dev.txt
 
Evaluation commands:
python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset \
-  run.msmarco-v1-passage.cosdpr-distil-pytorch.dev.txt
+  run.msmarco-v1-passage.cosdpr-distil.faiss-flat.pytorch.dev.txt
 python -m pyserini.eval.trec_eval -c -m recall.1000 msmarco-passage-dev-subset \
-  run.msmarco-v1-passage.cosdpr-distil-pytorch.dev.txt
+  run.msmarco-v1-passage.cosdpr-distil.faiss-flat.pytorch.dev.txt
 
@@ -6165,12 +6165,242 @@

MS MARCO V1 Passage

- - + +[13] +cosDPR-distil: Lucene HNSW, ONNX +0.4660 +0.7250 +0.8222 + +0.4876 +0.7025 +0.8540 + +0.3887 +0.9765 + + + + +
+ + + + + + +
+
+Command to generate run on TREC 2019 queries: + +
+
python -m pyserini.search.lucene \
+  --threads 16 --batch-size 512 --dense --hnsw \
+  --index msmarco-v1-passage.cosdpr-distil.hnsw \
+  --topics dl19-passage \
+  --onnx-encoder CosDprDistil \
+  --output run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw.onnx.dl19.txt \
+  --hits 1000 --ef-search 1000
+
+Evaluation commands: + +
+
python -m pyserini.eval.trec_eval -c -l 2 -m map dl19-passage \
+  run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw.onnx.dl19.txt
+python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl19-passage \
+  run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw.onnx.dl19.txt
+python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl19-passage \
+  run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw.onnx.dl19.txt
+
+
+ +
+
+ Command to generate run on TREC 2020 queries: + +
+
python -m pyserini.search.lucene \
+  --threads 16 --batch-size 512 --dense --hnsw \
+  --index msmarco-v1-passage.cosdpr-distil.hnsw \
+  --topics dl20 \
+  --onnx-encoder CosDprDistil \
+  --output run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw.onnx.dl20.txt \
+  --hits 1000 --ef-search 1000
+
+Evaluation commands: + +
+
python -m pyserini.eval.trec_eval -c -l 2 -m map dl20-passage \
+  run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw.onnx.dl20.txt
+python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl20-passage \
+  run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw.onnx.dl20.txt
+python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl20-passage \
+  run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw.onnx.dl20.txt
+
+
+ +
+
+ Command to generate run on dev queries: + +
+
python -m pyserini.search.lucene \
+  --threads 16 --batch-size 512 --dense --hnsw \
+  --index msmarco-v1-passage.cosdpr-distil.hnsw \
+  --topics msmarco-passage-dev-subset \
+  --onnx-encoder CosDprDistil \
+  --output run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw.onnx.dev.txt \
+  --hits 1000 --ef-search 1000
+
+Evaluation commands: + +
+
python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset \
+  run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw.onnx.dev.txt
+python -m pyserini.eval.trec_eval -c -m recall.1000 msmarco-passage-dev-subset \
+  run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw.onnx.dev.txt
+
+
+ +
+
+ + +
+ + + + +[13] +cosDPR-distil: Lucene quantized HNSW, ONNX +0.4664 +0.7247 +0.8218 + +0.4871 +0.6996 +0.8538 + +0.3899 +0.9764 + + + + +
+ + + + + + +
+
+Command to generate run on TREC 2019 queries: + +
+
python -m pyserini.search.lucene \
+  --threads 16 --batch-size 512 --dense --hnsw \
+  --index msmarco-v1-passage.cosdpr-distil.hnsw-int8 \
+  --topics dl19-passage \
+  --onnx-encoder CosDprDistil \
+  --output run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw-int8.onnx.dl19.txt \
+  --hits 1000 --ef-search 1000
+
+Evaluation commands: + +
+
python -m pyserini.eval.trec_eval -c -l 2 -m map dl19-passage \
+  run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw-int8.onnx.dl19.txt
+python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl19-passage \
+  run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw-int8.onnx.dl19.txt
+python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl19-passage \
+  run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw-int8.onnx.dl19.txt
+
+
+ +
+
+ Command to generate run on TREC 2020 queries: + +
+
python -m pyserini.search.lucene \
+  --threads 16 --batch-size 512 --dense --hnsw \
+  --index msmarco-v1-passage.cosdpr-distil.hnsw-int8 \
+  --topics dl20 \
+  --onnx-encoder CosDprDistil \
+  --output run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw-int8.onnx.dl20.txt \
+  --hits 1000 --ef-search 1000
+
+Evaluation commands: + +
+
python -m pyserini.eval.trec_eval -c -l 2 -m map dl20-passage \
+  run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw-int8.onnx.dl20.txt
+python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl20-passage \
+  run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw-int8.onnx.dl20.txt
+python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl20-passage \
+  run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw-int8.onnx.dl20.txt
+
+
+ +
+
+ Command to generate run on dev queries: + +
+
python -m pyserini.search.lucene \
+  --threads 16 --batch-size 512 --dense --hnsw \
+  --index msmarco-v1-passage.cosdpr-distil.hnsw-int8 \
+  --topics msmarco-passage-dev-subset \
+  --onnx-encoder CosDprDistil \
+  --output run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw-int8.onnx.dev.txt \
+  --hits 1000 --ef-search 1000
+
+Evaluation commands: + +
+
python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset \
+  run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw-int8.onnx.dev.txt
+python -m pyserini.eval.trec_eval -c -m recall.1000 msmarco-passage-dev-subset \
+  run.msmarco-v1-passage.cosdpr-distil.lucene-hnsw-int8.onnx.dev.txt
+
+
+ +
+
+ + +
+ + + + + [14] -BGE-base-en-v1.5: PyTorch +BGE-base-en-v1.5: Faiss flat, PyTorch 0.4485 0.7016 0.8427 @@ -6185,25 +6415,25 @@

MS MARCO V1 Passage

-
+
-
+
+

References

+
  • [1] Xueguang Ma, Ronak Pradeep, Rodrigo Nogueira, and Jimmy Lin. diff --git a/pyserini/2cr/msmarco_html_v2_doc.template b/pyserini/2cr/msmarco_html_v2_doc.template index 7d44926a5..62b831ab2 100644 --- a/pyserini/2cr/msmarco_html_v2_doc.template +++ b/pyserini/2cr/msmarco_html_v2_doc.template @@ -131,7 +131,7 @@ pre[class*="prettyprint"] { ">

    -

    $title

    +

      $title

diff --git a/pyserini/2cr/msmarco_html_v2_passage.template b/pyserini/2cr/msmarco_html_v2_passage.template index 46959db1a..f0aced5a3 100644 --- a/pyserini/2cr/msmarco_html_v2_passage.template +++ b/pyserini/2cr/msmarco_html_v2_passage.template @@ -131,7 +131,7 @@ pre[class*="prettyprint"] { ">
-

$title

+

  $title

diff --git a/pyserini/prebuilt_index_info.py b/pyserini/prebuilt_index_info.py index b552b9db5..3d1163037 100644 --- a/pyserini/prebuilt_index_info.py +++ b/pyserini/prebuilt_index_info.py @@ -19,11 +19,14 @@ JIndexInfo = autoclass('io.anserini.index.IndexInfo') -def add_lucene_index_info(enum, info): - info[enum.indexName] = { +def add_lucene_index_info(enum, info, name=None, readme=None): + # Allow ability to override what's stored in the enum. + indexName = enum.indexName if not name else name + + info[indexName] = { "description": enum.description, "filename": enum.filename, - "readme": enum.readme, + "readme": enum.readme if not readme else readme, "urls": [ enum.urls[0] ], @@ -3365,7 +3368,25 @@ def add_lucene_index_info(enum, info): **IMPACT_INDEX_INFO_BEIR} -# Bindings for Lucene HNSW indexes +# Bindings for Lucene HNSW MSMARCO indexes +LUCENE_HNSW_INDEX_INFO_MSMARCO = {} + +# Metadata have already been defined in Anserini, just copy over into Pyserini. +# Here, we override the index name to append '.hnsw', which isn't present on the Anserini side +add_lucene_index_info(JIndexInfo.MSMARCO_V1_PASSAGE_BGE_BASE_EN_15, LUCENE_HNSW_INDEX_INFO_MSMARCO, + name='msmarco-v1-passage.bge-base-en-v1.5.hnsw', + readme='lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5.20240117.53514b.README.md') +add_lucene_index_info(JIndexInfo.MSMARCO_V1_PASSAGE_BGE_BASE_EN_15_QUANTIZED, LUCENE_HNSW_INDEX_INFO_MSMARCO, + name='msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8', + readme='lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5.20240117.53514b.README.md') +add_lucene_index_info(JIndexInfo.MSMARCO_V1_PASSAGE_COS_DPR_DISTIL, LUCENE_HNSW_INDEX_INFO_MSMARCO, + name='msmarco-v1-passage.cosdpr-distil.hnsw', + readme='lucene-hnsw.msmarco-v1-passage.cosdpr-distil.20240108.825148.README.md') +add_lucene_index_info(JIndexInfo.MSMARCO_V1_PASSAGE_COS_DPR_DISTIL_QUANTIZED, LUCENE_HNSW_INDEX_INFO_MSMARCO, + name='msmarco-v1-passage.cosdpr-distil.hnsw-int8', + readme='lucene-hnsw.msmarco-v1-passage.cosdpr-distil.20240108.825148.README.md') + +# Bindings for Lucene HNSW BEIR indexes LUCENE_HNSW_INDEX_INFO_BEIR = {} # Metadata have already been defined in Anserini, just copy over into Pyserini. @@ -3399,7 +3420,8 @@ def add_lucene_index_info(enum, info): add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_CLIMATE_FEVER_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR) add_lucene_index_info(JIndexInfo.BEIR_V1_0_0_SCIFACT_BGE_BASE_EN_15_HNSW, LUCENE_HNSW_INDEX_INFO_BEIR) -LUCENE_HNSW_INDEX_INFO = {**LUCENE_HNSW_INDEX_INFO_BEIR} +LUCENE_HNSW_INDEX_INFO = {**LUCENE_HNSW_INDEX_INFO_MSMARCO, + **LUCENE_HNSW_INDEX_INFO_BEIR} # Bindings for Lucene flat indexes diff --git a/pyserini/resources/index-metadata/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5.20240117.53514b.README.md b/pyserini/resources/index-metadata/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5.20240117.53514b.README.md new file mode 100644 index 000000000..4ca08890e --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5.20240117.53514b.README.md @@ -0,0 +1,28 @@ +# msmarco-v1-passage.bge-base-en-v1.5 + +Lucene HNSW indexes of MS MARCO V1 Passage using BGE-base-en-v1.5. + +These indexes were built 2024/01/17 on `orca` at Anserini commit [`53514b`](https://github.com/castorini/anserini/commit/53514b1ab29398a4bb6ff4a315b7394e509e6be5) (2024/01/13), with Lucene 9.9.1. + +Here are the indexing commands for the non-quantized and quantized versions: + +```bash +target/appassembler/bin/IndexHnswDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /mnt/collections/msmarco/msmarco-passage-bge-base-en-v1.5/ \ + -generator HnswDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-passage-bge-base-en-v1.5.efC1000.1/ \ + -threads 8 -M 16 -efC 1000 -memoryBuffer 65536 -optimize \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.efC1000.1 & + +target/appassembler/bin/IndexHnswDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /mnt/collections/msmarco/msmarco-passage-bge-base-en-v1.5/ \ + -generator HnswDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-passage-bge-base-en-v1.5-int8.efC1000.1/ \ + -threads 8 -M 16 -efC 1000 -memoryBuffer 65536 -optimize -quantize.int8 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5-int8.efC1000.1 & +``` + +I ran four trials and picked the index instance that yielded the highest retrieval scores. +Most of the trials yielded scores that were close; I selected the "best" based on eyeballing. diff --git a/pyserini/resources/index-metadata/lucene-hnsw.msmarco-v1-passage.cosdpr-distil.20240108.825148.README.md b/pyserini/resources/index-metadata/lucene-hnsw.msmarco-v1-passage.cosdpr-distil.20240108.825148.README.md new file mode 100644 index 000000000..a240deda5 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-hnsw.msmarco-v1-passage.cosdpr-distil.20240108.825148.README.md @@ -0,0 +1,28 @@ +# msmarco-v1-passage.cosdpr-distil + +Lucene HNSW indexes of MS MARCO V1 Passage using cosDPR-distil. + +These indexes were built 2024/01/07 on `orca` at Anserini commit [`825148`](https://github.com/castorini/anserini/commit/825148afba0303276c37dd838be897b8443d9774) (2023/12/24), with Lucene 9.9.1. + +Here are the indexing commands for the non-quantized and quantized versions: + +```bash +target/appassembler/bin/IndexHnswDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /mnt/collections/msmarco/msmarco-passage-cos-dpr-distil/ \ + -generator HnswDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil.efC1000.1/ \ + -threads 8 -M 16 -efC 1000 -memoryBuffer 65536 -optimize \ + >& logs/log.msmarco-passage-cos-dpr-distil.efC1000.1 & + +target/appassembler/bin/IndexHnswDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /mnt/collections/msmarco/msmarco-passage-cos-dpr-distil/ \ + -generator HnswDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8.efC1000.1/ \ + -threads 8 -M 16 -efC 1000 -memoryBuffer 65536 -optimize -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil-int8.efC1000.1 & +``` + +I ran four trials and picked the index instance that yielded the highest retrieval scores. +Most of the trials yielded scores that were close; I selected the "best" based on eyeballing. diff --git a/scripts/generate_docs_from_prebuilt_indexes.py b/scripts/generate_docs_from_prebuilt_indexes.py index c3bca9af1..8526610db 100644 --- a/scripts/generate_docs_from_prebuilt_indexes.py +++ b/scripts/generate_docs_from_prebuilt_indexes.py @@ -138,6 +138,11 @@ def generate_prebuilt(index): print('\n\n## Lucene HNSW Indexes') + print('
') + print('MS MARCO') + generate_prebuilt(LUCENE_HNSW_INDEX_INFO_MSMARCO) + print('
') + print('
') print('BEIR') generate_prebuilt(LUCENE_HNSW_INDEX_INFO_BEIR)