diff --git a/docs/2cr/miracl.html b/docs/2cr/miracl.html index 10a58f99d..19588ce62 100644 --- a/docs/2cr/miracl.html +++ b/docs/2cr/miracl.html @@ -655,13 +655,13 @@

MIRACL

mDPR (tied encoders), pre-FT w/ MS MARCO 0.499 -0.462 +0.443 0.394 0.478 0.480 0.472 0.435 -0.390 +0.383 0.272 0.439 0.419 @@ -673,7 +673,7 @@

MIRACL

0.490 0.444 -0.422 +0.421 @@ -1136,13 +1136,13 @@

MIRACL

mDPR (tied encoders), pre-FT w/ MS MARCO then FT w/ all Mr. TyDi 0.578 -0.592 +0.580 0.281 0.251 0.384 0.569 0.301 -0.332 +0.329 0.346 0.500 0.486 @@ -1154,7 +1154,7 @@

MIRACL

0.322 0.598 -0.463 +0.462 @@ -1617,13 +1617,13 @@

MIRACL

Hybrid of `bm25` and `mdpr-tied-pft-msmarco` 0.673 -0.671 +0.654 0.549 0.641 0.594 0.672 0.523 -0.615 +0.616 0.443 0.576 0.609 @@ -1635,7 +1635,7 @@

MIRACL

0.564 0.611 -0.580 +0.579 @@ -3486,13 +3486,13 @@

MIRACL

mDPR (tied encoders), pre-FT w/ MS MARCO 0.841 -0.831 +0.819 0.768 0.864 0.898 0.788 0.915 -0.781 +0.776 0.573 0.825 0.737 @@ -3504,7 +3504,7 @@

MIRACL

0.898 0.840 -0.798 +0.797 @@ -3967,13 +3967,13 @@

MIRACL

mDPR (tied encoders), pre-FT w/ MS MARCO then FT w/ all Mr. TyDi 0.795 -0.866 +0.848 0.508 0.471 0.686 0.798 0.601 -0.635 +0.637 0.584 0.745 0.718 @@ -3985,7 +3985,7 @@

MIRACL

0.599 0.891 -0.718 +0.717 @@ -4448,13 +4448,13 @@

MIRACL

Hybrid of `bm25` and `mdpr-tied-pft-msmarco` 0.941 -0.949 +0.932 0.882 0.948 0.937 0.895 0.965 -0.915 +0.912 0.768 0.904 0.900 @@ -4466,7 +4466,7 @@

MIRACL

0.948 0.950 -0.897 +0.895 diff --git a/docs/2cr/mrtydi.html b/docs/2cr/mrtydi.html index 012cf4fd5..0d16bf504 100644 --- a/docs/2cr/mrtydi.html +++ b/docs/2cr/mrtydi.html @@ -746,7 +746,7 @@

Mr.TyDi

mDPR (tied encoders), pre-FT w/ NQ 0.221 -0.255 +0.254 0.243 0.244 0.281 @@ -1046,7 +1046,7 @@

Mr.TyDi

mDPR (tied encoders), pre-FT w/ MS MARCO 0.441 -0.417 +0.397 0.327 0.275 0.352 @@ -1057,7 +1057,7 @@

Mr.TyDi

0.310 0.269 -0.335 +0.333 @@ -1346,7 +1346,7 @@

Mr.TyDi

mDPR (tied encoders), pre-FT w/ MS MARCO, FT w/ all 0.695 -0.643 +0.623 0.492 0.559 0.578 @@ -1357,7 +1357,7 @@

Mr.TyDi

0.891 0.618 -0.602 +0.600 @@ -2251,7 +2251,7 @@

Mr.TyDi

mDPR (tied encoders), pre-FT w/ NQ 0.600 -0.716 +0.707 0.689 0.640 0.691 @@ -2262,7 +2262,7 @@

Mr.TyDi

0.245 0.455 -0.580 +0.579 @@ -2551,7 +2551,7 @@

Mr.TyDi

mDPR (tied encoders), pre-FT w/ MS MARCO 0.797 -0.820 +0.784 0.754 0.647 0.736 @@ -2562,7 +2562,7 @@

Mr.TyDi

0.782 0.595 -0.714 +0.711 diff --git a/docs/2cr/msmarco-v1-passage.html b/docs/2cr/msmarco-v1-passage.html index 21ea2a719..bd44639db 100644 --- a/docs/2cr/msmarco-v1-passage.html +++ b/docs/2cr/msmarco-v1-passage.html @@ -5747,7 +5747,7 @@

MS MARCO V1 Passage

- +[11] OpenAI ada2: pre-encoded queries 0.4788 0.7035 @@ -5856,7 +5856,7 @@

MS MARCO V1 Passage

- +[12] HyDE-OpenAI ada2: pre-encoded queries 0.5125 0.7163 @@ -5941,6 +5941,119 @@

MS MARCO V1 Passage

+ + + + + + +[13] +cosDPR-distil: PyTorch +0.4656 +0.7250 +0.8201 + +0.4876 +0.7025 +0.8533 + +0.3896 +0.9796 + + + + +
+ + + + + + +
+
+Command to generate run on TREC 2019 queries: + +
+
python -m pyserini.search.faiss \
+  --threads 16 --batch-size 512 \
+  --index msmarco-v1-passage.cosdpr-distil  \
+  --topics dl19-passage \
+  --encoder castorini/cosdpr-distil \
+  --output run.msmarco-v1-passage.cosdpr-distil-pytorch.dl19.txt
+
+Evaluation commands: + +
+
python -m pyserini.eval.trec_eval -c -l 2 -m map dl19-passage \
+  run.msmarco-v1-passage.cosdpr-distil-pytorch.dl19.txt
+python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl19-passage \
+  run.msmarco-v1-passage.cosdpr-distil-pytorch.dl19.txt
+python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl19-passage \
+  run.msmarco-v1-passage.cosdpr-distil-pytorch.dl19.txt
+
+
+ +
+
+ Command to generate run on TREC 2020 queries: + +
+
python -m pyserini.search.faiss \
+  --threads 16 --batch-size 512 \
+  --index msmarco-v1-passage.cosdpr-distil  \
+  --topics dl20 \
+  --encoder castorini/cosdpr-distil \
+  --output run.msmarco-v1-passage.cosdpr-distil-pytorch.dl20.txt
+
+Evaluation commands: + +
+
python -m pyserini.eval.trec_eval -c -l 2 -m map dl20-passage \
+  run.msmarco-v1-passage.cosdpr-distil-pytorch.dl20.txt
+python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl20-passage \
+  run.msmarco-v1-passage.cosdpr-distil-pytorch.dl20.txt
+python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl20-passage \
+  run.msmarco-v1-passage.cosdpr-distil-pytorch.dl20.txt
+
+
+ +
+
+ Command to generate run on dev queries: + +
+
python -m pyserini.search.faiss \
+  --threads 16 --batch-size 512 \
+  --index msmarco-v1-passage.cosdpr-distil  \
+  --topics msmarco-passage-dev-subset \
+  --encoder castorini/cosdpr-distil \
+  --output run.msmarco-v1-passage.cosdpr-distil-pytorch.dev.txt
+
+Evaluation commands: + +
+
python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset \
+  run.msmarco-v1-passage.cosdpr-distil-pytorch.dev.txt
+python -m pyserini.eval.trec_eval -c -m recall.1000 msmarco-passage-dev-subset \
+  run.msmarco-v1-passage.cosdpr-distil-pytorch.dev.txt
+
+
+ +
+
+ +
@@ -5992,6 +6105,18 @@

MS MARCO V1 Passage

Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), 2019.

+
  • [11] Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. +Vector Search with OpenAI Embeddings: Lucene Is All You Need. +arXiv:2308.14963, August 2023.

  • + +
  • [12] Luyu Gao, Xueguang Ma, Jimmy Lin, and Jamie Callan. +Precise Zero-Shot Dense Retrieval without Relevance Labels. +Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 1762-1777, July 2023, Toronto, Canada.

  • + +
  • [13] Xueguang Ma, Tommaso Teofili, and Jimmy Lin. +Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes. +Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023), October 2023, pages 5366–5370, Birmingham, the United Kingdom.

  • +
    diff --git a/pyserini/2cr/msmarco-v1-passage.yaml b/pyserini/2cr/msmarco-v1-passage.yaml index 8389603a9..aa4bdba91 100644 --- a/pyserini/2cr/msmarco-v1-passage.yaml +++ b/pyserini/2cr/msmarco-v1-passage.yaml @@ -1,4 +1,27 @@ conditions: + - name: cosdpr-distil-pytorch + display: "cosDPR-distil: PyTorch" + display-html: "cosDPR-distil: PyTorch" + display-row: "[13]" + command: python -m pyserini.search.faiss --threads ${dense_threads} --batch-size ${dense_batch_size} --index msmarco-v1-passage.cosdpr-distil --topics $topics --encoder castorini/cosdpr-distil --output $output + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3896 + R@1K: 0.9796 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4656 + nDCG@10: 0.7250 + R@1K: 0.8201 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4876 + nDCG@10: 0.7025 + R@1K: 0.8533 - name: splade-pp-ed-rocchio-pytorch display: "SPLADE++ EnsembleDistil w/ Rocchio: PyTorch" display-html: "SPLADE++ EnsembleDistil w/ Rocchio: PyTorch" @@ -1117,6 +1140,7 @@ conditions: - name: openai-ada2 display: "OpenAI ada2: pre-encoded queries" display-html: "OpenAI ada2: pre-encoded queries" + display-row: "[11]" command: python -m pyserini.search.faiss --threads ${sparse_threads} --batch-size ${sparse_batch_size} --index msmarco-v1-passage.openai-ada2 --topics $topics --encoded-queries openai-ada2-$topics --output $output topics: - topic_key: msmarco-passage-dev-subset @@ -1139,6 +1163,7 @@ conditions: - name: openai-ada2-hyde display: "HyDE-OpenAI ada2: pre-encoded queries" display-html: "HyDE-OpenAI ada2: pre-encoded queries" + display-row: "[12]" command: python -m pyserini.search.faiss --threads ${sparse_threads} --batch-size ${sparse_batch_size} --index msmarco-v1-passage.openai-ada2 --topics $topics --encoded-queries openai-ada2-$topics-hyde --output $output topics: - topic_key: dl19-passage diff --git a/pyserini/2cr/msmarco.py b/pyserini/2cr/msmarco.py index cf4d6a507..bb732d319 100644 --- a/pyserini/2cr/msmarco.py +++ b/pyserini/2cr/msmarco.py @@ -102,7 +102,10 @@ 'aggretriever-cocondenser-pytorch', '', 'openai-ada2', - 'openai-ada2-hyde'], + 'openai-ada2-hyde', + '', + 'cosdpr-distil-pytorch' + ], # MS MARCO v1 doc 'msmarco-v1-doc': diff --git a/pyserini/2cr/msmarco_html_v1_passage.template b/pyserini/2cr/msmarco_html_v1_passage.template index 19e5bf6e4..80fe5cfbb 100644 --- a/pyserini/2cr/msmarco_html_v1_passage.template +++ b/pyserini/2cr/msmarco_html_v1_passage.template @@ -223,6 +223,18 @@ $rows Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), 2019.

    +
  • [11] Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. +Vector Search with OpenAI Embeddings: Lucene Is All You Need. +arXiv:2308.14963, August 2023.

  • + +
  • [12] Luyu Gao, Xueguang Ma, Jimmy Lin, and Jamie Callan. +Precise Zero-Shot Dense Retrieval without Relevance Labels. +Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 1762-1777, July 2023, Toronto, Canada.

  • + +
  • [13] Xueguang Ma, Tommaso Teofili, and Jimmy Lin. +Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes. +Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023), October 2023, pages 5366–5370, Birmingham, the United Kingdom.

  • +
    diff --git a/pyserini/prebuilt_index_info.py b/pyserini/prebuilt_index_info.py index 29a017154..940738352 100644 --- a/pyserini/prebuilt_index_info.py +++ b/pyserini/prebuilt_index_info.py @@ -3130,9 +3130,22 @@ **IMPACT_INDEX_INFO_BEIR} FAISS_INDEX_INFO_MSMARCO = { + "msmarco-v1-passage.cosdpr-distil": { + "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by cosDPR-distil.", + "filename": "faiss.msmarco-v1-passage.cosdpr-distil.20221023.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.cosdpr-distil.20221023.tar.gz" + ], + "md5": "02018b5797bf1e7ebe6e2f552319696a", + "size compressed (bytes)": 23843194944, + "documents": 8841823, + "downloaded": False, + "texts": "msmarco-v1-passage" + }, + # Aggretriever indexes "msmarco-v1-passage.aggretriever-cocondenser": { - "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by aggretriever-cocondenser encoder.", + "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by aggretriever-cocondenser.", "filename": "faiss.msmarco-v1-passage.aggretriever-cocondenser.20230407.f627ef.tar.gz", "urls": [ "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.aggretriever-cocondenser.20230407.f627ef.tar.gz" @@ -3144,7 +3157,7 @@ "texts": "msmarco-v1-passage" }, "msmarco-v1-passage.aggretriever-distilbert": { - "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by aggretriever-distilbert encoder.", + "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by aggretriever-distilbert.", "filename": "faiss.msmarco-v1-passage.aggretriever-distilbert.20230407.f627ef.tar.gz", "urls": [ "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.aggretriever-distilbert.20230407.f627ef.tar.gz" diff --git a/tests/test_prebuilt_index.py b/tests/test_prebuilt_index.py index 9d1d634c7..3bc1a70c7 100644 --- a/tests/test_prebuilt_index.py +++ b/tests/test_prebuilt_index.py @@ -145,7 +145,7 @@ def test_faiss_msmarco(self): for url in FAISS_INDEX_INFO[key]['urls']: urls.append(url) - self.assertEqual(cnt, 15) + self.assertEqual(cnt, 16) self._test_urls(urls) def test_faiss_wikipedia(self):