diff --git a/docs/2cr/miracl.html b/docs/2cr/miracl.html
index 10a58f99d..19588ce62 100644
--- a/docs/2cr/miracl.html
+++ b/docs/2cr/miracl.html
@@ -655,13 +655,13 @@
MIRACL
|
mDPR (tied encoders), pre-FT w/ MS MARCO |
0.499 |
-0.462 |
+0.443 |
0.394 |
0.478 |
0.480 |
0.472 |
0.435 |
-0.390 |
+0.383 |
0.272 |
0.439 |
0.419 |
@@ -673,7 +673,7 @@ MIRACL
0.490 |
0.444 |
|
-0.422 |
+0.421 |
@@ -1136,13 +1136,13 @@ MIRACL
| |
mDPR (tied encoders), pre-FT w/ MS MARCO then FT w/ all Mr. TyDi |
0.578 |
-0.592 |
+0.580 |
0.281 |
0.251 |
0.384 |
0.569 |
0.301 |
-0.332 |
+0.329 |
0.346 |
0.500 |
0.486 |
@@ -1154,7 +1154,7 @@ MIRACL
0.322 |
0.598 |
|
-0.463 |
+0.462 |
@@ -1617,13 +1617,13 @@ MIRACL
| |
Hybrid of `bm25` and `mdpr-tied-pft-msmarco` |
0.673 |
-0.671 |
+0.654 |
0.549 |
0.641 |
0.594 |
0.672 |
0.523 |
-0.615 |
+0.616 |
0.443 |
0.576 |
0.609 |
@@ -1635,7 +1635,7 @@ MIRACL
0.564 |
0.611 |
|
-0.580 |
+0.579 |
@@ -3486,13 +3486,13 @@ MIRACL
| |
mDPR (tied encoders), pre-FT w/ MS MARCO |
0.841 |
-0.831 |
+0.819 |
0.768 |
0.864 |
0.898 |
0.788 |
0.915 |
-0.781 |
+0.776 |
0.573 |
0.825 |
0.737 |
@@ -3504,7 +3504,7 @@ MIRACL
0.898 |
0.840 |
|
-0.798 |
+0.797 |
@@ -3967,13 +3967,13 @@ MIRACL
| |
mDPR (tied encoders), pre-FT w/ MS MARCO then FT w/ all Mr. TyDi |
0.795 |
-0.866 |
+0.848 |
0.508 |
0.471 |
0.686 |
0.798 |
0.601 |
-0.635 |
+0.637 |
0.584 |
0.745 |
0.718 |
@@ -3985,7 +3985,7 @@ MIRACL
0.599 |
0.891 |
|
-0.718 |
+0.717 |
@@ -4448,13 +4448,13 @@ MIRACL
| |
Hybrid of `bm25` and `mdpr-tied-pft-msmarco` |
0.941 |
-0.949 |
+0.932 |
0.882 |
0.948 |
0.937 |
0.895 |
0.965 |
-0.915 |
+0.912 |
0.768 |
0.904 |
0.900 |
@@ -4466,7 +4466,7 @@ MIRACL
0.948 |
0.950 |
|
-0.897 |
+0.895 |
diff --git a/docs/2cr/mrtydi.html b/docs/2cr/mrtydi.html
index 012cf4fd5..0d16bf504 100644
--- a/docs/2cr/mrtydi.html
+++ b/docs/2cr/mrtydi.html
@@ -746,7 +746,7 @@ Mr.TyDi
| |
mDPR (tied encoders), pre-FT w/ NQ |
0.221 |
-0.255 |
+0.254 |
0.243 |
0.244 |
0.281 |
@@ -1046,7 +1046,7 @@ Mr.TyDi
|
mDPR (tied encoders), pre-FT w/ MS MARCO |
0.441 |
-0.417 |
+0.397 |
0.327 |
0.275 |
0.352 |
@@ -1057,7 +1057,7 @@ Mr.TyDi
0.310 |
0.269 |
|
-0.335 |
+0.333 |
|
@@ -1346,7 +1346,7 @@ Mr.TyDi
|
mDPR (tied encoders), pre-FT w/ MS MARCO, FT w/ all |
0.695 |
-0.643 |
+0.623 |
0.492 |
0.559 |
0.578 |
@@ -1357,7 +1357,7 @@ Mr.TyDi
0.891 |
0.618 |
|
-0.602 |
+0.600 |
|
@@ -2251,7 +2251,7 @@ Mr.TyDi
|
mDPR (tied encoders), pre-FT w/ NQ |
0.600 |
-0.716 |
+0.707 |
0.689 |
0.640 |
0.691 |
@@ -2262,7 +2262,7 @@ Mr.TyDi
0.245 |
0.455 |
|
-0.580 |
+0.579 |
|
@@ -2551,7 +2551,7 @@ Mr.TyDi
|
mDPR (tied encoders), pre-FT w/ MS MARCO |
0.797 |
-0.820 |
+0.784 |
0.754 |
0.647 |
0.736 |
@@ -2562,7 +2562,7 @@ Mr.TyDi
0.782 |
0.595 |
|
-0.714 |
+0.711 |
|
diff --git a/docs/2cr/msmarco-v1-passage.html b/docs/2cr/msmarco-v1-passage.html
index 21ea2a719..bd44639db 100644
--- a/docs/2cr/msmarco-v1-passage.html
+++ b/docs/2cr/msmarco-v1-passage.html
@@ -5747,7 +5747,7 @@ MS MARCO V1 Passage
|
- |
+[11] |
OpenAI ada2: pre-encoded queries |
0.4788 |
0.7035 |
@@ -5856,7 +5856,7 @@ MS MARCO V1 Passage
|
- |
+[12] |
HyDE-OpenAI ada2: pre-encoded queries |
0.5125 |
0.7163 |
@@ -5941,6 +5941,119 @@ MS MARCO V1 Passage
+
+
+ |
+
+
+ |
+[13] |
+cosDPR-distil: PyTorch |
+0.4656 |
+0.7250 |
+0.8201 |
+ |
+0.4876 |
+0.7025 |
+0.8533 |
+ |
+0.3896 |
+0.9796 |
+
+
+ |
+
+
+
+
+
+
+
+
+
+
+Command to generate run on TREC 2019 queries:
+
+
+python -m pyserini.search.faiss \
+ --threads 16 --batch-size 512 \
+ --index msmarco-v1-passage.cosdpr-distil \
+ --topics dl19-passage \
+ --encoder castorini/cosdpr-distil \
+ --output run.msmarco-v1-passage.cosdpr-distil-pytorch.dl19.txt
+
+Evaluation commands:
+
+
+python -m pyserini.eval.trec_eval -c -l 2 -m map dl19-passage \
+ run.msmarco-v1-passage.cosdpr-distil-pytorch.dl19.txt
+python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl19-passage \
+ run.msmarco-v1-passage.cosdpr-distil-pytorch.dl19.txt
+python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl19-passage \
+ run.msmarco-v1-passage.cosdpr-distil-pytorch.dl19.txt
+
+
+
+
+
+ Command to generate run on TREC 2020 queries:
+
+
+python -m pyserini.search.faiss \
+ --threads 16 --batch-size 512 \
+ --index msmarco-v1-passage.cosdpr-distil \
+ --topics dl20 \
+ --encoder castorini/cosdpr-distil \
+ --output run.msmarco-v1-passage.cosdpr-distil-pytorch.dl20.txt
+
+Evaluation commands:
+
+
+python -m pyserini.eval.trec_eval -c -l 2 -m map dl20-passage \
+ run.msmarco-v1-passage.cosdpr-distil-pytorch.dl20.txt
+python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl20-passage \
+ run.msmarco-v1-passage.cosdpr-distil-pytorch.dl20.txt
+python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl20-passage \
+ run.msmarco-v1-passage.cosdpr-distil-pytorch.dl20.txt
+
+
+
+
+
+ Command to generate run on dev queries:
+
+
+python -m pyserini.search.faiss \
+ --threads 16 --batch-size 512 \
+ --index msmarco-v1-passage.cosdpr-distil \
+ --topics msmarco-passage-dev-subset \
+ --encoder castorini/cosdpr-distil \
+ --output run.msmarco-v1-passage.cosdpr-distil-pytorch.dev.txt
+
+Evaluation commands:
+
+
+python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset \
+ run.msmarco-v1-passage.cosdpr-distil-pytorch.dev.txt
+python -m pyserini.eval.trec_eval -c -m recall.1000 msmarco-passage-dev-subset \
+ run.msmarco-v1-passage.cosdpr-distil-pytorch.dev.txt
+
+
+
+
+
+
+
|
@@ -5992,6 +6105,18 @@ MS MARCO V1 Passage
Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks.
Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), 2019.
+[11] Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian.
+Vector Search with OpenAI Embeddings: Lucene Is All You Need.
+arXiv:2308.14963, August 2023.
+
+[12] Luyu Gao, Xueguang Ma, Jimmy Lin, and Jamie Callan.
+Precise Zero-Shot Dense Retrieval without Relevance Labels.
+Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 1762-1777, July 2023, Toronto, Canada.
+
+[13] Xueguang Ma, Tommaso Teofili, and Jimmy Lin.
+Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.
+Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023), pages 5366-5370, October 2023, Birmingham, United Kingdom.
+
diff --git a/pyserini/2cr/msmarco-v1-passage.yaml b/pyserini/2cr/msmarco-v1-passage.yaml
index 8389603a9..aa4bdba91 100644
--- a/pyserini/2cr/msmarco-v1-passage.yaml
+++ b/pyserini/2cr/msmarco-v1-passage.yaml
@@ -1,4 +1,27 @@
conditions:
+ - name: cosdpr-distil-pytorch
+ display: "cosDPR-distil: PyTorch"
+ display-html: "cosDPR-distil: PyTorch"
+ display-row: "[13]"
+ command: python -m pyserini.search.faiss --threads ${dense_threads} --batch-size ${dense_batch_size} --index msmarco-v1-passage.cosdpr-distil --topics $topics --encoder castorini/cosdpr-distil --output $output
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3896
+ R@1K: 0.9796
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4656
+ nDCG@10: 0.7250
+ R@1K: 0.8201
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4876
+ nDCG@10: 0.7025
+ R@1K: 0.8533
- name: splade-pp-ed-rocchio-pytorch
display: "SPLADE++ EnsembleDistil w/ Rocchio: PyTorch"
display-html: "SPLADE++ EnsembleDistil w/ Rocchio: PyTorch"
@@ -1117,6 +1140,7 @@ conditions:
- name: openai-ada2
display: "OpenAI ada2: pre-encoded queries"
display-html: "OpenAI ada2: pre-encoded queries"
+ display-row: "[11]"
command: python -m pyserini.search.faiss --threads ${sparse_threads} --batch-size ${sparse_batch_size} --index msmarco-v1-passage.openai-ada2 --topics $topics --encoded-queries openai-ada2-$topics --output $output
topics:
- topic_key: msmarco-passage-dev-subset
@@ -1139,6 +1163,7 @@ conditions:
- name: openai-ada2-hyde
display: "HyDE-OpenAI ada2: pre-encoded queries"
display-html: "HyDE-OpenAI ada2: pre-encoded queries"
+ display-row: "[12]"
command: python -m pyserini.search.faiss --threads ${sparse_threads} --batch-size ${sparse_batch_size} --index msmarco-v1-passage.openai-ada2 --topics $topics --encoded-queries openai-ada2-$topics-hyde --output $output
topics:
- topic_key: dl19-passage
diff --git a/pyserini/2cr/msmarco.py b/pyserini/2cr/msmarco.py
index cf4d6a507..bb732d319 100644
--- a/pyserini/2cr/msmarco.py
+++ b/pyserini/2cr/msmarco.py
@@ -102,7 +102,10 @@
'aggretriever-cocondenser-pytorch',
'',
'openai-ada2',
- 'openai-ada2-hyde'],
+ 'openai-ada2-hyde',
+ '',
+ 'cosdpr-distil-pytorch'
+ ],
# MS MARCO v1 doc
'msmarco-v1-doc':
diff --git a/pyserini/2cr/msmarco_html_v1_passage.template b/pyserini/2cr/msmarco_html_v1_passage.template
index 19e5bf6e4..80fe5cfbb 100644
--- a/pyserini/2cr/msmarco_html_v1_passage.template
+++ b/pyserini/2cr/msmarco_html_v1_passage.template
@@ -223,6 +223,18 @@ $rows
Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks.
Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), 2019.
+[11] Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian.
+Vector Search with OpenAI Embeddings: Lucene Is All You Need.
+arXiv:2308.14963, August 2023.
+
+[12] Luyu Gao, Xueguang Ma, Jimmy Lin, and Jamie Callan.
+Precise Zero-Shot Dense Retrieval without Relevance Labels.
+Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 1762-1777, July 2023, Toronto, Canada.
+
+[13] Xueguang Ma, Tommaso Teofili, and Jimmy Lin.
+Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.
+Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023), pages 5366-5370, October 2023, Birmingham, United Kingdom.
+
diff --git a/pyserini/prebuilt_index_info.py b/pyserini/prebuilt_index_info.py
index 29a017154..940738352 100644
--- a/pyserini/prebuilt_index_info.py
+++ b/pyserini/prebuilt_index_info.py
@@ -3130,9 +3130,22 @@
**IMPACT_INDEX_INFO_BEIR}
FAISS_INDEX_INFO_MSMARCO = {
+ "msmarco-v1-passage.cosdpr-distil": {
+ "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by cosDPR-distil.",
+ "filename": "faiss.msmarco-v1-passage.cosdpr-distil.20221023.tar.gz",
+ "urls": [
+ "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.cosdpr-distil.20221023.tar.gz"
+ ],
+ "md5": "02018b5797bf1e7ebe6e2f552319696a",
+ "size compressed (bytes)": 23843194944,
+ "documents": 8841823,
+ "downloaded": False,
+ "texts": "msmarco-v1-passage"
+ },
+
# Aggretriever indexes
"msmarco-v1-passage.aggretriever-cocondenser": {
- "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by aggretriever-cocondenser encoder.",
+ "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by aggretriever-cocondenser.",
"filename": "faiss.msmarco-v1-passage.aggretriever-cocondenser.20230407.f627ef.tar.gz",
"urls": [
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.aggretriever-cocondenser.20230407.f627ef.tar.gz"
@@ -3144,7 +3157,7 @@
"texts": "msmarco-v1-passage"
},
"msmarco-v1-passage.aggretriever-distilbert": {
- "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by aggretriever-distilbert encoder.",
+ "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by aggretriever-distilbert.",
"filename": "faiss.msmarco-v1-passage.aggretriever-distilbert.20230407.f627ef.tar.gz",
"urls": [
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.aggretriever-distilbert.20230407.f627ef.tar.gz"
diff --git a/tests/test_prebuilt_index.py b/tests/test_prebuilt_index.py
index 9d1d634c7..3bc1a70c7 100644
--- a/tests/test_prebuilt_index.py
+++ b/tests/test_prebuilt_index.py
@@ -145,7 +145,7 @@ def test_faiss_msmarco(self):
for url in FAISS_INDEX_INFO[key]['urls']:
urls.append(url)
- self.assertEqual(cnt, 15)
+ self.assertEqual(cnt, 16)
self._test_urls(urls)
def test_faiss_wikipedia(self):
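
As an aside on the new prebuilt index: the `msmarco-v1-passage.cosdpr-distil` entry and the `castorini/cosdpr-distil` encoder wired up above can also be exercised through Pyserini's Python API rather than the CLI commands shown in the HTML. The snippet below is a minimal sketch, assuming the standard FaissSearcher.from_prebuilt_index and AutoQueryEncoder interfaces; the sample query and output formatting are illustrative only, and since the index tarball is roughly 24 GB compressed this is meant as a smoke test rather than a regression run.

from pyserini.search.faiss import AutoQueryEncoder, FaissSearcher

# Query encoder for the new condition. Default AutoQueryEncoder settings are
# assumed here; the CLI may configure pooling/normalization differently for
# cosDPR-distil.
encoder = AutoQueryEncoder('castorini/cosdpr-distil', device='cpu')

# Downloads and opens the prebuilt Faiss FlatIP index registered in
# prebuilt_index_info.py above.
searcher = FaissSearcher.from_prebuilt_index('msmarco-v1-passage.cosdpr-distil', encoder)

# Illustrative query; any MS MARCO dev query would do.
hits = searcher.search('what is the capital of france', k=10)
for i, hit in enumerate(hits):
    print(f'{i + 1:2} {hit.docid:10} {hit.score:.5f}')

The CLI commands above (pyserini.search.faiss with --encoder castorini/cosdpr-distil, followed by pyserini.eval.trec_eval) remain the authoritative way to reproduce the reported figures, e.g. the 0.3896 MRR@10 on the dev subset.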