From 442e7e1026728f29cc3a9d3e684c561637ad1d7b Mon Sep 17 00:00:00 2001 From: Manveer Tamber Date: Thu, 22 Feb 2024 21:45:02 -0500 Subject: [PATCH] Add cohere-embed-english-v3.0 to msmarco-v1-passage 2cr (#1794) --- docs/2cr/msmarco-v1-passage.html | 124 +++++++++++++++++++++++++-- docs/prebuilt-indexes.md | 5 +- pyserini/2cr/msmarco-v1-passage.yaml | 25 +++++- pyserini/2cr/msmarco.py | 2 + pyserini/encoded_query_info.py | 30 +++++++ pyserini/prebuilt_index_info.py | 15 +++- 6 files changed, 192 insertions(+), 9 deletions(-) diff --git a/docs/2cr/msmarco-v1-passage.html b/docs/2cr/msmarco-v1-passage.html index e405e3054..0b4782b60 100644 --- a/docs/2cr/msmarco-v1-passage.html +++ b/docs/2cr/msmarco-v1-passage.html @@ -6062,16 +6062,16 @@

MS MARCO V1 Passage

[14] BGE-base-en-v1.5: PyTorch -0.4435 -0.7065 +0.4436 +0.7055 0.8472 -0.4650 +0.4651 0.6780 0.8503 -0.3896 -0.9796 +0.3557 +0.9814 @@ -6173,6 +6173,116 @@

MS MARCO V1 Passage

+ + + + + + + +Cohere Embed English v3.0: pre-encoded queries +0.4884 +0.6956 +0.8630 + +0.5067 +0.7245 +0.8682 + +0.3660 +0.9785 + + + + +
+ + + + + + +
+
+Command to generate run on TREC 2019 queries: + +
+
python -m pyserini.search.faiss \
+  --threads 16 --batch-size 512 \
+  --index msmarco-v1-passage.cohere-embed-english-v3.0 \
+  --topics dl19-passage --encoded-queries cohere-embed-english-v3.0-dl19-passage \
+  --output run.msmarco-v1-passage.cohere-embed-english-v3.0.dl19.txt
+
+Evaluation commands: + +
+
python -m pyserini.eval.trec_eval -c -l 2 -m map dl19-passage \
+  run.msmarco-v1-passage.cohere-embed-english-v3.0.dl19.txt
+python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl19-passage \
+  run.msmarco-v1-passage.cohere-embed-english-v3.0.dl19.txt
+python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl19-passage \
+  run.msmarco-v1-passage.cohere-embed-english-v3.0.dl19.txt
+
+
+ +
+
+ Command to generate run on TREC 2020 queries: + +
+
python -m pyserini.search.faiss \
+  --threads 16 --batch-size 512 \
+  --index msmarco-v1-passage.cohere-embed-english-v3.0 \
+  --topics dl20 --encoded-queries cohere-embed-english-v3.0-dl20 \
+  --output run.msmarco-v1-passage.cohere-embed-english-v3.0.dl20.txt
+
+Evaluation commands: + +
+
python -m pyserini.eval.trec_eval -c -l 2 -m map dl20-passage \
+  run.msmarco-v1-passage.cohere-embed-english-v3.0.dl20.txt
+python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl20-passage \
+  run.msmarco-v1-passage.cohere-embed-english-v3.0.dl20.txt
+python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl20-passage \
+  run.msmarco-v1-passage.cohere-embed-english-v3.0.dl20.txt
+
+
+ +
+
+ Command to generate run on dev queries: + +
+
python -m pyserini.search.faiss \
+  --threads 16 --batch-size 512 \
+  --index msmarco-v1-passage.cohere-embed-english-v3.0 \
+  --topics msmarco-passage-dev-subset --encoded-queries cohere-embed-english-v3.0-msmarco-passage-dev-subset \
+  --output run.msmarco-v1-passage.cohere-embed-english-v3.0.dev.txt
+
+Evaluation commands: + +
+
python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset \
+  run.msmarco-v1-passage.cohere-embed-english-v3.0.dev.txt
+python -m pyserini.eval.trec_eval -c -m recall.1000 msmarco-passage-dev-subset \
+  run.msmarco-v1-passage.cohere-embed-english-v3.0.dev.txt
+
+
+ +
+
+ +
@@ -6236,6 +6346,10 @@

MS MARCO V1 Passage

Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes. Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023), October 2023, pages 5366–5370, Birmingham, the United Kingdom.

+
  • [14] Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. +C-Pack: Packaged Resources To Advance General Chinese Embedding. +arXiv:2309.07597, December 2023.

  • +
    diff --git a/docs/prebuilt-indexes.md b/docs/prebuilt-indexes.md index 51aacb7b4..0c114164c 100644 --- a/docs/prebuilt-indexes.md +++ b/docs/prebuilt-indexes.md @@ -1069,7 +1069,10 @@ Detailed configuration information for the pre-built indexes are stored in [`pys
    Faiss FlatIP index of the MS MARCO passage corpus encoded by the tct_colbert-v2-hnp passage encoder
    msmarco-v1-passage.openai-ada2 -
    Faiss FlatIP index of the MS MARCO document corpus encoded by TCT-ColBERT-V2-HNP +
    Faiss FlatIP index of the MS MARCO passage corpus encoded by OpenAI ada2 +
    +
    msmarco-v1-passage.cohere-embed-english-v3.0 +
    Faiss FlatIP index of the MS MARCO passage corpus encoded by Cohere Embed English v3.0
    msmarco-v1-doc.ance-maxp
    Faiss FlatIP index of the MS MARCO document corpus encoded by the ANCE MaxP encoder diff --git a/pyserini/2cr/msmarco-v1-passage.yaml b/pyserini/2cr/msmarco-v1-passage.yaml index 432dec9e2..e59027cc7 100644 --- a/pyserini/2cr/msmarco-v1-passage.yaml +++ b/pyserini/2cr/msmarco-v1-passage.yaml @@ -1200,4 +1200,27 @@ conditions: scores: - MAP: 0.4938 nDCG@10: 0.6666 - R@1K: 0.8919 \ No newline at end of file + R@1K: 0.8919 + - name: cohere-embed-english-v3.0 + display: "Cohere Embed English v3.0: pre-encoded queries" + display-html: "Cohere Embed English v3.0: pre-encoded queries" + display-row: "" + command: python -m pyserini.search.faiss --threads ${dense_threads} --batch-size ${dense_batch_size} --index msmarco-v1-passage.cohere-embed-english-v3.0 --topics $topics --encoded-queries cohere-embed-english-v3.0-$topics --output $output + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3660 + R@1K: 0.9785 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4884 + nDCG@10: 0.6956 + R@1K: 0.8630 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.5067 + nDCG@10: 0.7245 + R@1K: 0.8682 \ No newline at end of file diff --git a/pyserini/2cr/msmarco.py b/pyserini/2cr/msmarco.py index bbad7863d..2cb0e7352 100644 --- a/pyserini/2cr/msmarco.py +++ b/pyserini/2cr/msmarco.py @@ -107,6 +107,8 @@ 'cosdpr-distil-pytorch', '', 'bge-base-en-v1.5-pytorch', + '', + 'cohere-embed-english-v3.0', ], # MS MARCO v1 doc diff --git a/pyserini/encoded_query_info.py b/pyserini/encoded_query_info.py index f78b53c4c..ee0be9ac9 100644 --- a/pyserini/encoded_query_info.py +++ b/pyserini/encoded_query_info.py @@ -485,6 +485,36 @@ "total_queries": 6980, "downloaded": False }, + "cohere-embed-english-v3.0-dl19-passage": { + "description": "TREC DL19 passage queries encoded by Cohere Embed English v3.0.", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-cohere-embed-english-v3.0-dl19-passage-20240216-2154e79.tar.gz", + ], + "md5": "04300156fe6be309b8d83270dbb328c6", + "size (bytes)": 141545, + "total_queries": 43, + "downloaded": False + }, + "cohere-embed-english-v3.0-dl20": { + "description": "TREC DL20 passage queries encoded by Cohere Embed English v3.0.", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-cohere-embed-english-v3.0-dl20-passage-20240216-2154e79.tar.gz", + ], + "md5": "0b12d7049ba46f1ebe1ae07f0e7c1723", + "size (bytes)": 646705, + "total_queries": 200, + "downloaded": False + }, + "cohere-embed-english-v3.0-msmarco-passage-dev-subset": { + "description": "MS MARCO passage dev set queries encoded by Cohere Embed English v3.0.", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-cohere-embed-english-v3.0-msmarco-passage-dev-subset-20240216-2154e79.tar.gz", + ], + "md5": "7dd0026490117e9e6f6acfc110d6ce83", + "size (bytes)": 22377230, + "total_queries": 6980, + "downloaded": False + }, "atomic-v0.2.1-text-ViT-L-14.laion2b_s32b_b82k-validation": { "description": "AToMiC text v0.2.1 validation set encoded by ViT-L-14.laion2b_s32b_b82k.", "urls": [ diff --git a/pyserini/prebuilt_index_info.py b/pyserini/prebuilt_index_info.py index 17cf48d16..40fe2ff5c 100644 --- a/pyserini/prebuilt_index_info.py +++ b/pyserini/prebuilt_index_info.py @@ -3756,7 +3756,7 @@ "texts": "msmarco-v1-passage" }, "msmarco-v1-passage.openai-ada2": { - "description": "Faiss FlatIP index of the MS MARCO document corpus encoded by TCT-ColBERT-V2-HNP", + "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by OpenAI ada2", "filename": "faiss.msmarco-v1-passage.openai-ada2.20230530.e3a58f.tar.gz", "urls": [ "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.openai-ada2.20230530.e3a58f.tar.gz" @@ -3767,7 +3767,18 @@ "downloaded": False, "texts": "msmarco-v1-passage" }, - + "msmarco-v1-passage.cohere-embed-english-v3.0": { + "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by Cohere Embed English v3.0", + "filename": "faiss.msmarco-v1-passage.cohere-embed-english-v3.0.20240216.2154e79.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.cohere-embed-english-v3.0.20240216.2154e79.tar.gz" + ], + "md5": "df0d8e2aac71fb3ee8b554bdcf158f95", + "size compressed (bytes)": 21341576907, + "documents": 8841823, + "downloaded": False, + "texts": "msmarco-v1-passage" + }, "msmarco-v1-doc.ance-maxp": { "description": "Faiss FlatIP index of the MS MARCO document corpus encoded by the ANCE MaxP encoder", "filename": "faiss.msmarco-v1-doc.ance_maxp.20210304.b2a1b0.tar.gz",