diff --git a/docs/2cr/msmarco-v1-passage.html b/docs/2cr/msmarco-v1-passage.html index 5f356bd4f..457f64d9a 100644 --- a/docs/2cr/msmarco-v1-passage.html +++ b/docs/2cr/msmarco-v1-passage.html @@ -5941,11 +5941,120 @@
++Evaluation commands: + +python -m pyserini.search.faiss \ + --threads 16 --batch-size 512 \ + --index msmarco-v1-passage.openai-text-embedding-3-large \ + --topics dl19-passage --encoded-queries openai-text-embedding-3-large-dl19-passage \ + --output run.msmarco-v1-passage.openai-text-embedding-3-large.dl19.txt +
++ ++python -m pyserini.eval.trec_eval -c -l 2 -m map dl19-passage \ + run.msmarco-v1-passage.openai-text-embedding-3-large.dl19.txt +python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl19-passage \ + run.msmarco-v1-passage.openai-text-embedding-3-large.dl19.txt +python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl19-passage \ + run.msmarco-v1-passage.openai-text-embedding-3-large.dl19.txt +
++Evaluation commands: + +python -m pyserini.search.faiss \ + --threads 16 --batch-size 512 \ + --index msmarco-v1-passage.openai-text-embedding-3-large \ + --topics dl20 --encoded-queries openai-text-embedding-3-large-dl20 \ + --output run.msmarco-v1-passage.openai-text-embedding-3-large.dl20.txt +
++ ++python -m pyserini.eval.trec_eval -c -l 2 -m map dl20-passage \ + run.msmarco-v1-passage.openai-text-embedding-3-large.dl20.txt +python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl20-passage \ + run.msmarco-v1-passage.openai-text-embedding-3-large.dl20.txt +python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl20-passage \ + run.msmarco-v1-passage.openai-text-embedding-3-large.dl20.txt +
++Evaluation commands: + +python -m pyserini.search.faiss \ + --threads 16 --batch-size 512 \ + --index msmarco-v1-passage.openai-text-embedding-3-large \ + --topics msmarco-passage-dev-subset --encoded-queries openai-text-embedding-3-large-msmarco-passage-dev-subset \ + --output run.msmarco-v1-passage.openai-text-embedding-3-large.dev.txt +
++ ++python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset \ + run.msmarco-v1-passage.openai-text-embedding-3-large.dev.txt +python -m pyserini.eval.trec_eval -c -m recall.1000 msmarco-passage-dev-subset \ + run.msmarco-v1-passage.openai-text-embedding-3-large.dev.txt +
@@ -6005,7 +6114,7 @@MS MARCO V1 Passage
@@ -6029,7 +6138,7 @@MS MARCO V1 Passage
@@ -6058,7 +6167,7 @@MS MARCO V1 Passage
@@ -6120,7 +6229,7 @@MS MARCO V1 Passage
@@ -6146,7 +6255,7 @@MS MARCO V1 Passage
@@ -6177,7 +6286,7 @@MS MARCO V1 Passage
@@ -6236,7 +6345,7 @@MS MARCO V1 Passage
@@ -6259,7 +6368,7 @@MS MARCO V1 Passage
diff --git a/docs/prebuilt-indexes.md b/docs/prebuilt-indexes.md index d038b232c..01c843abb 100644 --- a/docs/prebuilt-indexes.md +++ b/docs/prebuilt-indexes.md @@ -939,6 +939,10 @@ Detailed configuration information for the pre-built indexes are stored in [`pysmsmarco-v1-passage.cohere-embed-english-v3.0
- Faiss FlatIP index of the MS MARCO passage corpus encoded by Cohere Embed English v3.0
+msmarco-v1-passage.openai-text-embedding-3-large
+[readme] +- Faiss FlatIP index of the MS MARCO passage corpus encoded by OpenAI text-embedding-3-large +
msmarco-v1-doc.ance-maxp
- Faiss FlatIP index of the MS MARCO document corpus encoded by the ANCE MaxP encoder
diff --git a/pyserini/2cr/msmarco-v1-passage.yaml b/pyserini/2cr/msmarco-v1-passage.yaml index aadf12606..0a43e6b44 100644 --- a/pyserini/2cr/msmarco-v1-passage.yaml +++ b/pyserini/2cr/msmarco-v1-passage.yaml @@ -1201,6 +1201,29 @@ conditions: - MAP: 0.4938 nDCG@10: 0.6666 R@1K: 0.8919 + - name: openai-text-embedding-3-large + display: "OpenAI text-embedding-3-large: pre-encoded queries" + display-html: "OpenAI text-embedding-3-large: pre-encoded queries" + display-row: "" + command: python -m pyserini.search.faiss --threads ${dense_threads} --batch-size ${dense_batch_size} --index msmarco-v1-passage.openai-text-embedding-3-large --topics $topics --encoded-queries openai-text-embedding-3-large-$topics --output $output + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3342 + R@1K: 0.9885 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.5259 + nDCG@10: 0.7173 + R@1K: 0.8991 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.5134 + nDCG@10: 0.7163 + R@1K: 0.8884 - name: cohere-embed-english-v3.0 display: "Cohere Embed English v3.0: pre-encoded queries" display-html: "Cohere Embed English v3.0: pre-encoded queries" diff --git a/pyserini/2cr/msmarco.py b/pyserini/2cr/msmarco.py index eba823f75..fcde99c63 100644 --- a/pyserini/2cr/msmarco.py +++ b/pyserini/2cr/msmarco.py @@ -103,6 +103,7 @@ '', 'openai-ada2', 'openai-ada2-hyde', + 'openai-text-embedding-3-large', '', 'cosdpr-distil-pytorch', '', diff --git a/pyserini/encoded_query_info.py b/pyserini/encoded_query_info.py index ee0be9ac9..76843bc1e 100644 --- a/pyserini/encoded_query_info.py +++ b/pyserini/encoded_query_info.py @@ -515,6 +515,36 @@ "total_queries": 6980, "downloaded": False }, + "openai-text-embedding-3-large-dl19-passage": { + "description": "TREC DL19 passage queries encoded by OpenAI text-embedding-3-large.", + "urls": [ + 
"https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-openai-text-embedding-3-large-dl19-passage-20240410-c13cd6.tar.gz", + ], + "md5": "a2e4ad9dc3288d97b77577552df9ee2b", + "size (bytes)": 541753, + "total_queries": 43, + "downloaded": False + }, + "openai-text-embedding-3-large-dl20": { + "description": "TREC DL20 passage queries encoded by OpenAI text-embedding-3-large.", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-openai-text-embedding-3-large-dl20-passage-20240410-c13cd6.tar.gz", + ], + "md5": "9911fb1012ff5651f0cf832a81943967", + "size (bytes)": 2515768, + "total_queries": 200, + "downloaded": False + }, + "openai-text-embedding-3-large-msmarco-passage-dev-subset": { + "description": "MS MARCO passage dev set queries encoded by OpenAI text-embedding-3-large.", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-openai-text-embedding-3-large-msmarco-passage-dev-subset-20240410-c13cd6.tar.gz", + ], + "md5": "4b0bce9c7cb0b55e49920d340924c92f", + "size (bytes)": 87687020, + "total_queries": 6980, + "downloaded": False + }, "atomic-v0.2.1-text-ViT-L-14.laion2b_s32b_b82k-validation": { "description": "AToMiC text v0.2.1 validation set encoded by ViT-L-14.laion2b_s32b_b82k.", "urls": [ diff --git a/pyserini/prebuilt_index_info.py b/pyserini/prebuilt_index_info.py index e9be97cfc..95d72d86d 100644 --- a/pyserini/prebuilt_index_info.py +++ b/pyserini/prebuilt_index_info.py @@ -3344,6 +3344,19 @@ "downloaded": False, "texts": "msmarco-v1-passage" }, + "msmarco-v1-passage.openai-text-embedding-3-large": { + "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by OpenAI text-embedding-3-large", + "filename": "faiss-flat.msmarco-v1-passage.openai-text-embedding-3-large.20240410.c13cd6.tar.gz", + "readme": "faiss-flat.msmarco-v1-passage.openai-text-embedding-3-large.20240410.c13cd6.README.md", + "urls": [ + 
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss/faiss-flat.msmarco-v1-passage.openai-text-embedding-3-large.20240410.c13cd6.tar.gz" + ], + "md5": "e52f046b1decc9bf3a55ac0ff70780d0", + "size compressed (bytes)": 87658796879, + "documents": 8841823, + "downloaded": False, + "texts": "msmarco-v1-passage" + }, "msmarco-v1-doc.ance-maxp": { "description": "Faiss FlatIP index of the MS MARCO document corpus encoded by the ANCE MaxP encoder", "filename": "faiss.msmarco-v1-doc.ance_maxp.20210304.b2a1b0.tar.gz", diff --git a/pyserini/resources/index-metadata/faiss-flat.msmarco-v1-passage.openai-text-embedding-3-large.20240410.c13cd6.README.md b/pyserini/resources/index-metadata/faiss-flat.msmarco-v1-passage.openai-text-embedding-3-large.20240410.c13cd6.README.md new file mode 100644 index 000000000..6c511f132 --- /dev/null +++ b/pyserini/resources/index-metadata/faiss-flat.msmarco-v1-passage.openai-text-embedding-3-large.20240410.c13cd6.README.md @@ -0,0 +1,8 @@ +# msmarco-v1-passage.openai-text-embedding-3-large + +Faiss FlatIP index of the MS MARCO passage corpus, msmarco-v1-passage, encoded by OpenAI text-embedding-3-large. +This index was generated on 2024/04/10 on `orca` at commit: + ++ Pyserini commit [`c13cd6`](https://github.com/castorini/pyserini/commit/c13cd630136c7290ee95ee2cba74aeee3c5cbe07) (2024/04/10) + +The corpus was encoded through the Azure OpenAI Service. Embeddings have 3072 dimensions. \ No newline at end of file diff --git a/tests/test_prebuilt_index.py b/tests/test_prebuilt_index.py index 1b980e6f3..2c84038e7 100644 --- a/tests/test_prebuilt_index.py +++ b/tests/test_prebuilt_index.py @@ -158,7 +158,7 @@ def test_faiss_msmarco(self): for url in FAISS_INDEX_INFO[key]['urls']: urls.append(url) - self.assertEqual(cnt, 18) + self.assertEqual(cnt, 19) self._test_urls(urls) def test_faiss_ciral(self):