From a6f4d6a893aa48aac340fcceb97b0dda7d84b491 Mon Sep 17 00:00:00 2001 From: Jimmy Lin Date: Mon, 27 May 2024 19:54:06 -0400 Subject: [PATCH] Repackage prebuilt indexes for MS MARCO V1 LTR + Mr.TyDi (Lucene) (#1895) Adopts new, consistent schema for naming --- docs/2cr/mrtydi.html | 44 ++-- docs/2cr/msmarco-v2-passage.html | 90 ++++---- docs/prebuilt-indexes.md | 58 +++--- .../sparse/test_lucenesearcher_check_irst.py | 1 - ...cenesearcher_check_ltr_msmarco_document.py | 2 +- ...ucenesearcher_check_ltr_msmarco_passage.py | 2 +- pyserini/2cr/mrtydi.yaml | 22 +- pyserini/prebuilt_index_info.py | 193 +++++++++--------- ...tydi-v1.1-arabic.20220108.6fcb89.README.md | 13 -- ...tydi-v1.1-arabic.20220928.b5ecc5.README.md | 17 -- ...ydi-v1.1-bengali.20220108.6fcb89.README.md | 13 -- ...ydi-v1.1-bengali.20220928.b5ecc5.README.md | 17 -- ...ydi-v1.1-english.20220108.6fcb89.README.md | 13 -- ...ydi-v1.1-english.20220928.b5ecc5.README.md | 17 -- ...ydi-v1.1-finnish.20220108.6fcb89.README.md | 13 -- ...ydi-v1.1-finnish.20220928.b5ecc5.README.md | 17 -- ...-v1.1-indonesian.20220108.6fcb89.README.md | 13 -- ...-v1.1-indonesian.20220928.b5ecc5.README.md | 17 -- ...di-v1.1-japanese.20220108.6fcb89.README.md | 13 -- ...di-v1.1-japanese.20220928.b5ecc5.README.md | 17 -- ...tydi-v1.1-korean.20220108.6fcb89.README.md | 13 -- ...tydi-v1.1-korean.20220928.b5ecc5.README.md | 17 -- ...ydi-v1.1-russian.20220108.6fcb89.README.md | 13 -- ...ydi-v1.1-russian.20220928.b5ecc5.README.md | 17 -- ...ydi-v1.1-swahili.20220108.6fcb89.README.md | 16 -- ...ydi-v1.1-swahili.20220928.b5ecc5.README.md | 17 -- ...tydi-v1.1-telugu.20220108.6fcb89.README.md | 16 -- ...tydi-v1.1-telugu.20220928.b5ecc5.README.md | 17 -- ...mrtydi-v1.1-thai.20220108.6fcb89.README.md | 13 -- ...mrtydi-v1.1-thai.20220928.b5ecc5.README.md | 17 -- ...rted.mrtydi-v1.1.20220928.b5ecc5.README.md | 160 +++++++++++++++ ...segmented.ltr.20211031.33e4151.README.txt} | 0 ...1-passage.ltr.20210519.e25e33f.README.txt} | 0 pyserini/search/lucene/ltr/__main__.py | 62 +++--- pyserini/search/lucene/ltr/_search_msmarco.py | 5 +- tests/test_prebuilt_index.py | 4 +- 36 files changed, 399 insertions(+), 580 deletions(-) delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-arabic.20220108.6fcb89.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-arabic.20220928.b5ecc5.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-bengali.20220108.6fcb89.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-bengali.20220928.b5ecc5.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-english.20220108.6fcb89.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-english.20220928.b5ecc5.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-finnish.20220108.6fcb89.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-finnish.20220928.b5ecc5.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-indonesian.20220108.6fcb89.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-indonesian.20220928.b5ecc5.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-japanese.20220108.6fcb89.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-japanese.20220928.b5ecc5.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-korean.20220108.6fcb89.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-korean.20220928.b5ecc5.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-russian.20220108.6fcb89.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-russian.20220928.b5ecc5.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-swahili.20220108.6fcb89.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-swahili.20220928.b5ecc5.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-telugu.20220108.6fcb89.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-telugu.20220928.b5ecc5.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-thai.20220108.6fcb89.README.md delete mode 100644 pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-thai.20220928.b5ecc5.README.md create mode 100644 pyserini/resources/index-metadata/lucene-inverted.mrtydi-v1.1.20220928.b5ecc5.README.md rename pyserini/resources/index-metadata/{index-msmarco-doc-per-passage-ltr-readme.txt => lucene-inverted.msmarco-v1-doc-segmented.ltr.20211031.33e4151.README.txt} (100%) rename pyserini/resources/index-metadata/{index-msmarco-passage-ltr-20210519-e25e33f-readme.txt => lucene-inverted.msmarco-v1-passage.ltr.20210519.e25e33f.README.txt} (100%) diff --git a/docs/2cr/mrtydi.html b/docs/2cr/mrtydi.html index 0d16bf504..507b6f946 100644 --- a/docs/2cr/mrtydi.html +++ b/docs/2cr/mrtydi.html @@ -235,7 +235,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language ar \ --topics mrtydi-v1.1-arabic-test \ - --index mrtydi-v1.1-arabic \ + --index mrtydi-v1.1-ar \ --output run.mrtydi.bm25.ar.test.txt --bm25 --hits 100 Evaluation commands: @@ -255,7 +255,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language bn \ --topics mrtydi-v1.1-bengali-test \ - --index mrtydi-v1.1-bengali \ + --index mrtydi-v1.1-bn \ --output run.mrtydi.bm25.bn.test.txt --bm25 --hits 100 Evaluation commands: @@ -275,7 +275,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language en \ --topics mrtydi-v1.1-english-test \ - --index mrtydi-v1.1-english \ + --index mrtydi-v1.1-en \ --output run.mrtydi.bm25.en.test.txt --bm25 --hits 100 Evaluation commands: @@ -295,7 +295,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language fi \ --topics mrtydi-v1.1-finnish-test \ - --index mrtydi-v1.1-finnish \ + --index mrtydi-v1.1-fi \ --output run.mrtydi.bm25.fi.test.txt --bm25 --hits 100 Evaluation commands: @@ -315,7 +315,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language id \ --topics mrtydi-v1.1-indonesian-test \ - --index mrtydi-v1.1-indonesian \ + --index mrtydi-v1.1-id \ --output run.mrtydi.bm25.id.test.txt --bm25 --hits 100 Evaluation commands: @@ -335,7 +335,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language ja \ --topics mrtydi-v1.1-japanese-test \ - --index mrtydi-v1.1-japanese \ + --index mrtydi-v1.1-ja \ --output run.mrtydi.bm25.ja.test.txt --bm25 --hits 100 Evaluation commands: @@ -355,7 +355,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language ko \ --topics mrtydi-v1.1-korean-test \ - --index mrtydi-v1.1-korean \ + --index mrtydi-v1.1-ko \ --output run.mrtydi.bm25.ko.test.txt --bm25 --hits 100 Evaluation commands: @@ -375,7 +375,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language ru \ --topics mrtydi-v1.1-russian-test \ - --index mrtydi-v1.1-russian \ + --index mrtydi-v1.1-ru \ --output run.mrtydi.bm25.ru.test.txt --bm25 --hits 100 Evaluation commands: @@ -395,7 +395,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language sw \ --topics mrtydi-v1.1-swahili-test \ - --index mrtydi-v1.1-swahili \ + --index mrtydi-v1.1-sw \ --output run.mrtydi.bm25.sw.test.txt --bm25 --hits 100 Evaluation commands: @@ -415,7 +415,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language te \ --topics mrtydi-v1.1-telugu-test \ - --index mrtydi-v1.1-telugu \ + --index mrtydi-v1.1-te \ --output run.mrtydi.bm25.te.test.txt --bm25 --hits 100 Evaluation commands: @@ -435,7 +435,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language th \ --topics mrtydi-v1.1-thai-test \ - --index mrtydi-v1.1-thai \ + --index mrtydi-v1.1-th \ --output run.mrtydi.bm25.th.test.txt --bm25 --hits 100 Evaluation commands: @@ -1740,7 +1740,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language ar \ --topics mrtydi-v1.1-arabic-test \ - --index mrtydi-v1.1-arabic \ + --index mrtydi-v1.1-ar \ --output run.mrtydi.bm25.ar.test.txt --bm25 --hits 100 Evaluation commands: @@ -1760,7 +1760,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language bn \ --topics mrtydi-v1.1-bengali-test \ - --index mrtydi-v1.1-bengali \ + --index mrtydi-v1.1-bn \ --output run.mrtydi.bm25.bn.test.txt --bm25 --hits 100 Evaluation commands: @@ -1780,7 +1780,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language en \ --topics mrtydi-v1.1-english-test \ - --index mrtydi-v1.1-english \ + --index mrtydi-v1.1-en \ --output run.mrtydi.bm25.en.test.txt --bm25 --hits 100 Evaluation commands: @@ -1800,7 +1800,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language fi \ --topics mrtydi-v1.1-finnish-test \ - --index mrtydi-v1.1-finnish \ + --index mrtydi-v1.1-fi \ --output run.mrtydi.bm25.fi.test.txt --bm25 --hits 100 Evaluation commands: @@ -1820,7 +1820,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language id \ --topics mrtydi-v1.1-indonesian-test \ - --index mrtydi-v1.1-indonesian \ + --index mrtydi-v1.1-id \ --output run.mrtydi.bm25.id.test.txt --bm25 --hits 100 Evaluation commands: @@ -1840,7 +1840,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language ja \ --topics mrtydi-v1.1-japanese-test \ - --index mrtydi-v1.1-japanese \ + --index mrtydi-v1.1-ja \ --output run.mrtydi.bm25.ja.test.txt --bm25 --hits 100 Evaluation commands: @@ -1860,7 +1860,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language ko \ --topics mrtydi-v1.1-korean-test \ - --index mrtydi-v1.1-korean \ + --index mrtydi-v1.1-ko \ --output run.mrtydi.bm25.ko.test.txt --bm25 --hits 100 Evaluation commands: @@ -1880,7 +1880,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language ru \ --topics mrtydi-v1.1-russian-test \ - --index mrtydi-v1.1-russian \ + --index mrtydi-v1.1-ru \ --output run.mrtydi.bm25.ru.test.txt --bm25 --hits 100 Evaluation commands: @@ -1900,7 +1900,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language sw \ --topics mrtydi-v1.1-swahili-test \ - --index mrtydi-v1.1-swahili \ + --index mrtydi-v1.1-sw \ --output run.mrtydi.bm25.sw.test.txt --bm25 --hits 100 Evaluation commands: @@ -1920,7 +1920,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language te \ --topics mrtydi-v1.1-telugu-test \ - --index mrtydi-v1.1-telugu \ + --index mrtydi-v1.1-te \ --output run.mrtydi.bm25.te.test.txt --bm25 --hits 100 Evaluation commands: @@ -1940,7 +1940,7 @@

Mr.TyDi

--threads 16 --batch-size 128 \ --language th \ --topics mrtydi-v1.1-thai-test \ - --index mrtydi-v1.1-thai \ + --index mrtydi-v1.1-th \ --output run.mrtydi.bm25.th.test.txt --bm25 --hits 100 Evaluation commands: diff --git a/docs/2cr/msmarco-v2-passage.html b/docs/2cr/msmarco-v2-passage.html index d335ea256..fc5781d47 100644 --- a/docs/2cr/msmarco-v2-passage.html +++ b/docs/2cr/msmarco-v2-passage.html @@ -875,7 +875,7 @@

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-d2q-t5 \
+  --index msmarco-v2-passage.d2q-t5 \
   --topics dl21 \
   --output run.msmarco-v2-passage.bm25-d2q-t5-default.dl21.txt \
   --bm25
@@ -896,7 +896,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-d2q-t5 \
+  --index msmarco-v2-passage.d2q-t5 \
   --topics dl22 \
   --output run.msmarco-v2-passage.bm25-d2q-t5-default.dl22.txt \
   --bm25
@@ -917,7 +917,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-d2q-t5 \
+  --index msmarco-v2-passage.d2q-t5 \
   --topics dl23 \
   --output run.msmarco-v2-passage.bm25-d2q-t5-default.dl23.txt \
   --bm25
@@ -938,7 +938,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-d2q-t5 \
+  --index msmarco-v2-passage.d2q-t5 \
   --topics msmarco-v2-passage-dev \
   --output run.msmarco-v2-passage.bm25-d2q-t5-default.dev.txt \
   --bm25
@@ -958,7 +958,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-d2q-t5 \
+  --index msmarco-v2-passage.d2q-t5 \
   --topics msmarco-v2-passage-dev2 \
   --output run.msmarco-v2-passage.bm25-d2q-t5-default.dev2.txt \
   --bm25
@@ -1033,7 +1033,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-augmented-d2q-t5 \
+  --index msmarco-v2-passage-augmented.d2q-t5 \
   --topics dl21 \
   --output run.msmarco-v2-passage.bm25-d2q-t5-augmented-default.dl21.txt \
   --bm25
@@ -1054,7 +1054,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-augmented-d2q-t5 \
+  --index msmarco-v2-passage-augmented.d2q-t5 \
   --topics dl22 \
   --output run.msmarco-v2-passage.bm25-d2q-t5-augmented-default.dl22.txt \
   --bm25
@@ -1075,7 +1075,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-augmented-d2q-t5 \
+  --index msmarco-v2-passage-augmented.d2q-t5 \
   --topics dl23 \
   --output run.msmarco-v2-passage.bm25-d2q-t5-augmented-default.dl23.txt \
   --bm25
@@ -1096,7 +1096,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-augmented-d2q-t5 \
+  --index msmarco-v2-passage-augmented.d2q-t5 \
   --topics msmarco-v2-passage-dev \
   --output run.msmarco-v2-passage.bm25-d2q-t5-augmented-default.dev.txt \
   --bm25
@@ -1116,7 +1116,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-augmented-d2q-t5 \
+  --index msmarco-v2-passage-augmented.d2q-t5 \
   --topics msmarco-v2-passage-dev2 \
   --output run.msmarco-v2-passage.bm25-d2q-t5-augmented-default.dev2.txt \
   --bm25
@@ -1191,7 +1191,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-d2q-t5-docvectors \
+  --index msmarco-v2-passage.d2q-t5-docvectors \
   --topics dl21 \
   --output run.msmarco-v2-passage.bm25-rm3-d2q-t5-default.dl21.txt \
   --bm25 --rm3
@@ -1212,7 +1212,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-d2q-t5-docvectors \
+  --index msmarco-v2-passage.d2q-t5-docvectors \
   --topics dl22 \
   --output run.msmarco-v2-passage.bm25-rm3-d2q-t5-default.dl22.txt \
   --bm25 --rm3
@@ -1233,7 +1233,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-d2q-t5-docvectors \
+  --index msmarco-v2-passage.d2q-t5-docvectors \
   --topics dl23 \
   --output run.msmarco-v2-passage.bm25-rm3-d2q-t5-default.dl23.txt \
   --bm25 --rm3
@@ -1254,7 +1254,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-d2q-t5-docvectors \
+  --index msmarco-v2-passage.d2q-t5-docvectors \
   --topics msmarco-v2-passage-dev \
   --output run.msmarco-v2-passage.bm25-rm3-d2q-t5-default.dev.txt \
   --bm25 --rm3
@@ -1274,7 +1274,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-d2q-t5-docvectors \
+  --index msmarco-v2-passage.d2q-t5-docvectors \
   --topics msmarco-v2-passage-dev2 \
   --output run.msmarco-v2-passage.bm25-rm3-d2q-t5-default.dev2.txt \
   --bm25 --rm3
@@ -1349,7 +1349,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-augmented-d2q-t5-docvectors \
+  --index msmarco-v2-passage-augmented.d2q-t5-docvectors \
   --topics dl21 \
   --output run.msmarco-v2-passage.bm25-rm3-d2q-t5-augmented-default.dl21.txt \
   --bm25 --rm3
@@ -1370,7 +1370,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-augmented-d2q-t5-docvectors \
+  --index msmarco-v2-passage-augmented.d2q-t5-docvectors \
   --topics dl22 \
   --output run.msmarco-v2-passage.bm25-rm3-d2q-t5-augmented-default.dl22.txt \
   --bm25 --rm3
@@ -1391,7 +1391,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-augmented-d2q-t5-docvectors \
+  --index msmarco-v2-passage-augmented.d2q-t5-docvectors \
   --topics dl23 \
   --output run.msmarco-v2-passage.bm25-rm3-d2q-t5-augmented-default.dl23.txt \
   --bm25 --rm3
@@ -1412,7 +1412,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-augmented-d2q-t5-docvectors \
+  --index msmarco-v2-passage-augmented.d2q-t5-docvectors \
   --topics msmarco-v2-passage-dev \
   --output run.msmarco-v2-passage.bm25-rm3-d2q-t5-augmented-default.dev.txt \
   --bm25 --rm3
@@ -1432,7 +1432,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-augmented-d2q-t5-docvectors \
+  --index msmarco-v2-passage-augmented.d2q-t5-docvectors \
   --topics msmarco-v2-passage-dev2 \
   --output run.msmarco-v2-passage.bm25-rm3-d2q-t5-augmented-default.dev2.txt \
   --bm25 --rm3
@@ -1508,7 +1508,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-noexp-0shot \
+  --index msmarco-v2-passage.unicoil-noexp-0shot \
   --topics dl21-unicoil-noexp \
   --output run.msmarco-v2-passage.unicoil-noexp.dl21.txt \
   --hits 1000 --impact
@@ -1529,7 +1529,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-noexp-0shot \
+  --index msmarco-v2-passage.unicoil-noexp-0shot \
   --topics dl22-unicoil-noexp \
   --output run.msmarco-v2-passage.unicoil-noexp.dl22.txt \
   --hits 1000 --impact
@@ -1550,7 +1550,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-noexp-0shot \
+  --index msmarco-v2-passage.unicoil-noexp-0shot \
   --topics dl23-unicoil-noexp \
   --output run.msmarco-v2-passage.unicoil-noexp.dl23.txt \
   --hits 1000 --impact
@@ -1571,7 +1571,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-noexp-0shot \
+  --index msmarco-v2-passage.unicoil-noexp-0shot \
   --topics msmarco-v2-passage-dev-unicoil-noexp \
   --output run.msmarco-v2-passage.unicoil-noexp.dev.txt \
   --hits 1000 --impact
@@ -1591,7 +1591,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-noexp-0shot \
+  --index msmarco-v2-passage.unicoil-noexp-0shot \
   --topics msmarco-v2-passage-dev2-unicoil-noexp \
   --output run.msmarco-v2-passage.unicoil-noexp.dev2.txt \
   --hits 1000 --impact
@@ -1666,7 +1666,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-0shot \
+  --index msmarco-v2-passage.unicoil-0shot \
   --topics dl21-unicoil \
   --output run.msmarco-v2-passage.unicoil.dl21.txt \
   --hits 1000 --impact
@@ -1687,7 +1687,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-0shot \
+  --index msmarco-v2-passage.unicoil-0shot \
   --topics dl22-unicoil \
   --output run.msmarco-v2-passage.unicoil.dl22.txt \
   --hits 1000 --impact
@@ -1708,7 +1708,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-0shot \
+  --index msmarco-v2-passage.unicoil-0shot \
   --topics dl23-unicoil \
   --output run.msmarco-v2-passage.unicoil.dl23.txt \
   --hits 1000 --impact
@@ -1729,7 +1729,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-0shot \
+  --index msmarco-v2-passage.unicoil-0shot \
   --topics msmarco-v2-passage-dev-unicoil \
   --output run.msmarco-v2-passage.unicoil.dev.txt \
   --hits 1000 --impact
@@ -1749,7 +1749,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-0shot \
+  --index msmarco-v2-passage.unicoil-0shot \
   --topics msmarco-v2-passage-dev2-unicoil \
   --output run.msmarco-v2-passage.unicoil.dev2.txt \
   --hits 1000 --impact
@@ -1825,7 +1825,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-noexp-0shot \
+  --index msmarco-v2-passage.unicoil-noexp-0shot \
   --topics dl21 \
   --encoder castorini/unicoil-noexp-msmarco-passage \
   --output run.msmarco-v2-passage.unicoil-noexp-otf.dl21.txt \
@@ -1847,7 +1847,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-noexp-0shot \
+  --index msmarco-v2-passage.unicoil-noexp-0shot \
   --topics dl22 \
   --encoder castorini/unicoil-noexp-msmarco-passage \
   --output run.msmarco-v2-passage.unicoil-noexp-otf.dl22.txt \
@@ -1869,7 +1869,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-noexp-0shot \
+  --index msmarco-v2-passage.unicoil-noexp-0shot \
   --topics dl23 \
   --encoder castorini/unicoil-noexp-msmarco-passage \
   --output run.msmarco-v2-passage.unicoil-noexp-otf.dl23.txt \
@@ -1891,7 +1891,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-noexp-0shot \
+  --index msmarco-v2-passage.unicoil-noexp-0shot \
   --topics msmarco-v2-passage-dev \
   --encoder castorini/unicoil-noexp-msmarco-passage \
   --output run.msmarco-v2-passage.unicoil-noexp-otf.dev.txt \
@@ -1912,7 +1912,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-noexp-0shot \
+  --index msmarco-v2-passage.unicoil-noexp-0shot \
   --topics msmarco-v2-passage-dev2 \
   --encoder castorini/unicoil-noexp-msmarco-passage \
   --output run.msmarco-v2-passage.unicoil-noexp-otf.dev2.txt \
@@ -1988,7 +1988,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-0shot \
+  --index msmarco-v2-passage.unicoil-0shot \
   --topics dl21 \
   --encoder castorini/unicoil-msmarco-passage \
   --output run.msmarco-v2-passage.unicoil-otf.dl21.txt \
@@ -2010,7 +2010,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-0shot \
+  --index msmarco-v2-passage.unicoil-0shot \
   --topics dl22 \
   --encoder castorini/unicoil-msmarco-passage \
   --output run.msmarco-v2-passage.unicoil-otf.dl22.txt \
@@ -2032,7 +2032,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-0shot \
+  --index msmarco-v2-passage.unicoil-0shot \
   --topics dl23 \
   --encoder castorini/unicoil-msmarco-passage \
   --output run.msmarco-v2-passage.unicoil-otf.dl23.txt \
@@ -2054,7 +2054,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-0shot \
+  --index msmarco-v2-passage.unicoil-0shot \
   --topics msmarco-v2-passage-dev \
   --encoder castorini/unicoil-msmarco-passage \
   --output run.msmarco-v2-passage.unicoil-otf.dev.txt \
@@ -2075,7 +2075,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-unicoil-0shot \
+  --index msmarco-v2-passage.unicoil-0shot \
   --topics msmarco-v2-passage-dev2 \
   --encoder castorini/unicoil-msmarco-passage \
   --output run.msmarco-v2-passage.unicoil-otf.dev2.txt \
@@ -2151,7 +2151,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-slimr-pp-norefine-0shot \
+  --index msmarco-v2-passage.slimr-pp \
   --topics dl21 \
   --encoder castorini/slimr-pp-msmarco-passage \
   --output run.msmarco-v2-passage.slimr-pp.dl21.txt \
@@ -2173,7 +2173,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-slimr-pp-norefine-0shot \
+  --index msmarco-v2-passage.slimr-pp \
   --topics dl22 \
   --encoder castorini/slimr-pp-msmarco-passage \
   --output run.msmarco-v2-passage.slimr-pp.dl22.txt \
@@ -2195,7 +2195,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-slimr-pp-norefine-0shot \
+  --index msmarco-v2-passage.slimr-pp \
   --topics dl23 \
   --encoder castorini/slimr-pp-msmarco-passage \
   --output run.msmarco-v2-passage.slimr-pp.dl23.txt \
@@ -2217,7 +2217,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-slimr-pp-norefine-0shot \
+  --index msmarco-v2-passage.slimr-pp \
   --topics msmarco-v2-passage-dev \
   --encoder castorini/slimr-pp-msmarco-passage \
   --output run.msmarco-v2-passage.slimr-pp.dev.txt \
@@ -2238,7 +2238,7 @@ 

MS MARCO V2 Passage

python -m pyserini.search.lucene \
   --threads 16 --batch-size 128 \
-  --index msmarco-v2-passage-slimr-pp-norefine-0shot \
+  --index msmarco-v2-passage.slimr-pp \
   --topics msmarco-v2-passage-dev2 \
   --encoder castorini/slimr-pp-msmarco-passage \
   --output run.msmarco-v2-passage.slimr-pp.dev2.txt \
diff --git a/docs/prebuilt-indexes.md b/docs/prebuilt-indexes.md
index 3b564e7c3..8096ecebe 100644
--- a/docs/prebuilt-indexes.md
+++ b/docs/prebuilt-indexes.md
@@ -110,15 +110,13 @@ Detailed configuration information for the pre-built indexes are stored in [`pys
 [readme]
 
Lucene index (+docvectors) of the MS MARCO V1 passage corpus with doc2query-T5 expansions.
-
msmarco-passage-ltr -[readme] -
Lucene index of the MS MARCO passage corpus with four extra preprocessed fields for LTR. (Lucene 8) +
msmarco-v1-passage.ltr +[readme] +
Lucene index of the MS MARCO V1 passage corpus with four extra preprocessed fields for LTR. (Lucene 8)
-
msmarco-doc-per-passage-ltr -
Lucene index of the MS MARCO document per-passage corpus with four extra preprocessed fields for LTR. (Lucene 8) -
-
msmarco-document-segment-ltr -
Lucene index of the MS MARCO document segmented corpus with four extra preprocessed fields for LTR. (Lucene 8) +
msmarco-v1-doc-segmented.ltr +[readme] +
Lucene index of the MS MARCO V1 segmented document corpus with four extra preprocessed fields for LTR. (Lucene 8)
msmarco-v2-doc [readme] @@ -436,48 +434,48 @@ Detailed configuration information for the pre-built indexes are stored in [`pys
-
mrtydi-v1.1-arabic -[readme] +
mrtydi-v1.1-ar +[readme]
Lucene index for Mr.TyDi v1.1 (Arabic).
-
mrtydi-v1.1-bengali -[readme] +
mrtydi-v1.1-bn +[readme]
Lucene index for Mr.TyDi v1.1 (Bengali).
-
mrtydi-v1.1-english -[readme] +
mrtydi-v1.1-en +[readme]
Lucene index for Mr.TyDi v1.1 (English).
-
mrtydi-v1.1-finnish -[readme] +
mrtydi-v1.1-fi +[readme]
Lucene index for Mr.TyDi v1.1 (Finnish).
-
mrtydi-v1.1-indonesian -[readme] +
mrtydi-v1.1-id +[readme]
Lucene index for Mr.TyDi v1.1 (Indonesian).
-
mrtydi-v1.1-japanese -[readme] +
mrtydi-v1.1-ja +[readme]
Lucene index for Mr.TyDi v1.1 (Japanese).
-
mrtydi-v1.1-korean -[readme] +
mrtydi-v1.1-ko +[readme]
Lucene index for Mr.TyDi v1.1 (Korean).
-
mrtydi-v1.1-russian -[readme] +
mrtydi-v1.1-ru +[readme]
Lucene index for Mr.TyDi v1.1 (Russian).
-
mrtydi-v1.1-swahili -[readme] +
mrtydi-v1.1-sw +[readme]
Lucene index for Mr.TyDi v1.1 (Swahili).
-
mrtydi-v1.1-telugu -[readme] +
mrtydi-v1.1-te +[readme]
Lucene index for Mr.TyDi v1.1 (Telugu).
-
mrtydi-v1.1-thai -[readme] +
mrtydi-v1.1-th +[readme]
Lucene index for Mr.TyDi v1.1 (Thai).
diff --git a/integrations/sparse/test_lucenesearcher_check_irst.py b/integrations/sparse/test_lucenesearcher_check_irst.py index fe2a1181f..cd2d9bdd0 100644 --- a/integrations/sparse/test_lucenesearcher_check_irst.py +++ b/integrations/sparse/test_lucenesearcher_check_irst.py @@ -102,7 +102,6 @@ def test_max_aggregation_dl19(self): self.assertEqual(stderr, '') self.assertEqual(map_score, 0.3286) self.assertEqual(ndcg_score, 0.5371) - def test_max_aggregation_dl20_passage(self): # dl20 passage max diff --git a/integrations/sparse/test_lucenesearcher_check_ltr_msmarco_document.py b/integrations/sparse/test_lucenesearcher_check_ltr_msmarco_document.py index d1cf09dca..73da5b4af 100644 --- a/integrations/sparse/test_lucenesearcher_check_ltr_msmarco_document.py +++ b/integrations/sparse/test_lucenesearcher_check_ltr_msmarco_document.py @@ -44,7 +44,7 @@ def test_reranking(self): --topic tools/topics-and-qrels/topics.msmarco-doc.dev.txt \ --model ltr_test/msmarco-passage-ltr-mrr-v1/ \ --qrel tools/topics-and-qrels/qrels.msmarco-doc.dev.txt \ - --index msmarco-doc-per-passage-ltr --ibm-model ltr_test/ibm_model/ \ + --index msmarco-v1-doc-segmented.ltr --ibm-model ltr_test/ibm_model/ \ --granularity document --output ltr_test/{outp} --max-passage --hits 10000') result = subprocess.check_output(f'python tools/scripts/msmarco/msmarco_doc_eval.py --judgments tools/topics-and-qrels/qrels.msmarco-doc.dev.txt --run ltr_test/{outp}', shell=True).decode(sys.stdout.encoding) diff --git a/integrations/sparse/test_lucenesearcher_check_ltr_msmarco_passage.py b/integrations/sparse/test_lucenesearcher_check_ltr_msmarco_passage.py index acb333aa1..def199306 100644 --- a/integrations/sparse/test_lucenesearcher_check_ltr_msmarco_passage.py +++ b/integrations/sparse/test_lucenesearcher_check_ltr_msmarco_passage.py @@ -45,7 +45,7 @@ def test_reranking(self): --model ltr_test/msmarco-passage-ltr-mrr-v1 \ --topic tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \ --qrel tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt \ - --index msmarco-passage-ltr --ibm-model ltr_test/ibm_model/ \ + --index msmarco-v1-passage.ltr --ibm-model ltr_test/ibm_model/ \ --output-format tsv --output ltr_test/{outp}') result = subprocess.check_output(f'python tools/scripts/msmarco/msmarco_passage_eval.py tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt ltr_test/{outp}', shell=True).decode(sys.stdout.encoding) a,b = result.find('#####################\nMRR @10:'), result.find('\nQueriesRanked: 6980\n#####################\n') diff --git a/pyserini/2cr/mrtydi.yaml b/pyserini/2cr/mrtydi.yaml index 174d81cd3..5ca1c7e7c 100644 --- a/pyserini/2cr/mrtydi.yaml +++ b/pyserini/2cr/mrtydi.yaml @@ -714,7 +714,7 @@ conditions: # BM25 - name: bm25.ar eval_key: mrtydi-v1.1-arabic - command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language ar --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic --output $output --bm25 --hits 100 + command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language ar --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-ar --output $output --bm25 --hits 100 splits: - split: train scores: @@ -730,7 +730,7 @@ conditions: R@100: 0.7928 - name: bm25.bn eval_key: mrtydi-v1.1-bengali - command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language bn --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali --output $output --bm25 --hits 100 + command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language bn --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bn --output $output --bm25 --hits 100 splits: - split: train scores: @@ -746,7 +746,7 @@ conditions: - R@100: 0.8694 - name: bm25.en eval_key: mrtydi-v1.1-english - command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language en --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english --output $output --bm25 --hits 100 + command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language en --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-en --output $output --bm25 --hits 100 splits: - split: train scores: @@ -762,7 +762,7 @@ conditions: - R@100: 0.5365 - name: bm25.fi eval_key: mrtydi-v1.1-finnish - command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language fi --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish --output $output --bm25 --hits 100 + command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language fi --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-fi --output $output --bm25 --hits 100 splits: - split: train scores: @@ -778,7 +778,7 @@ conditions: - R@100: 0.7196 - name: bm25.id eval_key: mrtydi-v1.1-indonesian - command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language id --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian --output $output --bm25 --hits 100 + command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language id --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-id --output $output --bm25 --hits 100 splits: - split: train scores: @@ -794,7 +794,7 @@ conditions: - R@100: 0.8426 - name: bm25.ja eval_key: mrtydi-v1.1-japanese - command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language ja --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese --output $output --bm25 --hits 100 + command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language ja --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-ja --output $output --bm25 --hits 100 splits: - split: train scores: @@ -810,7 +810,7 @@ conditions: - R@100: 0.6431 - name: bm25.ko eval_key: mrtydi-v1.1-korean - command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language ko --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean --output $output --bm25 --hits 100 + command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language ko --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-ko --output $output --bm25 --hits 100 splits: - split: train scores: @@ -826,7 +826,7 @@ conditions: - R@100: 0.6188 - name: bm25.ru eval_key: mrtydi-v1.1-russian - command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language ru --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian --output $output --bm25 --hits 100 + command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language ru --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-ru --output $output --bm25 --hits 100 splits: - split: train scores: @@ -842,7 +842,7 @@ conditions: - R@100: 0.6541 - name: bm25.sw eval_key: mrtydi-v1.1-swahili - command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language sw --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili --output $output --bm25 --hits 100 + command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language sw --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-sw --output $output --bm25 --hits 100 splits: - split: train scores: @@ -858,7 +858,7 @@ conditions: - R@100: 0.7642 - name: bm25.te eval_key: mrtydi-v1.1-telugu - command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language te --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu --output $output --bm25 --hits 100 + command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language te --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-te --output $output --bm25 --hits 100 splits: - split: train scores: @@ -874,7 +874,7 @@ conditions: - R@100: 0.8971 - name: bm25.th eval_key: mrtydi-v1.1-thai - command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language th --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai --output $output --bm25 --hits 100 + command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --language th --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-th --output $output --bm25 --hits 100 splits: - split: train scores: diff --git a/pyserini/prebuilt_index_info.py b/pyserini/prebuilt_index_info.py index 1c429b23f..a30875e9c 100644 --- a/pyserini/prebuilt_index_info.py +++ b/pyserini/prebuilt_index_info.py @@ -238,48 +238,34 @@ }, # MS MARCO V1 indexes for LTR experiments. - "msmarco-passage-ltr": { - "description": "Lucene index of the MS MARCO passage corpus with four extra preprocessed fields for LTR. (Lucene 8)", - "filename": "index-msmarco-passage-ltr-20210519-e25e33f.tar.gz", - "readme": "index-msmarco-passage-ltr-20210519-e25e33f-readme.txt", + "msmarco-v1-passage.ltr": { + "description": "Lucene index of the MS MARCO V1 passage corpus with four extra preprocessed fields for LTR. (Lucene 8)", + "filename": "lucene-inverted.msmarco-v1-passage.ltr.20210519.e25e33f.tar.gz", + "readme": "lucene-inverted.msmarco-v1-passage.ltr.20210519.e25e33f.README.txt", "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/index-msmarco-passage-ltr-20210519-e25e33f.tar.gz", - "https://vault.cs.uwaterloo.ca/s/8qFCaCtwabRfYQD/download" + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.msmarco-v1-passage.ltr.20210519.e25e33f.tar.gz" ], - "md5": "a5de642c268ac1ed5892c069bdc29ae3", - "size compressed (bytes)": 14073966046, + "md5": "5da425ca44d2e3e5c38a7f564f13ad23", + "size compressed (bytes)": 14073966165, "total_terms": 352316036, "documents": 8841823, "unique_terms": 2660824, "downloaded": False }, - "msmarco-doc-per-passage-ltr": { - "description": "Lucene index of the MS MARCO document per-passage corpus with four extra preprocessed fields for LTR. (Lucene 8)", - "filename": "index-msmarco-doc-per-passage-ltr-20211031-33e4151.tar.gz", + "msmarco-v1-doc-segmented.ltr": { + "description": "Lucene index of the MS MARCO V1 segmented document corpus with four extra preprocessed fields for LTR. (Lucene 8)", + "filename": "lucene-inverted.msmarco-v1-doc-segmented.ltr.20211031.33e4151.tar.gz", + "readme": "lucene-inverted.msmarco-v1-doc-segmented.ltr.20211031.33e4151.README.txt", "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/index-msmarco-doc-per-passage-ltr-20211031-33e4151.tar.gz", - "https://vault.cs.uwaterloo.ca/s/kNdXMWXEsTt3fT8/download" + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.msmarco-v1-doc-segmented.ltr.20211031.33e4151.tar.gz", ], - "md5": "bd60e89041b4ebbabc4bf0cfac608a87", - "size compressed (bytes)": 45835520960, + "md5": "86f108d8441b6845f8caf1208dd7ac7a", + "size compressed (bytes)": 45835523273, "total_terms": 1232004740, "documents": 20545628, "unique_terms": 10123678, "downloaded": False }, - "msmarco-document-segment-ltr": { - "description": "Lucene index of the MS MARCO document segmented corpus with four extra preprocessed fields for LTR. (Lucene 8)", - "filename": "lucene-index.msmarco-doc-segmented.ibm.tar.gz", - "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-doc-segmented.ibm.tar.gz" - ], - "md5": "13064bdaf8e8a79222634d67ecd3ddb5", - "size compressed (bytes)": 98984853515, - "total_terms": 3197500226, - "documents": 20532330, - "unique_terms": -1, - "downloaded": False - }, # MS MARCO V2 document corpus, three indexes with different amounts of information (and sizes). "msmarco-v2-doc": { @@ -1404,155 +1390,155 @@ } TF_INDEX_INFO_MRTYDI = { - "mrtydi-v1.1-arabic": { + "mrtydi-v1.1-ar": { "description": "Lucene index for Mr.TyDi v1.1 (Arabic).", - "filename": "lucene-index.mrtydi-v1.1-arabic.20220928.b5ecc5.tar.gz", - "readme": "lucene-index.mrtydi-v1.1-arabic.20220928.b5ecc5.README.md", + "filename": "lucene-inverted.mrtydi-v1.1-ar.20220928.b5ecc5.tar.gz", + "readme": "lucene-inverted.mrtydi-v1.1.20220928.b5ecc5.README.md", "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-arabic.20220928.b5ecc5.tar.gz", + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.mrtydi-v1.1-ar.20220928.b5ecc5.tar.gz", ], - "md5": "efff40a2548f759eb8b0e47e0622685b", - "size compressed (bytes)": 1420441600, + "md5": "eae9ed3430eeb685328c92be886197d3", + "size compressed (bytes)": 1166813697, "total_terms": 92529032, "documents": 2106586, "unique_terms": 1284748, "downloaded": False }, - "mrtydi-v1.1-bengali": { + "mrtydi-v1.1-bn": { "description": "Lucene index for Mr.TyDi v1.1 (Bengali).", - "filename": "lucene-index.mrtydi-v1.1-bengali.20220928.b5ecc5.tar.gz", - "readme": "lucene-index.mrtydi-v1.1-bengali.20220928.b5ecc5.README.md", + "filename": "lucene-inverted.mrtydi-v1.1-bn.20220928.b5ecc5.tar.gz", + "readme": "lucene-inverted.mrtydi-v1.1.20220928.b5ecc5.README.md", "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-bengali.20220928.b5ecc5.tar.gz" + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.mrtydi-v1.1-bn.20220928.b5ecc5.tar.gz" ], - "md5": "6ed844c8f17b2f041fba7c5676d3fb42", - "size compressed (bytes)": 294942720, + "md5": "4fdb127c48c311851875d884ccf9d3e7", + "size compressed (bytes)": 240163624, "total_terms": 15236599, "documents": 304059, "unique_terms": 520699, "downloaded": False }, - "mrtydi-v1.1-english": { + "mrtydi-v1.1-en": { "description": "Lucene index for Mr.TyDi v1.1 (English).", - "filename": "lucene-index.mrtydi-v1.1-english.20220928.b5ecc5.tar.gz", - "readme": "lucene-index.mrtydi-v1.1-english.20220928.b5ecc5.README.md", + "filename": "lucene-inverted.mrtydi-v1.1-en.20220928.b5ecc5.tar.gz", + "readme": "lucene-inverted.mrtydi-v1.1.20220928.b5ecc5.README.md", "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-english.20220928.b5ecc5.tar.gz" + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.mrtydi-v1.1-en.20220928.b5ecc5.tar.gz" ], - "md5": "e6b0a2531d958c3d1a65634dc315b0ab", - "size compressed (bytes)": 20566118400, + "md5": "98eb2e28b120c9fac2aa43fa370d0d27", + "size compressed (bytes)": 16732696856, "total_terms": 1507060932, "documents": 32907100, "unique_terms": -1, "downloaded": False }, - "mrtydi-v1.1-finnish": { + "mrtydi-v1.1-fi": { "description": "Lucene index for Mr.TyDi v1.1 (Finnish).", - "filename": "lucene-index.mrtydi-v1.1-finnish.20220928.b5ecc5.tar.gz", - "readme": "lucene-index.mrtydi-v1.1-finnish.20220928.b5ecc5.README.md", + "filename": "lucene-inverted.mrtydi-v1.1-fi.20220928.b5ecc5.tar.gz", + "readme": "lucene-inverted.mrtydi-v1.1.20220928.b5ecc5.README.md", "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-finnish.20220928.b5ecc5.tar.gz" + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.mrtydi-v1.1-fi.20220928.b5ecc5.tar.gz" ], - "md5": "0f464c022447eed5431157f0b2feb0b3", - "size compressed (bytes)": 1116272640, + "md5": "c5546562c77bd28bb1ab68ca24c5b37a", + "size compressed (bytes)": 906509667, "total_terms": 69416543, "documents": 1908757, "unique_terms": 1715076, "downloaded": False }, - "mrtydi-v1.1-indonesian": { + "mrtydi-v1.1-id": { "description": "Lucene index for Mr.TyDi v1.1 (Indonesian).", - "filename": "lucene-index.mrtydi-v1.1-indonesian.20220928.b5ecc5.tar.gz", - "readme": "lucene-index.mrtydi-v1.1-indonesian.20220928.b5ecc5.README.md", + "filename": "lucene-inverted.mrtydi-v1.1-id.20220928.b5ecc5.tar.gz", + "readme": "lucene-inverted.mrtydi-v1.1.20220928.b5ecc5.README.md", "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-indonesian.20220928.b5ecc5.tar.gz" + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.mrtydi-v1.1-id.20220928.b5ecc5.tar.gz" ], - "md5": "345d43a2443786a3394a93a6f7ef77b7", - "size compressed (bytes)": 698388480, + "md5": "551b282737fd885b1904ea365768c066", + "size compressed (bytes)": 562187620, "total_terms": 52493134, "documents": 1469399, "unique_terms": 942552, "downloaded": False }, - "mrtydi-v1.1-japanese": { + "mrtydi-v1.1-ja": { "description": "Lucene index for Mr.TyDi v1.1 (Japanese).", - "filename": "lucene-index.mrtydi-v1.1-japanese.20220928.b5ecc5.tar.gz", - "readme": "lucene-index.mrtydi-v1.1-japanese.20220928.b5ecc5.README.md", + "filename": "lucene-inverted.mrtydi-v1.1-ja.20220928.b5ecc5.tar.gz", + "readme": "lucene-inverted.mrtydi-v1.1.20220928.b5ecc5.README.md", "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-japanese.20220928.b5ecc5.tar.gz" + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.mrtydi-v1.1-ja.20220928.b5ecc5.tar.gz" ], - "md5": "5f0802c1257c325a3e25c58523dba841", - "size compressed (bytes)": 4333844480, + "md5": "6f5062b65c69ce37c8a4e76d08e6d5c4", + "size compressed (bytes)": 3637126847, "total_terms": 300761975, "documents": 7000027, "unique_terms": 1588879, "downloaded": False }, - "mrtydi-v1.1-korean": { + "mrtydi-v1.1-ko": { "description": "Lucene index for Mr.TyDi v1.1 (Korean).", - "filename": "lucene-index.mrtydi-v1.1-korean.20220928.b5ecc5.tar.gz", - "readme": "lucene-index.mrtydi-v1.1-korean.20220928.b5ecc5.README.md", + "filename": "lucene-inverted.mrtydi-v1.1-ko.20220928.b5ecc5.tar.gz", + "readme": "lucene-inverted.mrtydi-v1.1.20220928.b5ecc5.README.md", "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-korean.20220928.b5ecc5.tar.gz" + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.mrtydi-v1.1-ko.20220928.b5ecc5.tar.gz" ], - "md5": "4277f406b138c46edf7c17e4248f3b2e", - "size compressed (bytes)": 1349109760, + "md5": "a52bfe4be87fd3178ca231b007e7af38", + "size compressed (bytes)": 1137658449, "total_terms": 122217295, "documents": 1496126, "unique_terms": 1517179, "downloaded": False }, - "mrtydi-v1.1-russian": { + "mrtydi-v1.1-ru": { "description": "Lucene index for Mr.TyDi v1.1 (Russian).", - "filename": "lucene-index.mrtydi-v1.1-russian.20220928.b5ecc5.tar.gz", - "readme": "lucene-index.mrtydi-v1.1-russian.20220928.b5ecc5.README.md", + "filename": "lucene-inverted.mrtydi-v1.1-ru.20220928.b5ecc5.tar.gz", + "readme": "lucene-inverted.mrtydi-v1.1.20220928.b5ecc5.README.md", "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-russian.20220928.b5ecc5.tar.gz" + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.mrtydi-v1.1-ru.20220928.b5ecc5.tar.gz" ], - "md5": "d5837fee29c60c7a3a24cfd598056038", - "size compressed (bytes)": 6864660480, + "md5": "dc1d75d31595252e3a6b665b522adf3b", + "size compressed (bytes)": 5642484260, "total_terms": 346329117, "documents": 9597504, "unique_terms": 3034240, "downloaded": False }, - "mrtydi-v1.1-swahili": { + "mrtydi-v1.1-sw": { "description": "Lucene index for Mr.TyDi v1.1 (Swahili).", - "filename": "lucene-index.mrtydi-v1.1-swahili.20220928.b5ecc5.tar.gz", - "readme": "lucene-index.mrtydi-v1.1-swahili.20220928.b5ecc5.README.md", + "filename": "lucene-inverted.mrtydi-v1.1-sw.20220928.b5ecc5.tar.gz", + "readme": "lucene-inverted.mrtydi-v1.1.20220928.b5ecc5.README.md", "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-swahili.20220928.b5ecc5.tar.gz" + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.mrtydi-v1.1-sw.20220928.b5ecc5.tar.gz" ], - "md5": "bebff76ec6dfe76c904604f8ed1bcd3e", - "size compressed (bytes)": 59607040, + "md5": "4514e4a3f35279408a2e0aea3051dce0", + "size compressed (bytes)": 47557469, "total_terms": 4937051, "documents": 136689, "unique_terms": 385711, "downloaded": False }, - "mrtydi-v1.1-telugu": { + "mrtydi-v1.1-te": { "description": "Lucene index for Mr.TyDi v1.1 (Telugu).", - "filename": "lucene-index.mrtydi-v1.1-telugu.20220928.b5ecc5.tar.gz", - "readme": "lucene-index.mrtydi-v1.1-telugu.20220928.b5ecc5.README.md", + "filename": "lucene-inverted.mrtydi-v1.1-te.20220928.b5ecc5.tar.gz", + "readme": "lucene-inverted.mrtydi-v1.1.20220928.b5ecc5.README.md", "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-telugu.20220928.b5ecc5.tar.gz" + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.mrtydi-v1.1-te.20220928.b5ecc5.tar.gz" ], - "md5": "89f8b280cacbdc27e90bb1ea40029c21", - "size compressed (bytes)": 519157760, + "md5": "4bd62ef02febb49a487e765c023507de", + "size compressed (bytes)": 413881826, "total_terms": 26812052, "documents": 548224, "unique_terms": 1157217, "downloaded": False }, - "mrtydi-v1.1-thai": { + "mrtydi-v1.1-th": { "description": "Lucene index for Mr.TyDi v1.1 (Thai).", - "filename": "lucene-index.mrtydi-v1.1-thai.20220928.b5ecc5.tar.gz", - "readme": "lucene-index.mrtydi-v1.1-thai.20220928.b5ecc5.README.md", + "filename": "lucene-inverted.mrtydi-v1.1-th.20220928.b5ecc5.tar.gz", + "readme": "lucene-inverted.mrtydi-v1.1.20220928.b5ecc5.README.md", "urls": [ - "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-thai.20220928.b5ecc5.tar.gz" + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.mrtydi-v1.1-th.20220928.b5ecc5.tar.gz" ], - "md5": "047152fc6bc1b5c5d945f38b23de971e", - "size compressed (bytes)": 546201600, + "md5": "d040c15b460d488571eb959192335771", + "size compressed (bytes)": 450209352, "total_terms": 31550936, "documents": 568855, "unique_terms": 663628, @@ -1560,6 +1546,20 @@ } } +TF_INDEX_INFO_MRTYDI_ALIASES = { + "mrtydi-v1.1-arabic": TF_INDEX_INFO_MRTYDI["mrtydi-v1.1-ar"], + "mrtydi-v1.1-bengali": TF_INDEX_INFO_MRTYDI["mrtydi-v1.1-bn"], + "mrtydi-v1.1-english": TF_INDEX_INFO_MRTYDI["mrtydi-v1.1-en"], + "mrtydi-v1.1-finnish": TF_INDEX_INFO_MRTYDI["mrtydi-v1.1-fi"], + "mrtydi-v1.1-indonesian": TF_INDEX_INFO_MRTYDI["mrtydi-v1.1-id"], + "mrtydi-v1.1-japanese": TF_INDEX_INFO_MRTYDI["mrtydi-v1.1-ja"], + "mrtydi-v1.1-korean": TF_INDEX_INFO_MRTYDI["mrtydi-v1.1-ko"], + "mrtydi-v1.1-russian": TF_INDEX_INFO_MRTYDI["mrtydi-v1.1-ru"], + "mrtydi-v1.1-swahili": TF_INDEX_INFO_MRTYDI["mrtydi-v1.1-sw"], + "mrtydi-v1.1-telugu": TF_INDEX_INFO_MRTYDI["mrtydi-v1.1-te"], + "mrtydi-v1.1-thai": TF_INDEX_INFO_MRTYDI["mrtydi-v1.1-th"] +} + TF_INDEX_INFO_MIRACL = { "miracl-v1.0-ar": { "description": "Lucene index for MIRACL v1.0 (Arabic).", @@ -2498,6 +2498,7 @@ **TF_INDEX_INFO_MSMARCO_ALIASES, **TF_INDEX_INFO_BEIR, **TF_INDEX_INFO_MRTYDI, + **TF_INDEX_INFO_MRTYDI_ALIASES, **TF_INDEX_INFO_MIRACL, **TF_INDEX_INFO_CIRAL, **TF_INDEX_INFO_OTHER, @@ -2790,7 +2791,9 @@ IMPACT_INDEX_INFO_MSMARCO_ALIASES = { # To preserve working commands in published papers: integrations/papers/test_sigir2022.py testcase test_Trotman_etal - "msmarco-passage-unicoil-d2q": IMPACT_INDEX_INFO_MSMARCO["msmarco-v1-passage.unicoil"] + "msmarco-passage-unicoil-d2q": IMPACT_INDEX_INFO_MSMARCO["msmarco-v1-passage.unicoil"], + # To preserve working commands in published papers: integrations/papers/test_sigir2022.py testcase test_Ma_etal_section4_1b + "msmarco-v2-passage-unicoil-0shot": IMPACT_INDEX_INFO_MSMARCO["msmarco-v2-passage.unicoil-0shot"] } IMPACT_INDEX_INFO_BEIR = { diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-arabic.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-arabic.20220108.6fcb89.README.md deleted file mode 100644 index 0b3e01f26..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-arabic.20220108.6fcb89.README.md +++ /dev/null @@ -1,13 +0,0 @@ -# mrtydi-v1.1-arabic - -Lucene index for Mr.TyDi v1.1 (Arabic). - -This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: - -``` -target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ - -generator DefaultLuceneDocumentGenerator -threads 1 \ - -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-arabic/ \ - -index indexes/lucene-index.mrtydi-v1.1-arabic.20220108.6fcb89/ \ - -storePositions -storeDocvectors -storeRaw -optimize -language ar -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-arabic.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-arabic.20220928.b5ecc5.README.md deleted file mode 100644 index 451c03a78..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-arabic.20220928.b5ecc5.README.md +++ /dev/null @@ -1,17 +0,0 @@ -# mrtydi-v1.1-arabic - -Lucene index for Mr.TyDi v1.1 (Arabic). - -This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: - -``` -lang=arabic -abbr=ar - -target/appassembler/bin/IndexCollection \ - -collection MrTyDiCollection \ - -input MrTyDi/miracl-corpus-v1.0-$lang \ - -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ - -generator DefaultLuceneDocumentGenerator \ - -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-bengali.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-bengali.20220108.6fcb89.README.md deleted file mode 100644 index 8d6565e9d..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-bengali.20220108.6fcb89.README.md +++ /dev/null @@ -1,13 +0,0 @@ -# mrtydi-v1.1-bengali - -Lucene index for Mr.TyDi v1.1 (Bengali). - -This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: - -``` -target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ - -generator DefaultLuceneDocumentGenerator -threads 1 \ - -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-bengali/ \ - -index indexes/lucene-index.mrtydi-v1.1-bengali.20220108.6fcb89/ \ - -storePositions -storeDocvectors -storeRaw -optimize -language bn -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-bengali.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-bengali.20220928.b5ecc5.README.md deleted file mode 100644 index a7a1e252a..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-bengali.20220928.b5ecc5.README.md +++ /dev/null @@ -1,17 +0,0 @@ -# mrtydi-v1.1-bengali - -Lucene index for Mr.TyDi v1.1 (Bengali). - -This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: - -``` -lang=bengali -abbr=bn - -target/appassembler/bin/IndexCollection \ - -collection MrTyDiCollection \ - -input MrTyDi/miracl-corpus-v1.0-$lang \ - -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ - -generator DefaultLuceneDocumentGenerator \ - -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-english.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-english.20220108.6fcb89.README.md deleted file mode 100644 index 8c4f02f19..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-english.20220108.6fcb89.README.md +++ /dev/null @@ -1,13 +0,0 @@ -# mrtydi-v1.1-english - -Lucene index for Mr.TyDi v1.1 (English). - -This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: - -``` -target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ - -generator DefaultLuceneDocumentGenerator -threads 1 \ - -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-english/ \ - -index indexes/lucene-index.mrtydi-v1.1-english.20220108.6fcb89/ \ - -storePositions -storeDocvectors -storeRaw -optimize -language en -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-english.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-english.20220928.b5ecc5.README.md deleted file mode 100644 index b664d41a3..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-english.20220928.b5ecc5.README.md +++ /dev/null @@ -1,17 +0,0 @@ -# mrtydi-v1.1-english - -Lucene index for Mr.TyDi v1.1 (English). - -This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: - -``` -lang=english -abbr=en - -target/appassembler/bin/IndexCollection \ - -collection MrTyDiCollection \ - -input MrTyDi/miracl-corpus-v1.0-$lang \ - -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ - -generator DefaultLuceneDocumentGenerator \ - -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-finnish.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-finnish.20220108.6fcb89.README.md deleted file mode 100644 index 10b161844..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-finnish.20220108.6fcb89.README.md +++ /dev/null @@ -1,13 +0,0 @@ -# mrtydi-v1.1-finnish - -Lucene index for Mr.TyDi v1.1 (Finnish). - -This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: - -``` -target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ - -generator DefaultLuceneDocumentGenerator -threads 1 \ - -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-finnish/ \ - -index indexes/lucene-index.mrtydi-v1.1-finnish.20220108.6fcb89/ \ - -storePositions -storeDocvectors -storeRaw -optimize -language fi -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-finnish.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-finnish.20220928.b5ecc5.README.md deleted file mode 100644 index 156c59490..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-finnish.20220928.b5ecc5.README.md +++ /dev/null @@ -1,17 +0,0 @@ -# mrtydi-v1.1-finnish - -Lucene index for Mr.TyDi v1.1 (Finnish). - -This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: - -``` -lang=finnish -abbr=fi - -target/appassembler/bin/IndexCollection \ - -collection MrTyDiCollection \ - -input MrTyDi/miracl-corpus-v1.0-$lang \ - -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ - -generator DefaultLuceneDocumentGenerator \ - -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-indonesian.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-indonesian.20220108.6fcb89.README.md deleted file mode 100644 index 13570f274..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-indonesian.20220108.6fcb89.README.md +++ /dev/null @@ -1,13 +0,0 @@ -# mrtydi-v1.1-indonesian - -Lucene index for Mr.TyDi v1.1 (Indonesian). - -This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: - -``` -target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ - -generator DefaultLuceneDocumentGenerator -threads 1 \ - -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-indonesian/ \ - -index indexes/lucene-index.mrtydi-v1.1-indonesian.20220108.6fcb89/ \ - -storePositions -storeDocvectors -storeRaw -optimize -language id -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-indonesian.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-indonesian.20220928.b5ecc5.README.md deleted file mode 100644 index 0a8b36fe6..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-indonesian.20220928.b5ecc5.README.md +++ /dev/null @@ -1,17 +0,0 @@ -# mrtydi-v1.1-indonesian - -Lucene index for Mr.TyDi v1.1 (Indonesian). - -This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: - -``` -lang=indonesian -abbr=id - -target/appassembler/bin/IndexCollection \ - -collection MrTyDiCollection \ - -input MrTyDi/miracl-corpus-v1.0-$lang \ - -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ - -generator DefaultLuceneDocumentGenerator \ - -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-japanese.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-japanese.20220108.6fcb89.README.md deleted file mode 100644 index a78a572a0..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-japanese.20220108.6fcb89.README.md +++ /dev/null @@ -1,13 +0,0 @@ -# mrtydi-v1.1-japanese - -Lucene index for Mr.TyDi v1.1 (Japanese). - -This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: - -``` -target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ - -generator DefaultLuceneDocumentGenerator -threads 1 \ - -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-japanese/ \ - -index indexes/lucene-index.mrtydi-v1.1-japanese.20220108.6fcb89/ \ - -storePositions -storeDocvectors -storeRaw -optimize -language ja -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-japanese.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-japanese.20220928.b5ecc5.README.md deleted file mode 100644 index 3fc37ed50..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-japanese.20220928.b5ecc5.README.md +++ /dev/null @@ -1,17 +0,0 @@ -# mrtydi-v1.1-japanese - -Lucene index for Mr.TyDi v1.1 (Japanese). - -This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: - -``` -lang=japanese -abbr=ja - -target/appassembler/bin/IndexCollection \ - -collection MrTyDiCollection \ - -input MrTyDi/miracl-corpus-v1.0-$lang \ - -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ - -generator DefaultLuceneDocumentGenerator \ - -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-korean.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-korean.20220108.6fcb89.README.md deleted file mode 100644 index 3810f6261..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-korean.20220108.6fcb89.README.md +++ /dev/null @@ -1,13 +0,0 @@ -# mrtydi-v1.1-korean - -Lucene index for Mr.TyDi v1.1 (Korean). - -This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: - -``` -target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ - -generator DefaultLuceneDocumentGenerator -threads 1 \ - -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-korean/ \ - -index indexes/lucene-index.mrtydi-v1.1-korean.20220108.6fcb89/ \ - -storePositions -storeDocvectors -storeRaw -optimize -language ko -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-korean.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-korean.20220928.b5ecc5.README.md deleted file mode 100644 index ed106fd28..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-korean.20220928.b5ecc5.README.md +++ /dev/null @@ -1,17 +0,0 @@ -# mrtydi-v1.1-korean - -Lucene index for Mr.TyDi v1.1 (Korean). - -This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: - -``` -lang=korean -abbr=ko - -target/appassembler/bin/IndexCollection \ - -collection MrTyDiCollection \ - -input MrTyDi/miracl-corpus-v1.0-$lang \ - -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ - -generator DefaultLuceneDocumentGenerator \ - -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-russian.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-russian.20220108.6fcb89.README.md deleted file mode 100644 index 7df1ccbf9..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-russian.20220108.6fcb89.README.md +++ /dev/null @@ -1,13 +0,0 @@ -# mrtydi-v1.1-russian - -Lucene index for Mr.TyDi v1.1 (Russian). - -This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: - -``` -target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ - -generator DefaultLuceneDocumentGenerator -threads 1 \ - -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-russian/ \ - -index indexes/lucene-index.mrtydi-v1.1-russian.20220108.6fcb89/ \ - -storePositions -storeDocvectors -storeRaw -optimize -language ru -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-russian.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-russian.20220928.b5ecc5.README.md deleted file mode 100644 index 3ee7340f3..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-russian.20220928.b5ecc5.README.md +++ /dev/null @@ -1,17 +0,0 @@ -# mrtydi-v1.1-russian - -Lucene index for Mr.TyDi v1.1 (Russian). - -This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: - -``` -lang=russian -abbr=ru - -target/appassembler/bin/IndexCollection \ - -collection MrTyDiCollection \ - -input MrTyDi/miracl-corpus-v1.0-$lang \ - -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ - -generator DefaultLuceneDocumentGenerator \ - -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-swahili.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-swahili.20220108.6fcb89.README.md deleted file mode 100644 index 5d7fd59cf..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-swahili.20220108.6fcb89.README.md +++ /dev/null @@ -1,16 +0,0 @@ -# mrtydi-v1.1-swahili - -Lucene index for Mr.TyDi v1.1 (Swahili). - -This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: - -``` -target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ - -generator DefaultLuceneDocumentGenerator -threads 1 \ - -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-swahili/ \ - -index indexes/lucene-index.mrtydi-v1.1-swahili.20220108.6fcb89/ \ - -storePositions -storeDocvectors -storeRaw -optimize -pretokenized -``` - -Note that `-language sw` gives identical results (and is more semantically accurate) but since we do not have a language-specific tokenizer here, we just use the whitespace tokenizer, which is what `-pretokenized` uses. -This index was built based on Anserini regressions at the time; see [Anserini #1727](https://github.com/castorini/anserini/pull/1727). \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-swahili.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-swahili.20220928.b5ecc5.README.md deleted file mode 100644 index 02e7bb875..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-swahili.20220928.b5ecc5.README.md +++ /dev/null @@ -1,17 +0,0 @@ -# mrtydi-v1.1-swahili - -Lucene index for Mr.TyDi v1.1 (Swahili). - -This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: - -``` -lang=swahili -abbr=sw - -target/appassembler/bin/IndexCollection \ - -collection MrTyDiCollection \ - -input MrTyDi/miracl-corpus-v1.0-$lang \ - -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ - -generator DefaultLuceneDocumentGenerator \ - -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-telugu.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-telugu.20220108.6fcb89.README.md deleted file mode 100644 index eedd6e194..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-telugu.20220108.6fcb89.README.md +++ /dev/null @@ -1,16 +0,0 @@ -# mrtydi-v1.1-telugu - -Lucene index for Mr.TyDi v1.1 (Telugu). - -This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: - -``` -target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ - -generator DefaultLuceneDocumentGenerator -threads 1 \ - -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-telugu/ \ - -index indexes/lucene-index.mrtydi-v1.1-telugu.20220108.6fcb89/ \ - -storePositions -storeDocvectors -storeRaw -optimize -pretokenized -``` - -Note that `-language te` gives identical results (and is more semantically accurate) but since we do not have a language-specific tokenizer here, we just use the whitespace tokenizer, which is what `-pretokenized` uses. -This index was built based on Anserini regressions at the time; see [Anserini #1727](https://github.com/castorini/anserini/pull/1727). \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-telugu.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-telugu.20220928.b5ecc5.README.md deleted file mode 100644 index 2f4bcd9b0..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-telugu.20220928.b5ecc5.README.md +++ /dev/null @@ -1,17 +0,0 @@ -# mrtydi-v1.1-telugu - -Lucene index for Mr.TyDi v1.1 (Telugu). - -This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: - -``` -lang=telugu -abbr=te - -target/appassembler/bin/IndexCollection \ - -collection MrTyDiCollection \ - -input MrTyDi/miracl-corpus-v1.0-$lang \ - -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ - -generator DefaultLuceneDocumentGenerator \ - -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-thai.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-thai.20220108.6fcb89.README.md deleted file mode 100644 index 296e30f8a..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-thai.20220108.6fcb89.README.md +++ /dev/null @@ -1,13 +0,0 @@ -# mrtydi-v1.1-thai - -Lucene index for Mr.TyDi v1.1 (Thai). - -This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: - -``` -target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ - -generator DefaultLuceneDocumentGenerator -threads 1 \ - -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-thai/ \ - -index indexes/lucene-index.mrtydi-v1.1-thai.20220108.6fcb89/ \ - -storePositions -storeDocvectors -storeRaw -optimize -language th -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-thai.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-thai.20220928.b5ecc5.README.md deleted file mode 100644 index 5a23bafff..000000000 --- a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-thai.20220928.b5ecc5.README.md +++ /dev/null @@ -1,17 +0,0 @@ -# mrtydi-v1.1-thai - -Lucene index for Mr.TyDi v1.1 (Thai). - -This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: - -``` -lang=thai -abbr=th - -target/appassembler/bin/IndexCollection \ - -collection MrTyDiCollection \ - -input MrTyDi/miracl-corpus-v1.0-$lang \ - -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ - -generator DefaultLuceneDocumentGenerator \ - -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr -``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-inverted.mrtydi-v1.1.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-inverted.mrtydi-v1.1.20220928.b5ecc5.README.md new file mode 100644 index 000000000..3430111f6 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-inverted.mrtydi-v1.1.20220928.b5ecc5.README.md @@ -0,0 +1,160 @@ +# mrtydi-v1.1 + +The following indexes were built on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca`. + +At the time each index was built, the full name of the language was used. +In May 2024, as part of repackaging indexes to adopt a more consistent naming scheme, the indexes were renamed to use standard two-letter language codes (e.g., `mrtydi-v1.1-ar` instead of `mrtydi-v1.1-arabic`). + +**mrtydi-v1.1-arabic**: Lucene index for Mr.TyDi v1.1 (Arabic). + +``` +lang=arabic +abbr=ar + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` + +**mrtydi-v1.1-bengali**: Lucene index for Mr.TyDi v1.1 (Bengali). + +``` +lang=bengali +abbr=bn + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` + +**mrtydi-v1.1-english**: Lucene index for Mr.TyDi v1.1 (English). + +``` +lang=english +abbr=en + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` + +**mrtydi-v1.1-finnish**: Lucene index for Mr.TyDi v1.1 (Finnish). + +``` +lang=finnish +abbr=fi + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` + +**mrtydi-v1.1-indonesian**: Lucene index for Mr.TyDi v1.1 (Indonesian). + +``` +lang=indonesian +abbr=id + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` + +**mrtydi-v1.1-japanese**: Lucene index for Mr.TyDi v1.1 (Japanese). + +``` +lang=japanese +abbr=ja + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` + +**mrtydi-v1.1-korean**: Lucene index for Mr.TyDi v1.1 (Korean). + +``` +lang=korean +abbr=ko + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` + +**mrtydi-v1.1-russian**: Lucene index for Mr.TyDi v1.1 (Russian). + +``` +lang=russian +abbr=ru + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` + +**mrtydi-v1.1-swahili**: Lucene index for Mr.TyDi v1.1 (Swahili). + +``` +lang=swahili +abbr=sw + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` + +**mrtydi-v1.1-telugu**: Lucene index for Mr.TyDi v1.1 (Telugu). + +``` +lang=telugu +abbr=te + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` + +**mrtydi-v1.1-thai**: Lucene index for Mr.TyDi v1.1 (Thai). + +``` +lang=thai +abbr=th + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` diff --git a/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-ltr-readme.txt b/pyserini/resources/index-metadata/lucene-inverted.msmarco-v1-doc-segmented.ltr.20211031.33e4151.README.txt similarity index 100% rename from pyserini/resources/index-metadata/index-msmarco-doc-per-passage-ltr-readme.txt rename to pyserini/resources/index-metadata/lucene-inverted.msmarco-v1-doc-segmented.ltr.20211031.33e4151.README.txt diff --git a/pyserini/resources/index-metadata/index-msmarco-passage-ltr-20210519-e25e33f-readme.txt b/pyserini/resources/index-metadata/lucene-inverted.msmarco-v1-passage.ltr.20210519.e25e33f.README.txt similarity index 100% rename from pyserini/resources/index-metadata/index-msmarco-passage-ltr-20210519-e25e33f-readme.txt rename to pyserini/resources/index-metadata/lucene-inverted.msmarco-v1-passage.ltr.20210519.e25e33f.README.txt diff --git a/pyserini/search/lucene/ltr/__main__.py b/pyserini/search/lucene/ltr/__main__.py index 1278487c3..4bef7c9eb 100644 --- a/pyserini/search/lucene/ltr/__main__.py +++ b/pyserini/search/lucene/ltr/__main__.py @@ -14,19 +14,13 @@ # limitations under the License. # -import sys - -# We're going to explicitly use a local installation of Pyserini (as opposed to a pip-installed one). -# Comment these lines out to use a pip-installed one instead. -sys.path.insert(0, './') - import argparse import numpy as np import pandas as pd + from tqdm import tqdm from collections import defaultdict from transformers import AutoTokenizer -from pyserini.search.lucene.ltr._search_msmarco import MsmarcoLtrSearcher from pyserini.search.lucene.ltr import * from pyserini.search.lucene import LuceneSearcher from pyserini.analysis import Analyzer, get_lucene_analyzer @@ -34,36 +28,37 @@ """ Running prediction on candidates """ + + def dev_data_loader(file, format, topic, rerank, prebuilt, qrel, granularity, top=1000): if rerank: if format == 'tsv': - dev = pd.read_csv(file, sep="\t", - names=['qid', 'pid', 'rank'], - dtype={'qid': 'S','pid': 'S', 'rank':'i',}) + dev = pd.read_csv(file, sep="\t", names=['qid', 'pid', 'rank'], + dtype={'qid': 'S', 'pid': 'S', 'rank': 'i'}) elif format == 'trec': dev = pd.read_csv(file, sep="\s+", - names=['qid', 'q0', 'pid', 'rank', 'score', 'tag'], - usecols=['qid', 'pid', 'rank'], - dtype={'qid': 'S','pid': 'S', 'rank':'i',}) + names=['qid', 'q0', 'pid', 'rank', 'score', 'tag'], + usecols=['qid', 'pid', 'rank'], + dtype={'qid': 'S', 'pid': 'S', 'rank': 'i', }) else: raise Exception('unknown parameters') assert dev['qid'].dtype == object assert dev['pid'].dtype == object assert dev['rank'].dtype == np.int32 - dev = dev[dev['rank']<=top] + dev = dev[dev['rank'] <= top] else: if prebuilt: bm25search = LuceneSearcher.from_prebuilt_index(args.index) else: bm25search = LuceneSearcher(args.index) bm25search.set_bm25(0.82, 0.68) - dev_dic = {"qid":[], "pid":[], "rank":[]} + dev_dic = {"qid": [], "pid": [], "rank": []} for topic in tqdm(queries.keys()): query_text = queries[topic]['raw'] bm25_dev = bm25search.search(query_text, args.hits) doc_ids = [bm25_result.docid for bm25_result in bm25_dev] qid = [topic for _ in range(len(doc_ids))] - rank = [i for i in range(1, len(doc_ids)+1)] + rank = [i for i in range(1, len(doc_ids) + 1)] dev_dic['qid'].extend(qid) dev_dic['pid'].extend(doc_ids) dev_dic['rank'].extend(rank) @@ -74,8 +69,8 @@ def dev_data_loader(file, format, topic, rerank, prebuilt, qrel, granularity, to else: seperation = " " dev_qrel = pd.read_csv(qrel, sep=seperation, - names=["qid", "q0", "pid", "rel"], usecols=['qid', 'pid', 'rel'], - dtype={'qid': 'S','pid': 'S', 'rel':'i'}) + names=["qid", "q0", "pid", "rel"], usecols=['qid', 'pid', 'rel'], + dtype={'qid': 'S', 'pid': 'S', 'rel': 'i'}) dev = dev.merge(dev_qrel, left_on=['qid', 'pid'], right_on=['qid', 'pid'], how='left') dev['rel'] = dev['rel'].fillna(0).astype(np.int32) dev = dev.sort_values(['qid', 'pid']).set_index(['qid', 'pid']) @@ -138,11 +133,11 @@ def query_loader(topic): print(analyzed) query_toks = query_lemmas.split() if len(query_toks) >= 0: - query = {"raw" : query, - "text": query_lemmas.split(' '), - "text_unlemm": query_unlemm.split(' '), - "analyzed": analyzed, - "text_bert_tok": bert_tokenizer.tokenize(query.lower())} + query = {"raw": query, + "text": query_lemmas.split(' '), + "text_unlemm": query_unlemm.split(' '), + "analyzed": analyzed, + "text_bert_tok": bert_tokenizer.tokenize(query.lower())} queries[did] = query if ln % 10000 == 0: @@ -189,7 +184,7 @@ def eval_recall(dev_qrel, dev_data): score_tie_counter = 0 score_tie_query = set() - recall_point = [10,20,50,100,200,250,300,333,400,500,1000] + recall_point = [10, 20, 50, 100, 200, 250, 300, 333, 400, 500, 1000] recall_curve = {k: [] for k in recall_point} for qid, group in tqdm(dev_data.groupby('qid')): group = group.reset_index() @@ -230,7 +225,7 @@ def eval_recall(dev_qrel, dev_data): def output(file, dev_data, format, maxp): score_tie_counter = 0 score_tie_query = set() - output_file = open(file,'w') + output_file = open(file, 'w') results = defaultdict(dict) idx = 0 for qid, group in tqdm(dev_data.groupby('qid')): @@ -250,14 +245,13 @@ def output(file, dev_data, format, maxp): results[qid][docid] = t.score else: results[qid][t.pid] = t.score - for qid in tqdm(results.keys()): rank = 1 docid_score = results[qid] - docid_score = sorted(docid_score.items(),key=lambda kv: kv[1], reverse=True) + docid_score = sorted(docid_score.items(), key=lambda kv: kv[1], reverse=True) for docid, score in docid_score: - if format=='trec': + if format == 'trec': output_file.write(f"{qid}\tQ0\t{docid}\t{rank}\t{score}\tltr\n") else: output_file.write(f"{qid}\t{docid}\t{rank}\n") @@ -265,11 +259,12 @@ def output(file, dev_data, format, maxp): score_tie = f'score_tie occurs {score_tie_counter} times in {len(score_tie_query)} queries' print(score_tie) + if __name__ == "__main__": parser = argparse.ArgumentParser(description='Learning to rank reranking') parser.add_argument('--input', default='') parser.add_argument('--hits', type=int, default=1000) - parser.add_argument('--input-format', default = 'trec') + parser.add_argument('--input-format', default='trec') parser.add_argument('--model', required=True) parser.add_argument('--index', required=True) parser.add_argument('--output', required=True) @@ -284,8 +279,9 @@ def output(file, dev_data, format, maxp): args = parser.parse_args() queries = query_loader(args.topic) print("---------------------loading dev----------------------------------------") - prebuilt = args.index == 'msmarco-passage-ltr' or args.index == 'msmarco-doc-per-passage-ltr' - dev, dev_qrel = dev_data_loader(args.input, args.input_format, args.topic, args.rerank, prebuilt, args.qrel, args.granularity, args.hits) + prebuilt = args.index == 'msmarco-v1-passage.ltr' or args.index == 'msmarco-v1-doc-segmented.ltr' + dev, dev_qrel = dev_data_loader(args.input, args.input_format, args.topic, args.rerank, prebuilt, args.qrel, + args.granularity, args.hits) searcher = MsmarcoLtrSearcher(args.model, args.ibm_model, args.index, args.granularity, prebuilt, args.topic) searcher.add_fe() batch_info = searcher.search(dev, queries) @@ -293,5 +289,5 @@ def output(file, dev_data, format, maxp): eval_res = eval_mrr(batch_info) eval_recall(dev_qrel, batch_info) - output(args.output, batch_info,args.output_format, args.max_passage) - print('Done!') \ No newline at end of file + output(args.output, batch_info, args.output_format, args.max_passage) + print('Done!') diff --git a/pyserini/search/lucene/ltr/_search_msmarco.py b/pyserini/search/lucene/ltr/_search_msmarco.py index 7b1bb9504..24f9ea9aa 100644 --- a/pyserini/search/lucene/ltr/_search_msmarco.py +++ b/pyserini/search/lucene/ltr/_search_msmarco.py @@ -43,9 +43,9 @@ def __init__(self, model: str, ibm_model:str, index:str, data: str, prebuilt: bo self.lucene_searcher = LuceneSearcher.from_prebuilt_index(index) index_directory = os.path.join(get_cache_home(), 'indexes') if data == 'passage': - index_path = os.path.join(index_directory, 'index-msmarco-passage-ltr-20210519-e25e33f.a5de642c268ac1ed5892c069bdc29ae3') + index_path = os.path.join(index_directory, 'lucene-inverted.msmarco-v1-passage.ltr.20210519.e25e33f.5da425ca44d2e3e5c38a7f564f13ad23') else: - index_path = os.path.join(index_directory, 'index-msmarco-doc-per-passage-ltr-20211031-33e4151.bd60e89041b4ebbabc4bf0cfac608a87') + index_path = os.path.join(index_directory, 'lucene-inverted.msmarco-v1-doc-segmented.ltr.20211031.33e4151.86f108d8441b6845f8caf1208dd7ac7a') self.index_reader = IndexReader.from_prebuilt_index(index) else: index_path = index @@ -53,7 +53,6 @@ def __init__(self, model: str, ibm_model:str, index:str, data: str, prebuilt: bo self.fe = FeatureExtractor(index_path, max(multiprocessing.cpu_count()//2, 1)) self.data = data - def add_fe(self): #self.fe.add(RunList('collections/msmarco-ltr-passage/run.monot5.run_list.whole.trec','t5')) #self.fe.add(RunList('../bert.whole.doc.trec','bert')) diff --git a/tests/test_prebuilt_index.py b/tests/test_prebuilt_index.py index 0a14f7c31..f1d0bbee9 100644 --- a/tests/test_prebuilt_index.py +++ b/tests/test_prebuilt_index.py @@ -43,8 +43,8 @@ def test_tf_mrtydi(self): for url in TF_INDEX_INFO[key]['urls']: urls.append(url) - # 11 languages - self.assertEqual(cnt, 11) + # 11 languages, but two entries for each language from aliases (e.g., arabic and ar) + self.assertEqual(cnt, 22) self._test_urls(urls) def test_tf_miracl(self):