From 9863611d307773c086e64496a2a94cf6599c28b0 Mon Sep 17 00:00:00 2001 From: Jimmy Lin Date: Sat, 20 Apr 2024 10:01:47 -0400 Subject: [PATCH] Add bindings for MS MARCO V2.1 prebuilt indexes + qrels (#2459) --- src/main/java/io/anserini/eval/Qrels.java | 5 + .../java/io/anserini/index/IndexInfo.java | 14 ++ .../anserini/eval/RelevanceJudgmentsTest.java | 137 ++++++++++++++++-- .../io/anserini/index/PrebuiltIndexTest.java | 2 +- tools | 2 +- 5 files changed, 142 insertions(+), 18 deletions(-) diff --git a/src/main/java/io/anserini/eval/Qrels.java b/src/main/java/io/anserini/eval/Qrels.java index 1a563abf50..a026b55112 100644 --- a/src/main/java/io/anserini/eval/Qrels.java +++ b/src/main/java/io/anserini/eval/Qrels.java @@ -50,12 +50,17 @@ public enum Qrels { TREC2022_DL_PASSAGE("qrels.dl22-passage.txt"), TREC2023_DL_DOC("qrels.dl23-doc.txt"), TREC2023_DL_PASSAGE("qrels.dl23-passage.txt"), + TREC2021_DL_DOC_MSMARCO_V21("qrels.dl21-doc-msmarco-v2.1.txt"), + TREC2022_DL_DOC_MSMARCO_V21("qrels.dl22-doc-msmarco-v2.1.txt"), + TREC2023_DL_DOC_MSMARCO_V21("qrels.dl23-doc-msmarco-v2.1.txt"), MSMARCO_DOC_DEV("qrels.msmarco-doc.dev.txt"), MSMARCO_PASSAGE_DEV_SUBSET("qrels.msmarco-passage.dev-subset.txt"), MSMARCO_V2_DOC_DEV("qrels.msmarco-v2-doc.dev.txt"), MSMARCO_V2_DOC_DEV2("qrels.msmarco-v2-doc.dev2.txt"), MSMARCO_V2_PASSAGE_DEV("qrels.msmarco-v2-passage.dev.txt"), MSMARCO_V2_PASSAGE_DEV2("qrels.msmarco-v2-passage.dev2.txt"), + MSMARCO_V21_DOC_DEV("qrels.msmarco-v2.1-doc.dev.txt"), + MSMARCO_V21_DOC_DEV2("qrels.msmarco-v2.1-doc.dev2.txt"), NTCIR8_ZH("qrels.ntcir8.eval.txt"), CLEF2006_FR("qrels.clef06fr.txt"), TREC2002_AR("qrels.trec02ar.txt"), diff --git a/src/main/java/io/anserini/index/IndexInfo.java b/src/main/java/io/anserini/index/IndexInfo.java index 251b5f6e29..0d168d8431 100644 --- a/src/main/java/io/anserini/index/IndexInfo.java +++ b/src/main/java/io/anserini/index/IndexInfo.java @@ -103,6 +103,20 @@ public enum IndexInfo { "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-doc-segmented.20220808.4d6d2a.tar.gz" }, "8a5f444fa5a63cc5d4ddc3e6dd15faa0"), + MSMARCO_V21_DOC("msmarco-v2.1-doc", + "Lucene index of the MS MARCO V2.1 document corpus.", + "lucene-inverted.msmarco-v2.1-doc.20240418.4f9675.tar.gz", + new String[] { + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.msmarco-v2.1-doc.20240418.4f9675.tar.gz" }, + "cecd55856c34afa82f1a499705c9df02"), + + MSMARCO_V21_DOC_SEGMENTED("msmarco-v2.1-doc-segmented", + "Lucene index of the MS MARCO V2.1 segmented document corpus.", + "lucene-inverted.msmarco-v2.1-doc-segmented.20240418.4f9675.tar.gz", + new String[] { + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.msmarco-v2.1-doc-segmented.20240418.4f9675.tar.gz" }, + "6ec4cd595c9fe1ad91b43eabb39a637c"), + // BEIR: flat BEIR_V1_0_0_TREC_COVID_FLAT("beir-v1.0.0-trec-covid.flat", "Lucene inverted 'flat' index of BEIR collection 'trec-covid'.", diff --git a/src/test/java/io/anserini/eval/RelevanceJudgmentsTest.java b/src/test/java/io/anserini/eval/RelevanceJudgmentsTest.java index b96c6dcdff..9e36642e40 100644 --- a/src/test/java/io/anserini/eval/RelevanceJudgmentsTest.java +++ b/src/test/java/io/anserini/eval/RelevanceJudgmentsTest.java @@ -36,6 +36,11 @@ public int getQrelsCount(RelevanceJudgments qrels) throws IOException{ return count; } + @Test + public void testTotalCount() { + assertEquals(169, Qrels.values().length); + } + @Test(expected = IOException.class) public void testFileNotFound() throws IOException{ // Purposely read non-existent file. @@ -205,6 +210,28 @@ public void testTrec21DLPassage() throws IOException{ assertEquals(1, qrels.getRelevanceGrade("1129560", "msmarco_passage_67_937656589")); } + @Test + public void testTrec21DLDocMsMarcoV21() throws IOException{ + // % cut -f 1 -d ' ' tools/topics-and-qrels/qrels.dl21-doc-msmarco-v2.1.txt | sort | uniq | wc + // 57 57 412 + // % wc tools/topics-and-qrels/qrels.dl21-doc-msmarco-v2.1.txt + // 10973 43892 456277 tools/topics-and-qrels/qrels.dl21-doc-msmarco-v2.1.txt + + RelevanceJudgments qrels = new RelevanceJudgments("tools/topics-and-qrels/qrels.dl21-doc-msmarco-v2.1.txt"); + assertNotNull(qrels); + assertEquals(57, qrels.getQids().size()); + assertEquals(10973, getQrelsCount(qrels)); + assertEquals(2, qrels.getRelevanceGrade("2082", "msmarco_v2.1_doc_01_1281570012")); + assertEquals(2, qrels.getRelevanceGrade("1128632", "msmarco_v2.1_doc_17_481617788")); + + qrels = RelevanceJudgments.fromQrels(Qrels.TREC2021_DL_DOC_MSMARCO_V21); + assertNotNull(qrels); + assertEquals(57, qrels.getQids().size()); + assertEquals(10973, getQrelsCount(qrels)); + assertEquals(2, qrels.getRelevanceGrade("2082", "msmarco_v2.1_doc_01_1281570012")); + assertEquals(2, qrels.getRelevanceGrade("1128632", "msmarco_v2.1_doc_17_481617788")); + } + @Test public void testTrec22DLDoc() throws IOException{ // % cut -f 1 -d ' ' tools/topics-and-qrels/qrels.dl22-doc.txt | uniq | wc @@ -249,6 +276,28 @@ public void testTrec22DLPassage() throws IOException{ assertEquals(1, qrels.getRelevanceGrade("2056323", "msmarco_passage_68_715747739")); } + @Test + public void testTrec22DLDocMsMarcoV21() throws IOException{ + // % cut -f 1 -d ' ' tools/topics-and-qrels/qrels.dl22-doc-msmarco-v2.1.txt | sort | uniq | wc + // 76 76 608 + // % wc tools/topics-and-qrels/qrels.dl22-doc-msmarco-v2.1.txt + // 349541 1398164 14786970 tools/topics-and-qrels/qrels.dl22-doc-msmarco-v2.1.txt + + RelevanceJudgments qrels = new RelevanceJudgments("tools/topics-and-qrels/qrels.dl22-doc-msmarco-v2.1.txt"); + assertNotNull(qrels); + assertEquals(76, qrels.getQids().size()); + assertEquals(349541, getQrelsCount(qrels)); + assertEquals(1, qrels.getRelevanceGrade("2000511", "msmarco_v2.1_doc_00_896525856")); + assertEquals(2, qrels.getRelevanceGrade("2056158", "msmarco_v2.1_doc_06_934688453")); + + qrels = RelevanceJudgments.fromQrels(Qrels.TREC2022_DL_DOC_MSMARCO_V21); + assertNotNull(qrels); + assertEquals(76, qrels.getQids().size()); + assertEquals(349541, getQrelsCount(qrels)); + assertEquals(1, qrels.getRelevanceGrade("2000511", "msmarco_v2.1_doc_00_896525856")); + assertEquals(2, qrels.getRelevanceGrade("2056158", "msmarco_v2.1_doc_06_934688453")); + } + @Test public void testTrec23DLDoc() throws IOException{ // % cut -f 1 -d ' ' tools/topics-and-qrels/qrels.dl23-doc.txt | uniq | wc @@ -293,6 +342,28 @@ public void testTrec23DLPassage() throws IOException{ assertEquals(2, qrels.getRelevanceGrade("3100922", "msmarco_passage_22_487548813")); } + @Test + public void testTrec23DLDocMsMarcoV21() throws IOException{ + // % cut -f 1 -d ' ' tools/topics-and-qrels/qrels.dl23-doc-msmarco-v2.1.txt | uniq | wc + // 82 82 656 + // % wc tools/topics-and-qrels/qrels.dl23-doc-msmarco-v2.1.txt + // 15995 63980 677618 tools/topics-and-qrels/qrels.dl23-doc-msmarco-v2.1.txt + + RelevanceJudgments qrels = new RelevanceJudgments("tools/topics-and-qrels/qrels.dl23-doc-msmarco-v2.1.txt"); + assertNotNull(qrels); + assertEquals(82, qrels.getQids().size()); + assertEquals(15995, getQrelsCount(qrels)); + assertEquals(1, qrels.getRelevanceGrade("2001010", "msmarco_v2.1_doc_00_1372241967")); + assertEquals(2, qrels.getRelevanceGrade("3100922", "msmarco_v2.1_doc_19_1982402861")); + + qrels = RelevanceJudgments.fromQrels(Qrels.TREC2023_DL_DOC_MSMARCO_V21); + assertNotNull(qrels); + assertEquals(82, qrels.getQids().size()); + assertEquals(15995, getQrelsCount(qrels)); + assertEquals(1, qrels.getRelevanceGrade("2001010", "msmarco_v2.1_doc_00_1372241967")); + assertEquals(2, qrels.getRelevanceGrade("3100922", "msmarco_v2.1_doc_19_1982402861")); + } + @Test public void testMsmarcoDocDev() throws IOException{ RelevanceJudgments qrels = new RelevanceJudgments("tools/topics-and-qrels/qrels.msmarco-doc.dev.txt"); @@ -328,37 +399,37 @@ public void testMsmarcoPassageDevSubset() throws IOException{ } @Test - public void testMsmarcoV2DocDev() throws IOException{ - RelevanceJudgments qrels = new RelevanceJudgments("tools/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt"); + public void testMsmarcoV2DocDevMsMarcoV21() throws IOException{ + RelevanceJudgments qrels = new RelevanceJudgments("tools/topics-and-qrels/qrels.msmarco-v2.1-doc.dev.txt"); assertNotNull(qrels); assertEquals(4552, qrels.getQids().size()); assertEquals(4702, getQrelsCount(qrels)); - assertEquals(1, qrels.getRelevanceGrade("1000000", "msmarco_doc_17_2560009121")); - assertEquals(1, qrels.getRelevanceGrade("999942", "msmarco_doc_06_956348348")); + assertEquals(1, qrels.getRelevanceGrade("1000000", "msmarco_v2.1_doc_17_1968189952")); + assertEquals(1, qrels.getRelevanceGrade("999897", "msmarco_v2.1_doc_46_191673440")); - qrels = RelevanceJudgments.fromQrels(Qrels.MSMARCO_V2_DOC_DEV); + qrels = RelevanceJudgments.fromQrels(Qrels.MSMARCO_V21_DOC_DEV); assertNotNull(qrels); assertEquals(4552, qrels.getQids().size()); assertEquals(4702, getQrelsCount(qrels)); - assertEquals(1, qrels.getRelevanceGrade("1000000", "msmarco_doc_17_2560009121")); - assertEquals(1, qrels.getRelevanceGrade("999942", "msmarco_doc_06_956348348")); + assertEquals(1, qrels.getRelevanceGrade("1000000", "msmarco_v2.1_doc_17_1968189952")); + assertEquals(1, qrels.getRelevanceGrade("999897", "msmarco_v2.1_doc_46_191673440")); } @Test - public void testMsmarcoV2DocDev2() throws IOException{ - RelevanceJudgments qrels = new RelevanceJudgments("tools/topics-and-qrels/qrels.msmarco-v2-doc.dev2.txt"); + public void testMsmarcoV2DocDev2MsMarcoV21() throws IOException{ + RelevanceJudgments qrels = new RelevanceJudgments("tools/topics-and-qrels/qrels.msmarco-v2.1-doc.dev2.txt"); assertNotNull(qrels); assertEquals(5000, qrels.getQids().size()); - assertEquals(5178, getQrelsCount(qrels)); - assertEquals(1, qrels.getRelevanceGrade("1000202", "msmarco_doc_08_73026062")); - assertEquals(1, qrels.getRelevanceGrade("999937", "msmarco_doc_05_319743607")); + assertEquals(5177, getQrelsCount(qrels)); + assertEquals(1, qrels.getRelevanceGrade("1000202", "msmarco_v2.1_doc_08_69146701")); + assertEquals(1, qrels.getRelevanceGrade("999659", "msmarco_v2.1_doc_08_1247437925")); - qrels = RelevanceJudgments.fromQrels(Qrels.MSMARCO_V2_DOC_DEV2); + qrels = RelevanceJudgments.fromQrels(Qrels.MSMARCO_V21_DOC_DEV2); assertNotNull(qrels); assertEquals(5000, qrels.getQids().size()); - assertEquals(5178, getQrelsCount(qrels)); - assertEquals(1, qrels.getRelevanceGrade("1000202", "msmarco_doc_08_73026062")); - assertEquals(1, qrels.getRelevanceGrade("999937", "msmarco_doc_05_319743607")); + assertEquals(5177, getQrelsCount(qrels)); + assertEquals(1, qrels.getRelevanceGrade("1000202", "msmarco_v2.1_doc_08_69146701")); + assertEquals(1, qrels.getRelevanceGrade("999659", "msmarco_v2.1_doc_08_1247437925")); } @Test @@ -395,6 +466,40 @@ public void testMsmarcoV2DocPassage2() throws IOException{ assertEquals(1, qrels.getRelevanceGrade("961297", "msmarco_passage_18_858458289")); } + @Test + public void testMsmarcoV2DocDev() throws IOException{ + RelevanceJudgments qrels = new RelevanceJudgments("tools/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt"); + assertNotNull(qrels); + assertEquals(4552, qrels.getQids().size()); + assertEquals(4702, getQrelsCount(qrels)); + assertEquals(1, qrels.getRelevanceGrade("1000000", "msmarco_doc_17_2560009121")); + assertEquals(1, qrels.getRelevanceGrade("999942", "msmarco_doc_06_956348348")); + + qrels = RelevanceJudgments.fromQrels(Qrels.MSMARCO_V2_DOC_DEV); + assertNotNull(qrels); + assertEquals(4552, qrels.getQids().size()); + assertEquals(4702, getQrelsCount(qrels)); + assertEquals(1, qrels.getRelevanceGrade("1000000", "msmarco_doc_17_2560009121")); + assertEquals(1, qrels.getRelevanceGrade("999942", "msmarco_doc_06_956348348")); + } + + @Test + public void testMsmarcoV2DocDev2() throws IOException{ + RelevanceJudgments qrels = new RelevanceJudgments("tools/topics-and-qrels/qrels.msmarco-v2-doc.dev2.txt"); + assertNotNull(qrels); + assertEquals(5000, qrels.getQids().size()); + assertEquals(5178, getQrelsCount(qrels)); + assertEquals(1, qrels.getRelevanceGrade("1000202", "msmarco_doc_08_73026062")); + assertEquals(1, qrels.getRelevanceGrade("999937", "msmarco_doc_05_319743607")); + + qrels = RelevanceJudgments.fromQrels(Qrels.MSMARCO_V2_DOC_DEV2); + assertNotNull(qrels); + assertEquals(5000, qrels.getQids().size()); + assertEquals(5178, getQrelsCount(qrels)); + assertEquals(1, qrels.getRelevanceGrade("1000202", "msmarco_doc_08_73026062")); + assertEquals(1, qrels.getRelevanceGrade("999937", "msmarco_doc_05_319743607")); + } + @Test public void testCore17() throws IOException{ RelevanceJudgments qrels = new RelevanceJudgments("tools/topics-and-qrels/qrels.core17.txt"); diff --git a/src/test/java/io/anserini/index/PrebuiltIndexTest.java b/src/test/java/io/anserini/index/PrebuiltIndexTest.java index 3313761b8e..c1c0ff18dc 100644 --- a/src/test/java/io/anserini/index/PrebuiltIndexTest.java +++ b/src/test/java/io/anserini/index/PrebuiltIndexTest.java @@ -60,6 +60,6 @@ public void testUrls() { // test number of prebuilt-indexes @Test public void testNumPrebuiltIndexes() { - assertEquals(128, IndexInfo.values().length); + assertEquals(130, IndexInfo.values().length); } } diff --git a/tools b/tools index 6841ccdbe4..ab5102721d 160000 --- a/tools +++ b/tools @@ -1 +1 @@ -Subproject commit 6841ccdbe4ca6d39549c794396d365d68d279715 +Subproject commit ab5102721dbaf01c00e190fb342aa0cf83f7c9b5