Refactor regressions (#1671)

+ "threads" and "batch-size" parameters:
  + consistent formatting
  + settable externally in code (rather than hard-coded in yaml)
+ better logging information for running times
+ run_jobs_with_load.py won't spawn too many concurrent jobs, configurable parameter.
castorini · Oct 13, 2023 · f1d623c · f1d623c
1 parent ce98af4
commit f1d623c
Show file tree

Hide file tree

Showing 22 changed files with 1,700 additions and 1,214 deletions.
diff --git a/docs/2cr/beir.html b/docs/2cr/beir.html
diff --git a/docs/2cr/miracl.html b/docs/2cr/miracl.html
diff --git a/docs/2cr/mrtydi.html b/docs/2cr/mrtydi.html
diff --git a/docs/2cr/msmarco-v1-doc.html b/docs/2cr/msmarco-v1-doc.html
diff --git a/docs/2cr/msmarco-v1-passage.html b/docs/2cr/msmarco-v1-passage.html
@@ -2447,7 +2447,7 @@ <h1 class="mb-3">MS MARCO V1 Passage</h1>
 <td>0.7280</td>
 <td>0.9069</td>
 <td></td>
-<td>0.3300</td>
+<td>0.3301</td>
 <td>0.9811</td>
 </tr>
 <tr class="hide-table-padding">
@@ -5324,7 +5324,7 @@ <h1 class="mb-3">MS MARCO V1 Passage</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.lucene \
-  --threads 16 --batch 128 \
+  --threads 16 --batch-size 128 \
   --index msmarco-v1-passage-slimr \
   --topics dl19-passage \
   --encoder castorini/slimr-msmarco-passage \
@@ -5350,7 +5350,7 @@ <h1 class="mb-3">MS MARCO V1 Passage</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.lucene \
-  --threads 16 --batch 128 \
+  --threads 16 --batch-size 128 \
   --index msmarco-v1-passage-slimr \
   --topics dl20 \
   --encoder castorini/slimr-msmarco-passage \
@@ -5376,7 +5376,7 @@ <h1 class="mb-3">MS MARCO V1 Passage</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.lucene \
-  --threads 16 --batch 128 \
+  --threads 16 --batch-size 128 \
   --index msmarco-v1-passage-slimr \
   --topics msmarco-passage-dev-subset \
   --encoder castorini/slimr-msmarco-passage \
@@ -5442,7 +5442,7 @@ <h1 class="mb-3">MS MARCO V1 Passage</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.lucene \
-  --threads 16 --batch 128 \
+  --threads 16 --batch-size 128 \
   --index msmarco-v1-passage-slimr-pp \
   --topics dl19-passage \
   --encoder castorini/slimr-pp-msmarco-passage \
@@ -5468,7 +5468,7 @@ <h1 class="mb-3">MS MARCO V1 Passage</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.lucene \
-  --threads 16 --batch 128 \
+  --threads 16 --batch-size 128 \
   --index msmarco-v1-passage-slimr-pp \
   --topics dl20 \
   --encoder castorini/slimr-pp-msmarco-passage \
@@ -5494,7 +5494,7 @@ <h1 class="mb-3">MS MARCO V1 Passage</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.lucene \
-  --threads 16 --batch 128 \
+  --threads 16 --batch-size 128 \
   --index msmarco-v1-passage-slimr-pp \
   --topics msmarco-passage-dev-subset \
   --encoder castorini/slimr-pp-msmarco-passage \

diff --git a/docs/2cr/msmarco-v2-passage.html b/docs/2cr/msmarco-v2-passage.html
@@ -1464,17 +1464,17 @@ <h1 class="mb-3">MS MARCO V2 Passage</h1>
 <td class="expand-button"></td>
 <td></td>
 <td style="min-width: 400px">SLIM++ (norefine, tau=0.5, min_idf=1)</td>
-<td>0.2819</td>
-<td>0.6340</td>
+<td>0.2820</td>
+<td>0.6337</td>
 <td>0.7554</td>
-<td>0.5092</td>
-<td>0.8392</td>
+<td>0.5093</td>
+<td>0.8389</td>
 <td></td>
 <td>0.1915</td>
-<td>0.8707</td>
+<td>0.8710</td>
 <td></td>
-<td>0.1904</td>
-<td>0.8683</td>
+<td>0.1901</td>
+<td>0.8681</td>
 </tr>
 <tr class="hide-table-padding">
 <td></td>

diff --git a/docs/2cr/odqa.html b/docs/2cr/odqa.html
@@ -188,8 +188,7 @@ <h1 class="mb-3">Retrieval for Open-Domain QA Datasets</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.lucene \
-  --threads 16 \
-  --batch-size 512 \
+  --threads 16 --batch-size 128 \
   --index wikipedia-dpr-100w \
   --topics dpr-trivia-test \
   --output run.odqa.BM25-k1_0.9_b_0.4.dpr-trivia-test.hits-100.txt \
@@ -221,8 +220,7 @@ <h1 class="mb-3">Retrieval for Open-Domain QA Datasets</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.lucene \
-  --threads 16 \
-  --batch-size 512 \
+  --threads 16 --batch-size 128 \
   --index wikipedia-dpr-100w \
   --topics nq-test \
   --output run.odqa.BM25-k1_0.9_b_0.4.nq-test.hits-100.txt \
@@ -288,8 +286,7 @@ <h1 class="mb-3">Retrieval for Open-Domain QA Datasets</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.lucene \
-  --threads 16 \
-  --batch-size 512 \
+  --threads 16 --batch-size 128 \
   --index wikipedia-dpr-100w \
   --topics dpr-trivia-test \
   --output run.odqa.BM25-k1_0.9_b_0.4_dpr-topics.dpr-trivia-test.hits-100.txt \
@@ -321,8 +318,7 @@ <h1 class="mb-3">Retrieval for Open-Domain QA Datasets</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.lucene \
-  --threads 16 \
-  --batch-size 512 \
+  --threads 16 --batch-size 128 \
   --index wikipedia-dpr-100w \
   --topics dpr-nq-test \
   --output run.odqa.BM25-k1_0.9_b_0.4_dpr-topics.dpr-nq-test.hits-100.txt \
@@ -388,8 +384,7 @@ <h1 class="mb-3">Retrieval for Open-Domain QA Datasets</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.lucene \
-  --threads 16 \
-  --batch-size 512 \
+  --threads 16 --batch-size 128 \
   --index wikipedia-dpr-100w \
   --topics dpr-trivia-test-gar-t5-answers \
   --output run.odqa.GarT5-RRF.dpr-trivia-test.answers.hits-1000.txt \
@@ -399,8 +394,7 @@ <h1 class="mb-3">Retrieval for Open-Domain QA Datasets</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.lucene \
-  --threads 16 \
-  --batch-size 512 \
+  --threads 16 --batch-size 128 \
   --index wikipedia-dpr-100w \
   --topics dpr-trivia-test-gar-t5-titles \
   --output run.odqa.GarT5-RRF.dpr-trivia-test.titles.hits-1000.txt \
@@ -410,8 +404,7 @@ <h1 class="mb-3">Retrieval for Open-Domain QA Datasets</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.lucene \
-  --threads 16 \
-  --batch-size 512 \
+  --threads 16 --batch-size 128 \
   --index wikipedia-dpr-100w \
   --topics dpr-trivia-test-gar-t5-sentences \
   --output run.odqa.GarT5-RRF.dpr-trivia-test.sentences.hits-1000.txt \
@@ -453,8 +446,7 @@ <h1 class="mb-3">Retrieval for Open-Domain QA Datasets</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.lucene \
-  --threads 16 \
-  --batch-size 512 \
+  --threads 16 --batch-size 128 \
   --index wikipedia-dpr-100w \
   --topics nq-test-gar-t5-answers \
   --output run.odqa.GarT5-RRF.nq-test.answers.hits-1000.txt \
@@ -464,8 +456,7 @@ <h1 class="mb-3">Retrieval for Open-Domain QA Datasets</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.lucene \
-  --threads 16 \
-  --batch-size 512 \
+  --threads 16 --batch-size 128 \
   --index wikipedia-dpr-100w \
   --topics nq-test-gar-t5-titles \
   --output run.odqa.GarT5-RRF.nq-test.titles.hits-1000.txt \
@@ -475,8 +466,7 @@ <h1 class="mb-3">Retrieval for Open-Domain QA Datasets</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.lucene \
-  --threads 16 \
-  --batch-size 512 \
+  --threads 16 --batch-size 128 \
   --index wikipedia-dpr-100w \
   --topics nq-test-gar-t5-sentences \
   --output run.odqa.GarT5-RRF.nq-test.sentences.hits-1000.txt \
@@ -552,8 +542,7 @@ <h1 class="mb-3">Retrieval for Open-Domain QA Datasets</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.faiss \
-  --threads 16 \
-  --batch-size 512 \
+  --threads 16 --batch-size 512 \
   --index wikipedia-dpr-100w.dpr-multi \
   --encoder facebook/dpr-question_encoder-multiset-base \
   --topics dpr-trivia-test \
@@ -585,8 +574,7 @@ <h1 class="mb-3">Retrieval for Open-Domain QA Datasets</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.faiss \
-  --threads 16 \
-  --batch-size 512 \
+  --threads 16 --batch-size 512 \
   --index wikipedia-dpr-100w.dpr-single-nq \
   --encoder facebook/dpr-question_encoder-single-nq-base \
   --topics nq-test \
@@ -652,8 +640,7 @@ <h1 class="mb-3">Retrieval for Open-Domain QA Datasets</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.faiss \
-  --threads 16 \
-  --batch-size 512 \
+  --threads 16 --batch-size 512 \
   --index wikipedia-dpr-100w.dkrr-tqa \
   --encoder castorini/dkrr-dpr-tqa-retriever \
   --topics dpr-trivia-test \
@@ -685,8 +672,7 @@ <h1 class="mb-3">Retrieval for Open-Domain QA Datasets</h1>
 
   <blockquote class="mycode">
 <pre><code>python -m pyserini.search.faiss \
-  --threads 16 \
-  --batch-size 512 \
+  --threads 16 --batch-size 512 \
   --index wikipedia-dpr-100w.dkrr-nq \
   --encoder castorini/dkrr-dpr-nq-retriever \
   --topics nq-test \
@@ -758,8 +744,7 @@ <h1 class="mb-3">Retrieval for Open-Domain QA Datasets</h1>
  fusion --alpha 0.95 \
  run	--topics dpr-trivia-test \
 	--output run.odqa.DPR-Hybrid.dpr-trivia-test.hits-100.txt \
-	--threads 16 \
-	--batch-size 512 \
+	--threads 16 --batch-size 512 \
 	--hits 1000
 </code></pre></blockquote>
 
@@ -793,8 +778,7 @@ <h1 class="mb-3">Retrieval for Open-Domain QA Datasets</h1>
  fusion --alpha 1.2 \
  run	--topics nq-test \
 	--output run.odqa.DPR-Hybrid.nq-test.hits-100.txt \
-	--threads 16 \
-	--batch-size 512 \
+	--threads 16 --batch-size 512 \
 	--hits 1000
 </code></pre></blockquote>
 

diff --git a/pyserini/2cr/beir.py b/pyserini/2cr/beir.py
@@ -17,16 +17,22 @@
 import argparse
 import math
 import os
-import time
 import sys
+import time
 from collections import defaultdict
+from datetime import datetime
 from string import Template
-import pkg_resources
 
+import pkg_resources
 import yaml
 
 from ._base import run_eval_and_return_metric, ok_str, fail_str
 
+dense_threads = 16
+dense_batch_size = 512
+sparse_threads = 16
+sparse_batch_size = 128
+
 trec_eval_metric_definitions = {
     'nDCG@10': '-c -m ndcg_cut.10',
     'R@100': '-c -m recall.100',
@@ -67,10 +73,11 @@
 
 def format_run_command(raw):
     return raw.replace('--topics', '\\\n  --topics') \
+        .replace('--threads', '\\\n  --threads') \
         .replace('--index', '\\\n  --index') \
-        .replace('--encoder-class', '\\\n --encoder-class') \
+        .replace('--encoder-class', '\\\n  --encoder-class') \
         .replace('--output ', '\\\n  --output ') \
-        .replace('--output-format trec', '\\\n  --output-format trec \\\n ') \
+        .replace('--output-format trec ', '\\\n  --output-format trec ') \
         .replace('--hits ', '\\\n  --hits ')
 
 
@@ -117,7 +124,9 @@ def generate_report(args):
                 dataset = datasets['dataset']
 
                 runfile = os.path.join(args.directory, f'run.beir.{name}.{dataset}.txt')
-                cmd = Template(cmd_template).substitute(dataset=dataset, output=runfile)
+                cmd = Template(cmd_template).substitute(dataset=dataset, output=runfile,
+                                                        sparse_threads=sparse_threads, sparse_batch_size=sparse_batch_size,
+                                                        dense_threads=dense_threads, dense_batch_size=dense_batch_size)
                 commands[dataset][name] = format_run_command(cmd)
 
                 for expected in datasets['scores']:
@@ -192,7 +201,9 @@ def run_conditions(args):
                 print(f'  - dataset: {dataset}')
 
                 runfile = os.path.join(args.directory, f'run.beir.{name}.{dataset}.txt')
-                cmd = Template(cmd_template).substitute(dataset=dataset, output=runfile)
+                cmd = Template(cmd_template).substitute(dataset=dataset, output=runfile,
+                                                        sparse_threads=sparse_threads, sparse_batch_size=sparse_batch_size,
+                                                        dense_threads=dense_threads, dense_batch_size=dense_batch_size)
 
                 if args.display_commands:
                     print(f'\n```bash\n{format_run_command(cmd)}\n```\n')
@@ -216,6 +227,7 @@ def run_conditions(args):
                             table[dataset][name][metric] = score
                         else:
                             table[dataset][name][metric] = expected[metric]
+                    print('')
 
             print('')
 
@@ -264,7 +276,12 @@ def run_conditions(args):
 
     end = time.time()
 
+    start_str = datetime.utcfromtimestamp(start).strftime('%Y-%m-%d %H:%M:%S')
+    end_str = datetime.utcfromtimestamp(end).strftime('%Y-%m-%d %H:%M:%S')
+
     print('\n')
+    print(f'Start time: {start_str}')
+    print(f'End time: {end_str}')
     print(f'Total elapsed time: {end - start:.0f}s ~{(end - start)/3600:.1f}hr')
 
 

diff --git a/pyserini/2cr/beir.yaml b/pyserini/2cr/beir.yaml
@@ -1,6 +1,6 @@
 conditions:
   - name: bm25-flat
-    command: python -m pyserini.search.lucene --index beir-v1.0.0-${dataset}.flat --topics beir-v1.0.0-${dataset}-test --output $output --output-format trec --batch 36 --threads 12 --hits 1000 --bm25 --remove-query
+    command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --index beir-v1.0.0-${dataset}.flat --topics beir-v1.0.0-${dataset}-test --output $output --output-format trec --hits 1000 --bm25 --remove-query
     datasets:
       - dataset: trec-covid
         scores:
@@ -148,7 +148,7 @@ conditions:
             R@100: 0.9253
             R@1000: 0.9767
   - name: bm25-multifield
-    command: python -m pyserini.search.lucene --index beir-v1.0.0-${dataset}.multifield --topics beir-v1.0.0-${dataset}-test --output $output --output-format trec --batch 36 --threads 12 --hits 1000 --bm25 --remove-query --fields contents=1.0 title=1.0
+    command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --index beir-v1.0.0-${dataset}.multifield --topics beir-v1.0.0-${dataset}-test --output $output --output-format trec --hits 1000 --bm25 --remove-query --fields contents=1.0 title=1.0
     datasets:
       - dataset: trec-covid
         scores:
@@ -296,7 +296,7 @@ conditions:
             R@100: 0.9076
             R@1000: 0.9800
   - name: splade-distil-cocodenser-medium
-    command: python -m pyserini.search.lucene --index beir-v1.0.0-${dataset}-splade_distil_cocodenser_medium --topics beir-v1.0.0-${dataset}-test-splade_distil_cocodenser_medium --output $output --output-format trec --batch 36 --threads 12 --hits 1000 --impact --remove-query
+    command: python -m pyserini.search.lucene --threads ${sparse_threads} --batch-size ${sparse_batch_size} --index beir-v1.0.0-${dataset}-splade_distil_cocodenser_medium --topics beir-v1.0.0-${dataset}-test-splade_distil_cocodenser_medium --output $output --output-format trec --hits 1000 --impact --remove-query
     datasets:
       - dataset: trec-covid
         scores:
@@ -444,7 +444,7 @@ conditions:
             R@100: 0.9270
             R@1000: 0.9767
   - name: contriever
-    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/contriever --index beir-v1.0.0-${dataset}.contriever --topics beir-v1.0.0-${dataset}-test --output $output --batch 128 --threads 16 --hits 1000 --remove-query
+    command: python -m pyserini.search.faiss --threads ${dense_threads} --batch-size ${dense_batch_size} --encoder-class contriever --encoder facebook/contriever --index beir-v1.0.0-${dataset}.contriever --topics beir-v1.0.0-${dataset}-test --output $output --hits 1000 --remove-query
     datasets:
       - dataset: trec-covid
         scores:
@@ -592,7 +592,7 @@ conditions:
             R@100: 0.9260
             R@1000: 0.9967
   - name: contriever-msmarco
-    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/contriever-msmarco --index beir-v1.0.0-${dataset}.contriever-msmarco --topics beir-v1.0.0-${dataset}-test --output $output --batch 128 --threads 16 --hits 1000 --remove-query
+    command: python -m pyserini.search.faiss --threads ${dense_threads} --batch-size ${dense_batch_size} --encoder-class contriever --encoder facebook/contriever-msmarco --index beir-v1.0.0-${dataset}.contriever-msmarco --topics beir-v1.0.0-${dataset}-test --output $output --hits 1000 --remove-query
     datasets:
       - dataset: trec-covid
         scores: