-
Notifications
You must be signed in to change notification settings - Fork 385
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
+ Gathered document_encoder_class_map and query_encoder_class_map
+ Fixed a bunch of "FutureWarning: clean_up_tokenization_spaces was not set..." warnings
+ Fixed "Some weights of the model checkpoint ... were not used when initializing" warnings
- Loading branch information
Showing
39 changed files
with
2,635 additions
and
158 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
#!/bin/sh | ||
|
||
# Print the current date/time before the retrieval and evaluation commands start. | ||
date | ||
|
||
## MS MARCO Passage | ||
|
||
# Search the msmarco-v1-passage.ance index with the msmarco-passage-dev-subset | ||
# topics, using the ance-msmarco-passage-dev-subset encoded queries, and write | ||
# the run in msmarco output format. | ||
python -m pyserini.search.faiss \ | ||
--index msmarco-v1-passage.ance \ | ||
--topics msmarco-passage-dev-subset \ | ||
--encoded-queries ance-msmarco-passage-dev-subset \ | ||
--output runs/run.msmarco-passage.ance.tsv \ | ||
--output-format msmarco \ | ||
--batch-size 512 --threads 16 | ||
|
||
# Evaluate the msmarco-format run against msmarco-passage-dev-subset. | ||
python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ | ||
runs/run.msmarco-passage.ance.tsv | ||
|
||
# Convert the .tsv run to a .trec run; the .trec file is the input to trec_eval below. | ||
python -m pyserini.eval.convert_msmarco_run_to_trec_run \ | ||
--input runs/run.msmarco-passage.ance.tsv \ | ||
--output runs/run.msmarco-passage.ance.trec | ||
|
||
# Run trec_eval on the converted run, requesting the recall.1000 and map measures. | ||
python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ | ||
runs/run.msmarco-passage.ance.trec | ||
|
||
## MS MARCO Document | ||
|
||
# Search the msmarco-v1-doc.ance-maxp index with the msmarco-doc-dev topics and | ||
# the ance_maxp-msmarco-doc-dev encoded queries, writing an msmarco-format run. | ||
# NOTE(review): --hits 1000 --max-passage --max-passage-hits 100 presumably | ||
# retrieves 1000 passage hits and aggregates them into 100 document hits; | ||
# confirm against the pyserini.search.faiss options. | ||
python -m pyserini.search.faiss \ | ||
--index msmarco-v1-doc.ance-maxp \ | ||
--topics msmarco-doc-dev \ | ||
--encoded-queries ance_maxp-msmarco-doc-dev \ | ||
--output runs/run.msmarco-doc.passage.ance-maxp.txt \ | ||
--output-format msmarco \ | ||
--batch-size 512 --threads 16 \ | ||
--hits 1000 --max-passage --max-passage-hits 100 | ||
|
||
# Evaluate the msmarco-format run against the msmarco-doc-dev judgments. | ||
python -m pyserini.eval.msmarco_doc_eval \ | ||
--judgments msmarco-doc-dev \ | ||
--run runs/run.msmarco-doc.passage.ance-maxp.txt | ||
|
||
# Convert the .txt run to a .trec run; the .trec file is the input to trec_eval below. | ||
python -m pyserini.eval.convert_msmarco_run_to_trec_run \ | ||
--input runs/run.msmarco-doc.passage.ance-maxp.txt \ | ||
--output runs/run.msmarco-doc.passage.ance-maxp.trec | ||
|
||
# Run trec_eval on the converted run, requesting the recall.100 and map measures. | ||
python -m pyserini.eval.trec_eval -c -mrecall.100 -mmap msmarco-doc-dev \ | ||
runs/run.msmarco-doc.passage.ance-maxp.trec | ||
|
||
## Natural Questions (NQ) | ||
|
||
# Search the wikipedia-dpr-100w.ance-multi index with the dpr-nq-test topics and | ||
# the ance_multi-nq-test encoded queries. No --output-format is passed here; the | ||
# run is written to a .trec file. | ||
python -m pyserini.search.faiss \ | ||
--index wikipedia-dpr-100w.ance-multi \ | ||
--topics dpr-nq-test \ | ||
--encoded-queries ance_multi-nq-test \ | ||
--output runs/run.ance.nq-test.multi.trec \ | ||
--batch-size 512 --threads 16 | ||
|
||
# Convert the .trec run into a DPR retrieval run (.json), given the same topics | ||
# and the wikipedia-dpr index. | ||
python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ | ||
--topics dpr-nq-test \ | ||
--index wikipedia-dpr \ | ||
--input runs/run.ance.nq-test.multi.trec \ | ||
--output runs/run.ance.nq-test.multi.json | ||
|
||
# Evaluate the DPR-format run with --topk 20 100. | ||
python -m pyserini.eval.evaluate_dpr_retrieval \ | ||
--retrieval runs/run.ance.nq-test.multi.json \ | ||
--topk 20 100 | ||
|
||
## Trivia QA | ||
|
||
# Search the wikipedia-dpr-100w.ance-multi index with the dpr-trivia-test topics | ||
# and the ance_multi-trivia-test encoded queries. No --output-format is passed | ||
# here; the run is written to a .trec file. | ||
python -m pyserini.search.faiss \ | ||
--index wikipedia-dpr-100w.ance-multi \ | ||
--topics dpr-trivia-test \ | ||
--encoded-queries ance_multi-trivia-test \ | ||
--output runs/run.ance.trivia-test.multi.trec \ | ||
--batch-size 512 --threads 16 | ||
|
||
# Convert the .trec run into a DPR retrieval run (.json), given the same topics | ||
# and the wikipedia-dpr index. | ||
python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ | ||
--topics dpr-trivia-test \ | ||
--index wikipedia-dpr \ | ||
--input runs/run.ance.trivia-test.multi.trec \ | ||
--output runs/run.ance.trivia-test.multi.json | ||
|
||
# Evaluate the DPR-format run with --topk 20 100. | ||
python -m pyserini.eval.evaluate_dpr_retrieval \ | ||
--retrieval runs/run.ance.trivia-test.multi.json \ | ||
--topk 20 100 | ||
|
||
## | ||
## Everything again, except with on-the-fly encoding | ||
## | ||
|
||
## MS MARCO Passage | ||
|
||
# Same index and topics as the encoded-queries MS MARCO Passage run, but the | ||
# queries are encoded with --encoder castorini/ance-msmarco-passage instead of | ||
# --encoded-queries. The --output path is the same as in that earlier run. | ||
python -m pyserini.search.faiss \ | ||
--index msmarco-v1-passage.ance \ | ||
--topics msmarco-passage-dev-subset \ | ||
--encoder castorini/ance-msmarco-passage \ | ||
--output runs/run.msmarco-passage.ance.tsv \ | ||
--output-format msmarco \ | ||
--batch-size 512 --threads 16 | ||
|
||
# Evaluate the msmarco-format run against msmarco-passage-dev-subset. | ||
python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ | ||
runs/run.msmarco-passage.ance.tsv | ||
|
||
# Convert the .tsv run to a .trec run; the .trec file is the input to trec_eval below. | ||
python -m pyserini.eval.convert_msmarco_run_to_trec_run \ | ||
--input runs/run.msmarco-passage.ance.tsv \ | ||
--output runs/run.msmarco-passage.ance.trec | ||
|
||
# Run trec_eval on the converted run, requesting the recall.1000 and map measures. | ||
python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ | ||
runs/run.msmarco-passage.ance.trec | ||
|
||
## MS MARCO Document | ||
|
||
# Same index, topics and hit settings as the encoded-queries MS MARCO Document | ||
# run, but the queries are encoded with --encoder castorini/ance-msmarco-doc-maxp | ||
# instead of --encoded-queries. The --output path is the same as in that run. | ||
python -m pyserini.search.faiss \ | ||
--index msmarco-v1-doc.ance-maxp \ | ||
--topics msmarco-doc-dev \ | ||
--encoder castorini/ance-msmarco-doc-maxp \ | ||
--output runs/run.msmarco-doc.passage.ance-maxp.txt \ | ||
--output-format msmarco \ | ||
--batch-size 512 --threads 16 \ | ||
--hits 1000 --max-passage --max-passage-hits 100 | ||
|
||
# Evaluate the msmarco-format run against the msmarco-doc-dev judgments. | ||
python -m pyserini.eval.msmarco_doc_eval \ | ||
--judgments msmarco-doc-dev \ | ||
--run runs/run.msmarco-doc.passage.ance-maxp.txt | ||
|
||
# Convert the .txt run to a .trec run; the .trec file is the input to trec_eval below. | ||
python -m pyserini.eval.convert_msmarco_run_to_trec_run \ | ||
--input runs/run.msmarco-doc.passage.ance-maxp.txt \ | ||
--output runs/run.msmarco-doc.passage.ance-maxp.trec | ||
|
||
# Run trec_eval on the converted run, requesting the recall.100 and map measures. | ||
python -m pyserini.eval.trec_eval -c -mrecall.100 -mmap msmarco-doc-dev \ | ||
runs/run.msmarco-doc.passage.ance-maxp.trec | ||
|
||
## Natural Questions (NQ) | ||
|
||
# Same index and topics as the encoded-queries NQ run, but the queries are | ||
# encoded with --encoder castorini/ance-dpr-question-multi instead of | ||
# --encoded-queries. The --output path is the same as in that earlier run. | ||
python -m pyserini.search.faiss \ | ||
--index wikipedia-dpr-100w.ance-multi \ | ||
--topics dpr-nq-test \ | ||
--encoder castorini/ance-dpr-question-multi \ | ||
--output runs/run.ance.nq-test.multi.trec \ | ||
--batch-size 512 --threads 16 | ||
|
||
# Convert the .trec run into a DPR retrieval run (.json), given the same topics | ||
# and the wikipedia-dpr index. | ||
python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ | ||
--topics dpr-nq-test \ | ||
--index wikipedia-dpr \ | ||
--input runs/run.ance.nq-test.multi.trec \ | ||
--output runs/run.ance.nq-test.multi.json | ||
|
||
# Evaluate the DPR-format run with --topk 20 100. | ||
python -m pyserini.eval.evaluate_dpr_retrieval \ | ||
--retrieval runs/run.ance.nq-test.multi.json \ | ||
--topk 20 100 | ||
|
||
## Trivia QA | ||
|
||
# Same index and topics as the encoded-queries Trivia QA run, but the queries | ||
# are encoded with --encoder castorini/ance-dpr-question-multi instead of | ||
# --encoded-queries. The --output path is the same as in that earlier run. | ||
python -m pyserini.search.faiss \ | ||
--index wikipedia-dpr-100w.ance-multi \ | ||
--topics dpr-trivia-test \ | ||
--encoder castorini/ance-dpr-question-multi \ | ||
--output runs/run.ance.trivia-test.multi.trec \ | ||
--batch-size 512 --threads 16 | ||
|
||
# Convert the .trec run into a DPR retrieval run (.json), given the same topics | ||
# and the wikipedia-dpr index. | ||
python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ | ||
--topics dpr-trivia-test \ | ||
--index wikipedia-dpr \ | ||
--input runs/run.ance.trivia-test.multi.trec \ | ||
--output runs/run.ance.trivia-test.multi.json | ||
|
||
# Evaluate the DPR-format run with --topk 20 100. | ||
python -m pyserini.eval.evaluate_dpr_retrieval \ | ||
--retrieval runs/run.ance.trivia-test.multi.json \ | ||
--topk 20 100 | ||
|
||
# Print the current date/time again now that all commands have run. | ||
date |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
#!/bin/sh | ||
|
||
# Print the current date/time before the retrieval and evaluation commands start. | ||
date | ||
|
||
# Search the wikipedia-dpr-100w.bpr-single-nq index with the dpr-nq-test topics | ||
# and the bpr_single_nq-nq-test encoded queries, using the bpr searcher. | ||
# NOTE(review): --binary-hits 1000 with --hits 100 and --rerank presumably means | ||
# 1000 binary-code candidates are reranked down to 100 final hits; confirm | ||
# against the pyserini.search.faiss options. | ||
python -m pyserini.search.faiss \ | ||
--index wikipedia-dpr-100w.bpr-single-nq \ | ||
--topics dpr-nq-test \ | ||
--encoded-queries bpr_single_nq-nq-test \ | ||
--output runs/run.bpr.rerank.nq-test.nq.hash.trec \ | ||
--batch-size 512 --threads 16 \ | ||
--hits 100 --binary-hits 1000 \ | ||
--searcher bpr --rerank | ||
|
||
# Convert the .trec run into a DPR retrieval run (.json), given the same topics | ||
# and the wikipedia-dpr index. | ||
python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ | ||
--index wikipedia-dpr \ | ||
--topics dpr-nq-test \ | ||
--input runs/run.bpr.rerank.nq-test.nq.hash.trec \ | ||
--output runs/run.bpr.rerank.nq-test.nq.hash.json | ||
|
||
# Evaluate the DPR-format run with --topk 20 100. | ||
python -m pyserini.eval.evaluate_dpr_retrieval \ | ||
--retrieval runs/run.bpr.rerank.nq-test.nq.hash.json \ | ||
--topk 20 100 | ||
|
||
# Print the current date/time again now that all commands have run. | ||
date |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#!/bin/sh | ||
|
||
# Print the current date/time before the retrieval and evaluation commands start. | ||
date | ||
|
||
# Search the msmarco-v1-passage.distilbert-dot-margin-mse-t2 index with the | ||
# msmarco-passage-dev-subset topics, using the | ||
# distilbert_kd-msmarco-passage-dev-subset encoded queries, and write the run in | ||
# msmarco output format. | ||
python -m pyserini.search.faiss \ | ||
--index msmarco-v1-passage.distilbert-dot-margin-mse-t2 \ | ||
--topics msmarco-passage-dev-subset \ | ||
--encoded-queries distilbert_kd-msmarco-passage-dev-subset \ | ||
--output runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.tsv \ | ||
--output-format msmarco \ | ||
--batch-size 512 --threads 16 | ||
|
||
# Evaluate the msmarco-format run against msmarco-passage-dev-subset. | ||
python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ | ||
runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.tsv | ||
|
||
# Convert the .tsv run to a .trec run; the .trec file is the input to trec_eval below. | ||
python -m pyserini.eval.convert_msmarco_run_to_trec_run \ | ||
--input runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.tsv \ | ||
--output runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.trec | ||
|
||
# Run trec_eval on the converted run, requesting the recall.1000 and map measures. | ||
python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ | ||
runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.trec | ||
|
||
### | ||
|
||
# Same index and topics as the encoded-queries run earlier in this script, but | ||
# the queries are encoded with | ||
# --encoder sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco instead | ||
# of --encoded-queries. The --output path is the same as in that earlier run. | ||
python -m pyserini.search.faiss \ | ||
--index msmarco-v1-passage.distilbert-dot-margin-mse-t2 \ | ||
--topics msmarco-passage-dev-subset \ | ||
--encoder sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco \ | ||
--output runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.tsv \ | ||
--output-format msmarco \ | ||
--batch-size 512 --threads 16 | ||
|
||
# Evaluate the msmarco-format run against msmarco-passage-dev-subset. | ||
python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ | ||
runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.tsv | ||
|
||
# Convert the .tsv run to a .trec run; the .trec file is the input to trec_eval below. | ||
python -m pyserini.eval.convert_msmarco_run_to_trec_run \ | ||
--input runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.tsv \ | ||
--output runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.trec | ||
|
||
# Run trec_eval on the converted run, requesting the recall.1000 and map measures. | ||
python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ | ||
runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.trec | ||
|
||
# Print the current date/time again now that all commands have run. | ||
date |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#!/bin/sh | ||
|
||
# Print the current date/time before the retrieval and evaluation commands start. | ||
date | ||
|
||
# Search the msmarco-v1-passage.distilbert-dot-tas_b-b256 index with the | ||
# msmarco-passage-dev-subset topics, using the | ||
# distilbert_tas_b-msmarco-passage-dev-subset encoded queries, and write the run | ||
# in msmarco output format. | ||
python -m pyserini.search.faiss \ | ||
--index msmarco-v1-passage.distilbert-dot-tas_b-b256 \ | ||
--topics msmarco-passage-dev-subset \ | ||
--encoded-queries distilbert_tas_b-msmarco-passage-dev-subset \ | ||
--output runs/run.msmarco-passage.distilbert-dot-tas_b-b256.tsv \ | ||
--output-format msmarco \ | ||
--batch-size 512 --threads 16 | ||
|
||
# Evaluate the msmarco-format run against msmarco-passage-dev-subset. | ||
python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ | ||
runs/run.msmarco-passage.distilbert-dot-tas_b-b256.tsv | ||
|
||
# Convert the .tsv run to a .trec run; the .trec file is the input to trec_eval below. | ||
python -m pyserini.eval.convert_msmarco_run_to_trec_run \ | ||
--input runs/run.msmarco-passage.distilbert-dot-tas_b-b256.tsv \ | ||
--output runs/run.msmarco-passage.distilbert-dot-tas_b-b256.trec | ||
|
||
# Run trec_eval on the converted run, requesting the recall.1000 and map measures. | ||
python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ | ||
runs/run.msmarco-passage.distilbert-dot-tas_b-b256.trec | ||
|
||
### | ||
|
||
# Same index and topics as the encoded-queries run earlier in this script, but | ||
# the queries are encoded with | ||
# --encoder sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco instead of | ||
# --encoded-queries. The --output path is the same as in that earlier run. | ||
python -m pyserini.search.faiss \ | ||
--index msmarco-v1-passage.distilbert-dot-tas_b-b256 \ | ||
--topics msmarco-passage-dev-subset \ | ||
--encoder sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco \ | ||
--output runs/run.msmarco-passage.distilbert-dot-tas_b-b256.tsv \ | ||
--output-format msmarco \ | ||
--batch-size 512 --threads 16 | ||
|
||
# Evaluate the msmarco-format run against msmarco-passage-dev-subset. | ||
python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ | ||
runs/run.msmarco-passage.distilbert-dot-tas_b-b256.tsv | ||
|
||
# Convert the .tsv run to a .trec run; the .trec file is the input to trec_eval below. | ||
python -m pyserini.eval.convert_msmarco_run_to_trec_run \ | ||
--input runs/run.msmarco-passage.distilbert-dot-tas_b-b256.tsv \ | ||
--output runs/run.msmarco-passage.distilbert-dot-tas_b-b256.trec | ||
|
||
# Run trec_eval on the converted run, requesting the recall.1000 and map measures. | ||
python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ | ||
runs/run.msmarco-passage.distilbert-dot-tas_b-b256.trec | ||
|
||
# Print the current date/time again now that all commands have run. | ||
date |
Oops, something went wrong.