Skip to content

Commit

Permalink
Refactoring (#2016)
Browse files Browse the repository at this point in the history
+ Gathered document_encoder_class_map and query_encoder_class_map
+ Fixed a bunch of "FutureWarning: clean_up_tokenization_spaces was not set..." warnings
+ Fixed "Some weights of the model checkpoint ... were not used when initializing" warnings
  • Loading branch information
lintool authored Oct 17, 2024
1 parent 75aea32 commit af2d3c5
Show file tree
Hide file tree
Showing 39 changed files with 2,635 additions and 158 deletions.
169 changes: 169 additions & 0 deletions bin/run-ance.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
#!/bin/sh
#
# Regression runs for the ANCE models using the Pyserini command-line modules.
#
# Four test collections are covered (MS MARCO Passage, MS MARCO Document,
# Natural Questions, Trivia QA). Each is run twice: first with pre-encoded
# queries (--encoded-queries), then with on-the-fly query encoding
# (--encoder). Both passes write to the same files under runs/.
#
# All paths are relative to the current working directory.

# Abort on the first failing command. Both passes reuse the same output
# paths, so without this a failed search in the second pass would let the
# evaluation steps silently score the stale run file left by the first pass.
set -e

# Every search below writes into runs/; nothing else in this script creates it.
# NOTE(review): the search module may create the directory itself -- confirm.
# Creating it here is harmless either way.
mkdir -p runs

#######################################
# MS MARCO Passage: search, MS MARCO eval, convert to TREC, trec_eval.
# Arguments: "$@" - query-source flags passed through to
#            pyserini.search.faiss (--encoded-queries NAME or --encoder NAME)
#######################################
ance_msmarco_passage() {
  python -m pyserini.search.faiss \
    --index msmarco-v1-passage.ance \
    --topics msmarco-passage-dev-subset \
    "$@" \
    --output runs/run.msmarco-passage.ance.tsv \
    --output-format msmarco \
    --batch-size 512 --threads 16

  python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \
    runs/run.msmarco-passage.ance.tsv

  python -m pyserini.eval.convert_msmarco_run_to_trec_run \
    --input runs/run.msmarco-passage.ance.tsv \
    --output runs/run.msmarco-passage.ance.trec

  python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \
    runs/run.msmarco-passage.ance.trec
}

#######################################
# MS MARCO Document (MaxP): search, MS MARCO doc eval, convert to TREC,
# trec_eval.
# Arguments: "$@" - query-source flags passed through to
#            pyserini.search.faiss (--encoded-queries NAME or --encoder NAME)
#######################################
ance_msmarco_doc() {
  python -m pyserini.search.faiss \
    --index msmarco-v1-doc.ance-maxp \
    --topics msmarco-doc-dev \
    "$@" \
    --output runs/run.msmarco-doc.passage.ance-maxp.txt \
    --output-format msmarco \
    --batch-size 512 --threads 16 \
    --hits 1000 --max-passage --max-passage-hits 100

  python -m pyserini.eval.msmarco_doc_eval \
    --judgments msmarco-doc-dev \
    --run runs/run.msmarco-doc.passage.ance-maxp.txt

  python -m pyserini.eval.convert_msmarco_run_to_trec_run \
    --input runs/run.msmarco-doc.passage.ance-maxp.txt \
    --output runs/run.msmarco-doc.passage.ance-maxp.trec

  python -m pyserini.eval.trec_eval -c -mrecall.100 -mmap msmarco-doc-dev \
    runs/run.msmarco-doc.passage.ance-maxp.trec
}

#######################################
# Open-domain QA over the wikipedia-dpr-100w.ance-multi index: search,
# convert the TREC run to a DPR retrieval run, then evaluate at top-20/100.
# Globals:   ance_qa_set, ance_qa_run (written; POSIX sh has no `local`)
# Arguments: $1   - dataset short name as used in the topic and run names
#                   ("nq" or "trivia")
#            rest - query-source flags passed through to
#                   pyserini.search.faiss
#######################################
ance_dpr_qa() {
  ance_qa_set=$1
  shift
  ance_qa_run="runs/run.ance.${ance_qa_set}-test.multi"

  python -m pyserini.search.faiss \
    --index wikipedia-dpr-100w.ance-multi \
    --topics "dpr-${ance_qa_set}-test" \
    "$@" \
    --output "${ance_qa_run}.trec" \
    --batch-size 512 --threads 16

  python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \
    --topics "dpr-${ance_qa_set}-test" \
    --index wikipedia-dpr \
    --input "${ance_qa_run}.trec" \
    --output "${ance_qa_run}.json"

  python -m pyserini.eval.evaluate_dpr_retrieval \
    --retrieval "${ance_qa_run}.json" \
    --topk 20 100
}

date

##
## Pass 1: pre-encoded queries
##

## MS MARCO Passage
ance_msmarco_passage --encoded-queries ance-msmarco-passage-dev-subset

## MS MARCO Document
ance_msmarco_doc --encoded-queries ance_maxp-msmarco-doc-dev

## Natural Questions (NQ)
ance_dpr_qa nq --encoded-queries ance_multi-nq-test

## Trivia QA
ance_dpr_qa trivia --encoded-queries ance_multi-trivia-test

##
## Pass 2: everything again, except with on-the-fly encoding
##

## MS MARCO Passage
ance_msmarco_passage --encoder castorini/ance-msmarco-passage

## MS MARCO Document
ance_msmarco_doc --encoder castorini/ance-msmarco-doc-maxp

## Natural Questions (NQ)
ance_dpr_qa nq --encoder castorini/ance-dpr-question-multi

## Trivia QA
ance_dpr_qa trivia --encoder castorini/ance-dpr-question-multi

date
24 changes: 24 additions & 0 deletions bin/run-bpr.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/sh
#
# Regression run for BPR on the NQ test topics using the Pyserini
# command-line modules: search with pre-encoded queries, convert the TREC run
# to a DPR retrieval run, then evaluate at top-20 and top-100.
#
# All paths are relative to the current working directory.

# Abort on the first failing command so that a failed search is not followed
# by conversion/evaluation of a missing or stale run file, and so the script's
# exit status reflects the failure.
set -e

# The search below writes into runs/; nothing else in this script creates it.
# NOTE(review): the search module may create the directory itself -- confirm.
# Creating it here is harmless either way.
mkdir -p runs

date

# Search: retrieve with the bpr searcher and rerank, writing a TREC run file.
python -m pyserini.search.faiss \
  --index wikipedia-dpr-100w.bpr-single-nq \
  --topics dpr-nq-test \
  --encoded-queries bpr_single_nq-nq-test \
  --output runs/run.bpr.rerank.nq-test.nq.hash.trec \
  --batch-size 512 --threads 16 \
  --hits 100 --binary-hits 1000 \
  --searcher bpr --rerank

# Convert the TREC run into the JSON format consumed by the DPR evaluator.
python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \
  --index wikipedia-dpr \
  --topics dpr-nq-test \
  --input runs/run.bpr.rerank.nq-test.nq.hash.trec \
  --output runs/run.bpr.rerank.nq-test.nq.hash.json

# Evaluate the converted run at cutoffs 20 and 100.
python -m pyserini.eval.evaluate_dpr_retrieval \
  --retrieval runs/run.bpr.rerank.nq-test.nq.hash.json \
  --topk 20 100

date
43 changes: 43 additions & 0 deletions bin/run-distillbert-kd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/sh
#
# Regression runs for the distilbert-dot-margin_mse-T2 model on the MS MARCO
# passage dev subset using the Pyserini command-line modules.
#
# The search + evaluation sequence is run twice: first with pre-encoded
# queries (--encoded-queries), then with on-the-fly query encoding
# (--encoder). Both passes write to the same files under runs/.
#
# All paths are relative to the current working directory.

# Abort on the first failing command. Both passes reuse the same output
# paths, so without this a failed search in the second pass would let the
# evaluation steps silently score the stale run file left by the first pass.
set -e

# The searches below write into runs/; nothing else in this script creates it.
# NOTE(review): the search module may create the directory itself -- confirm.
# Creating it here is harmless either way.
mkdir -p runs

# Common path prefix of the .tsv (MS MARCO format) and .trec run files.
KD_RUN=runs/run.msmarco-passage.distilbert-dot-margin_mse-t2

#######################################
# Search, MS MARCO eval, convert to TREC, trec_eval.
# Globals:   KD_RUN (read)
# Arguments: "$@" - query-source flags passed through to
#            pyserini.search.faiss (--encoded-queries NAME or --encoder NAME)
#######################################
kd_search_and_eval() {
  python -m pyserini.search.faiss \
    --index msmarco-v1-passage.distilbert-dot-margin-mse-t2 \
    --topics msmarco-passage-dev-subset \
    "$@" \
    --output "${KD_RUN}.tsv" \
    --output-format msmarco \
    --batch-size 512 --threads 16

  python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \
    "${KD_RUN}.tsv"

  python -m pyserini.eval.convert_msmarco_run_to_trec_run \
    --input "${KD_RUN}.tsv" \
    --output "${KD_RUN}.trec"

  python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \
    "${KD_RUN}.trec"
}

date

# Pass 1: pre-encoded queries.
kd_search_and_eval --encoded-queries distilbert_kd-msmarco-passage-dev-subset

###

# Pass 2: on-the-fly query encoding.
kd_search_and_eval --encoder sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco

date
43 changes: 43 additions & 0 deletions bin/run-distillbert-tasb.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/sh
#
# Regression runs for the distilbert-dot-tas_b-b256 model on the MS MARCO
# passage dev subset using the Pyserini command-line modules.
#
# The search + evaluation sequence is run twice: first with pre-encoded
# queries (--encoded-queries), then with on-the-fly query encoding
# (--encoder). Both passes write to the same files under runs/.
#
# All paths are relative to the current working directory.

# Abort on the first failing command. Both passes reuse the same output
# paths, so without this a failed search in the second pass would let the
# evaluation steps silently score the stale run file left by the first pass.
set -e

# The searches below write into runs/; nothing else in this script creates it.
# NOTE(review): the search module may create the directory itself -- confirm.
# Creating it here is harmless either way.
mkdir -p runs

# Common path prefix of the .tsv (MS MARCO format) and .trec run files.
TASB_RUN=runs/run.msmarco-passage.distilbert-dot-tas_b-b256

#######################################
# Search, MS MARCO eval, convert to TREC, trec_eval.
# Globals:   TASB_RUN (read)
# Arguments: "$@" - query-source flags passed through to
#            pyserini.search.faiss (--encoded-queries NAME or --encoder NAME)
#######################################
tasb_search_and_eval() {
  python -m pyserini.search.faiss \
    --index msmarco-v1-passage.distilbert-dot-tas_b-b256 \
    --topics msmarco-passage-dev-subset \
    "$@" \
    --output "${TASB_RUN}.tsv" \
    --output-format msmarco \
    --batch-size 512 --threads 16

  python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \
    "${TASB_RUN}.tsv"

  python -m pyserini.eval.convert_msmarco_run_to_trec_run \
    --input "${TASB_RUN}.tsv" \
    --output "${TASB_RUN}.trec"

  python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \
    "${TASB_RUN}.trec"
}

date

# Pass 1: pre-encoded queries.
tasb_search_and_eval --encoded-queries distilbert_tas_b-msmarco-passage-dev-subset

###

# Pass 2: on-the-fly query encoding.
tasb_search_and_eval --encoder sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco

date
Loading

0 comments on commit af2d3c5

Please sign in to comment.