From af2d3c52953b916e242142dbcf4799ecdb9abbee Mon Sep 17 00:00:00 2001 From: Jimmy Lin Date: Thu, 17 Oct 2024 13:34:50 -0400 Subject: [PATCH] Refactoring (#2016) + Gathered document_encoder_class_map and query_encoder_class_map + Fixed a bunch of "FutureWarning: clean_up_tokenization_spaces was not set..." warnings + Fixed "Some weights of the model checkpoint ... were not used when initializing" warnings --- bin/run-ance.sh | 169 +++++ bin/run-bpr.sh | 24 + bin/run-distillbert-kd.sh | 43 ++ bin/run-distillbert-tasb.sh | 43 ++ bin/run-dkrr.sh | 161 ++++ bin/run-dpr-encoded.sh | 319 ++++++++ bin/run-dpr-otf.sh | 319 ++++++++ bin/run-sbert.sh | 85 +++ bin/run-tct-encoded.sh | 150 ++++ bin/run-tct-otf.sh | 150 ++++ bin/run-tct2-encoded.sh | 105 +++ bin/run-tct2-otf.sh | 158 ++++ bin/run-vector-prf.sh | 696 ++++++++++++++++++ pyserini/encode/__init__.py | 31 + pyserini/encode/__main__.py | 34 +- pyserini/encode/_aggretriever.py | 6 +- pyserini/encode/_ance.py | 8 +- pyserini/encode/_arctic.py | 7 +- pyserini/encode/_auto.py | 14 +- pyserini/encode/_bpr.py | 3 +- pyserini/encode/_clip.py | 6 +- pyserini/encode/_cosdpr.py | 8 +- pyserini/encode/_dkrr.py | 3 +- pyserini/encode/_dpr.py | 29 +- pyserini/encode/_slim.py | 3 +- pyserini/encode/_splade.py | 3 +- pyserini/encode/_tct_colbert.py | 9 +- pyserini/encode/_tok_freq.py | 3 +- pyserini/encode/_unicoil.py | 2 +- pyserini/search/faiss/__main__.py | 25 +- pyserini/search/faiss/_searcher.py | 4 +- pyserini/search/lucene/__main__.py | 3 +- pyserini/search/lucene/irst/__main__.py | 10 +- pyserini/search/lucene/irst/_searcher.py | 5 +- pyserini/search/lucene/ltr/__main__.py | 3 +- pyserini/tokenize_json_collection.py | 4 +- scripts/jobs.docs-all.txt | 12 + .../{test_encoder.py => test_encode.py} | 24 +- tests/test_tokenization.py | 112 +-- 39 files changed, 2635 insertions(+), 158 deletions(-) create mode 100755 bin/run-ance.sh create mode 100755 bin/run-bpr.sh create mode 100755 bin/run-distillbert-kd.sh create mode 100755 
bin/run-distillbert-tasb.sh create mode 100755 bin/run-dkrr.sh create mode 100755 bin/run-dpr-encoded.sh create mode 100755 bin/run-dpr-otf.sh create mode 100755 bin/run-sbert.sh create mode 100755 bin/run-tct-encoded.sh create mode 100755 bin/run-tct-otf.sh create mode 100755 bin/run-tct2-encoded.sh create mode 100755 bin/run-tct2-otf.sh create mode 100755 bin/run-vector-prf.sh create mode 100644 scripts/jobs.docs-all.txt rename tests-optional/{test_encoder.py => test_encode.py} (94%) diff --git a/bin/run-ance.sh b/bin/run-ance.sh new file mode 100755 index 000000000..e81443707 --- /dev/null +++ b/bin/run-ance.sh @@ -0,0 +1,169 @@ +#!/bin/sh + +date + +## MS MARCO Passage + +python -m pyserini.search.faiss \ + --index msmarco-v1-passage.ance \ + --topics msmarco-passage-dev-subset \ + --encoded-queries ance-msmarco-passage-dev-subset \ + --output runs/run.msmarco-passage.ance.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.ance.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.ance.tsv \ + --output runs/run.msmarco-passage.ance.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.ance.trec + +## MS MARCO Document + +python -m pyserini.search.faiss \ + --index msmarco-v1-doc.ance-maxp \ + --topics msmarco-doc-dev \ + --encoded-queries ance_maxp-msmarco-doc-dev \ + --output runs/run.msmarco-doc.passage.ance-maxp.txt \ + --output-format msmarco \ + --batch-size 512 --threads 16 \ + --hits 1000 --max-passage --max-passage-hits 100 + +python -m pyserini.eval.msmarco_doc_eval \ + --judgments msmarco-doc-dev \ + --run runs/run.msmarco-doc.passage.ance-maxp.txt + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-doc.passage.ance-maxp.txt \ + --output runs/run.msmarco-doc.passage.ance-maxp.trec + 
+python -m pyserini.eval.trec_eval -c -mrecall.100 -mmap msmarco-doc-dev \ + runs/run.msmarco-doc.passage.ance-maxp.trec + +## Natural Questions (NQ) + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.ance-multi \ + --topics dpr-nq-test \ + --encoded-queries ance_multi-nq-test \ + --output runs/run.ance.nq-test.multi.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --topics dpr-nq-test \ + --index wikipedia-dpr \ + --input runs/run.ance.nq-test.multi.trec \ + --output runs/run.ance.nq-test.multi.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.ance.nq-test.multi.json \ + --topk 20 100 + +## Trivia QA + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.ance-multi \ + --topics dpr-trivia-test \ + --encoded-queries ance_multi-trivia-test \ + --output runs/run.ance.trivia-test.multi.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --topics dpr-trivia-test \ + --index wikipedia-dpr \ + --input runs/run.ance.trivia-test.multi.trec \ + --output runs/run.ance.trivia-test.multi.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.ance.trivia-test.multi.json \ + --topk 20 100 + +## +## Everything again, except with on-the-fly encoding +## + +## MS MARCO Passage + +python -m pyserini.search.faiss \ + --index msmarco-v1-passage.ance \ + --topics msmarco-passage-dev-subset \ + --encoder castorini/ance-msmarco-passage \ + --output runs/run.msmarco-passage.ance.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.ance.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.ance.tsv \ + --output runs/run.msmarco-passage.ance.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + 
runs/run.msmarco-passage.ance.trec + +## MS MARCO Document + +python -m pyserini.search.faiss \ + --index msmarco-v1-doc.ance-maxp \ + --topics msmarco-doc-dev \ + --encoder castorini/ance-msmarco-doc-maxp \ + --output runs/run.msmarco-doc.passage.ance-maxp.txt \ + --output-format msmarco \ + --batch-size 512 --threads 16 \ + --hits 1000 --max-passage --max-passage-hits 100 + +python -m pyserini.eval.msmarco_doc_eval \ + --judgments msmarco-doc-dev \ + --run runs/run.msmarco-doc.passage.ance-maxp.txt + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-doc.passage.ance-maxp.txt \ + --output runs/run.msmarco-doc.passage.ance-maxp.trec + +python -m pyserini.eval.trec_eval -c -mrecall.100 -mmap msmarco-doc-dev \ + runs/run.msmarco-doc.passage.ance-maxp.trec + +## Natural Questions (NQ) + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.ance-multi \ + --topics dpr-nq-test \ + --encoder castorini/ance-dpr-question-multi \ + --output runs/run.ance.nq-test.multi.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --topics dpr-nq-test \ + --index wikipedia-dpr \ + --input runs/run.ance.nq-test.multi.trec \ + --output runs/run.ance.nq-test.multi.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.ance.nq-test.multi.json \ + --topk 20 100 + +## Trivia QA + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.ance-multi \ + --topics dpr-trivia-test \ + --encoder castorini/ance-dpr-question-multi \ + --output runs/run.ance.trivia-test.multi.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --topics dpr-trivia-test \ + --index wikipedia-dpr \ + --input runs/run.ance.trivia-test.multi.trec \ + --output runs/run.ance.trivia-test.multi.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.ance.trivia-test.multi.json \ + --topk 20 100 + +date diff --git 
a/bin/run-bpr.sh b/bin/run-bpr.sh new file mode 100755 index 000000000..e7dd42a44 --- /dev/null +++ b/bin/run-bpr.sh @@ -0,0 +1,24 @@ +#!/bin/sh + +date + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.bpr-single-nq \ + --topics dpr-nq-test \ + --encoded-queries bpr_single_nq-nq-test \ + --output runs/run.bpr.rerank.nq-test.nq.hash.trec \ + --batch-size 512 --threads 16 \ + --hits 100 --binary-hits 1000 \ + --searcher bpr --rerank + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr \ + --topics dpr-nq-test \ + --input runs/run.bpr.rerank.nq-test.nq.hash.trec \ + --output runs/run.bpr.rerank.nq-test.nq.hash.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.bpr.rerank.nq-test.nq.hash.json \ + --topk 20 100 + +date diff --git a/bin/run-distillbert-kd.sh b/bin/run-distillbert-kd.sh new file mode 100755 index 000000000..4ac416f20 --- /dev/null +++ b/bin/run-distillbert-kd.sh @@ -0,0 +1,43 @@ +#!/bin/sh + +date + +python -m pyserini.search.faiss \ + --index msmarco-v1-passage.distilbert-dot-margin-mse-t2 \ + --topics msmarco-passage-dev-subset \ + --encoded-queries distilbert_kd-msmarco-passage-dev-subset \ + --output runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.tsv \ + --output runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.trec + +### + +python -m pyserini.search.faiss \ + --index msmarco-v1-passage.distilbert-dot-margin-mse-t2 \ + --topics msmarco-passage-dev-subset \ + --encoder 
sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco \ + --output runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.tsv \ + --output runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.distilbert-dot-margin_mse-t2.trec + +date diff --git a/bin/run-distillbert-tasb.sh b/bin/run-distillbert-tasb.sh new file mode 100755 index 000000000..37c40d3d7 --- /dev/null +++ b/bin/run-distillbert-tasb.sh @@ -0,0 +1,43 @@ +#!/bin/sh + +date + +python -m pyserini.search.faiss \ + --index msmarco-v1-passage.distilbert-dot-tas_b-b256 \ + --topics msmarco-passage-dev-subset \ + --encoded-queries distilbert_tas_b-msmarco-passage-dev-subset \ + --output runs/run.msmarco-passage.distilbert-dot-tas_b-b256.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.distilbert-dot-tas_b-b256.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.distilbert-dot-tas_b-b256.tsv \ + --output runs/run.msmarco-passage.distilbert-dot-tas_b-b256.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.distilbert-dot-tas_b-b256.trec + +### + +python -m pyserini.search.faiss \ + --index msmarco-v1-passage.distilbert-dot-tas_b-b256 \ + --topics msmarco-passage-dev-subset \ + --encoder sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco \ + --output runs/run.msmarco-passage.distilbert-dot-tas_b-b256.tsv \ + --output-format msmarco 
\ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.distilbert-dot-tas_b-b256.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.distilbert-dot-tas_b-b256.tsv \ + --output runs/run.msmarco-passage.distilbert-dot-tas_b-b256.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.distilbert-dot-tas_b-b256.trec + +date diff --git a/bin/run-dkrr.sh b/bin/run-dkrr.sh new file mode 100755 index 000000000..fdc4d144e --- /dev/null +++ b/bin/run-dkrr.sh @@ -0,0 +1,161 @@ +#!/bin/sh + +date + +## Natural Questions + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dkrr-nq \ + --topics dpr-nq-dev \ + --encoded-queries dkrr-dpr-nq-retriever-dpr-nq-dev \ + --output runs/run.dpr-dkrr-nq.dev.trec \ + --query-prefix question: \ + --batch-size 512 --threads 16 + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dkrr-nq \ + --topics nq-test \ + --encoded-queries dkrr-dpr-nq-retriever-nq-test \ + --output runs/run.dpr-dkrr-nq.test.trec \ + --query-prefix question: \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --topics dpr-nq-dev \ + --index wikipedia-dpr \ + --input runs/run.dpr-dkrr-nq.dev.trec \ + --output runs/run.dpr-dkrr-nq.dev.json + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --topics nq-test \ + --index wikipedia-dpr \ + --input runs/run.dpr-dkrr-nq.test.trec \ + --output runs/run.dpr-dkrr-nq.test.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.dpr-dkrr-nq.dev.json \ + --topk 5 20 100 500 1000 + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.dpr-dkrr-nq.test.json \ + --topk 5 20 100 500 1000 + +## TriviaQA (TQA) + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dkrr-tqa \ + --topics dpr-trivia-dev \ 
+ --encoded-queries dkrr-dpr-tqa-retriever-dpr-tqa-dev \ + --output runs/run.dpr-dkrr-trivia.dev.trec \ + --query-prefix question: \ + --batch-size 512 --threads 16 + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dkrr-tqa \ + --topics dpr-trivia-test \ + --encoded-queries dkrr-dpr-tqa-retriever-dpr-tqa-test \ + --output runs/run.dpr-dkrr-trivia.test.trec \ + --query-prefix question: \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --topics dpr-trivia-dev \ + --index wikipedia-dpr \ + --input runs/run.dpr-dkrr-trivia.dev.trec \ + --output runs/run.dpr-dkrr-trivia.dev.json + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --topics dpr-trivia-test \ + --index wikipedia-dpr \ + --input runs/run.dpr-dkrr-trivia.test.trec \ + --output runs/run.dpr-dkrr-trivia.test.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.dpr-dkrr-trivia.dev.json \ + --topk 5 20 100 500 1000 + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.dpr-dkrr-trivia.test.json \ + --topk 5 20 100 500 1000 + +## +## Everything again, except with on-the-fly encoding +## + +## Natural Questions + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dkrr-nq \ + --topics dpr-nq-dev \ + --encoder castorini/dkrr-dpr-nq-retriever \ + --output runs/run.dpr-dkrr-nq.dev.trec \ + --query-prefix question: \ + --batch-size 512 --threads 16 + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dkrr-nq \ + --topics nq-test \ + --encoder castorini/dkrr-dpr-nq-retriever \ + --output runs/run.dpr-dkrr-nq.test.trec \ + --query-prefix question: \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --topics dpr-nq-dev \ + --index wikipedia-dpr \ + --input runs/run.dpr-dkrr-nq.dev.trec \ + --output runs/run.dpr-dkrr-nq.dev.json + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --topics nq-test \ + 
--index wikipedia-dpr \ + --input runs/run.dpr-dkrr-nq.test.trec \ + --output runs/run.dpr-dkrr-nq.test.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.dpr-dkrr-nq.dev.json \ + --topk 5 20 100 500 1000 + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.dpr-dkrr-nq.test.json \ + --topk 5 20 100 500 1000 + +## TriviaQA (TQA) + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dkrr-tqa \ + --topics dpr-trivia-dev \ + --encoder castorini/dkrr-dpr-tqa-retriever \ + --output runs/run.dpr-dkrr-trivia.dev.trec \ + --query-prefix question: \ + --batch-size 512 --threads 16 + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dkrr-tqa \ + --topics dpr-trivia-test \ + --encoder castorini/dkrr-dpr-tqa-retriever \ + --output runs/run.dpr-dkrr-trivia.test.trec \ + --query-prefix question: \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --topics dpr-trivia-dev \ + --index wikipedia-dpr \ + --input runs/run.dpr-dkrr-trivia.dev.trec \ + --output runs/run.dpr-dkrr-trivia.dev.json + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --topics dpr-trivia-test \ + --index wikipedia-dpr \ + --input runs/run.dpr-dkrr-trivia.test.trec \ + --output runs/run.dpr-dkrr-trivia.test.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.dpr-dkrr-trivia.dev.json \ + --topk 5 20 100 500 1000 + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.dpr-dkrr-trivia.test.json \ + --topk 5 20 100 500 1000 + +date diff --git a/bin/run-dpr-encoded.sh b/bin/run-dpr-encoded.sh new file mode 100755 index 000000000..d869044be --- /dev/null +++ b/bin/run-dpr-encoded.sh @@ -0,0 +1,319 @@ +#!/bin/sh + +date + +## Natural Questions (NQ) with DPR-Multi + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dpr-multi \ + --topics dpr-nq-test \ + --encoded-queries dpr_multi-nq-test \ + --output 
runs/run.encoded.dpr.nq-test.multi.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-nq-test \ + --input runs/run.encoded.dpr.nq-test.multi.trec \ + --output runs/run.encoded.dpr.nq-test.multi.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.encoded.dpr.nq-test.multi.json \ + --topk 20 100 + +python -m pyserini.search.lucene \ + --index wikipedia-dpr-100w \ + --topics dpr-nq-test \ + --output runs/run.encoded.dpr.nq-test.bm25.trec + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-nq-test \ + --input runs/run.encoded.dpr.nq-test.bm25.trec \ + --output runs/run.encoded.dpr.nq-test.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.encoded.dpr.nq-test.bm25.json \ + --topk 20 100 + +python -m pyserini.search.hybrid \ + dense --index wikipedia-dpr-100w.dpr-multi \ + --encoded-queries dpr_multi-nq-test \ + sparse --index wikipedia-dpr-100w \ + fusion --alpha 1.3 \ + run --topics dpr-nq-test \ + --output runs/run.encoded.dpr.nq-test.multi.bm25.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-nq-test \ + --input runs/run.encoded.dpr.nq-test.multi.bm25.trec \ + --output runs/run.encoded.dpr.nq-test.multi.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.encoded.dpr.nq-test.multi.bm25.json \ + --topk 20 100 + + +## TriviaQA with DPR-Multi + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dpr-multi \ + --topics dpr-trivia-test \ + --encoded-queries dpr_multi-trivia-test \ + --output runs/run.encoded.dpr.trivia-test.multi.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-trivia-test \ + --input 
runs/run.encoded.dpr.trivia-test.multi.trec \ + --output runs/run.encoded.dpr.trivia-test.multi.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.encoded.dpr.trivia-test.multi.json \ + --topk 20 100 + +python -m pyserini.search.lucene \ + --index wikipedia-dpr-100w \ + --topics dpr-trivia-test \ + --output runs/run.encoded.dpr.trivia-test.bm25.trec + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-trivia-test \ + --input runs/run.encoded.dpr.trivia-test.bm25.trec \ + --output runs/run.encoded.dpr.trivia-test.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.encoded.dpr.trivia-test.bm25.json \ + --topk 20 100 + +python -m pyserini.search.hybrid \ + dense --index wikipedia-dpr-100w.dpr-multi \ + --encoded-queries dpr_multi-trivia-test \ + sparse --index wikipedia-dpr-100w \ + fusion --alpha 0.95 \ + run --topics dpr-trivia-test \ + --output runs/run.encoded.dpr.trivia-test.multi.bm25.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-trivia-test \ + --input runs/run.encoded.dpr.trivia-test.multi.bm25.trec \ + --output runs/run.encoded.dpr.trivia-test.multi.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.encoded.dpr.trivia-test.multi.bm25.json \ + --topk 20 100 + + +## WebQuestions (WQ) with DPR-Multi + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dpr-multi \ + --topics dpr-wq-test \ + --encoded-queries dpr_multi-wq-test \ + --output runs/run.encoded.dpr.wq-test.multi.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-wq-test \ + --input runs/run.encoded.dpr.wq-test.multi.trec \ + --output runs/run.encoded.dpr.wq-test.multi.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + 
--retrieval runs/run.encoded.dpr.wq-test.multi.json \ + --topk 20 100 + +python -m pyserini.search.lucene \ + --index wikipedia-dpr-100w \ + --topics dpr-wq-test \ + --output runs/run.encoded.dpr.wq-test.bm25.trec + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-wq-test \ + --input runs/run.encoded.dpr.wq-test.bm25.trec \ + --output runs/run.encoded.dpr.wq-test.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.encoded.dpr.wq-test.bm25.json \ + --topk 20 100 + +python -m pyserini.search.hybrid \ + dense --index wikipedia-dpr-100w.dpr-multi \ + --encoded-queries dpr_multi-wq-test \ + sparse --index wikipedia-dpr-100w \ + fusion --alpha 0.95 \ + run --topics dpr-wq-test \ + --output runs/run.encoded.dpr.wq-test.multi.bm25.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-wq-test \ + --input runs/run.encoded.dpr.wq-test.multi.bm25.trec \ + --output runs/run.encoded.dpr.wq-test.multi.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.encoded.dpr.wq-test.multi.bm25.json \ + --topk 20 100 + + +## CuratedTREC with DPR-Multi + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dpr-multi \ + --topics dpr-curated-test \ + --encoded-queries dpr_multi-curated-test \ + --output runs/run.encoded.dpr.curated-test.multi.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-curated-test \ + --input runs/run.encoded.dpr.curated-test.multi.trec \ + --output runs/run.encoded.dpr.curated-test.multi.json \ + --regex + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.encoded.dpr.curated-test.multi.json \ + --topk 20 100 \ + --regex + +python -m pyserini.search.lucene \ + --index wikipedia-dpr-100w \ + --topics 
dpr-curated-test \ + --output runs/run.encoded.dpr.curated-test.bm25.trec + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-curated-test \ + --input runs/run.encoded.dpr.curated-test.bm25.trec \ + --output runs/run.encoded.dpr.curated-test.bm25.json \ + --regex + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.encoded.dpr.curated-test.bm25.json \ + --topk 20 100 \ + --regex + +python -m pyserini.search.hybrid \ + dense --index wikipedia-dpr-100w.dpr-multi \ + --encoded-queries dpr_multi-curated-test \ + sparse --index wikipedia-dpr-100w \ + fusion --alpha 1.05 \ + run --topics dpr-curated-test \ + --output runs/run.encoded.dpr.curated-test.multi.bm25.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-curated-test \ + --input runs/run.encoded.dpr.curated-test.multi.bm25.trec \ + --output runs/run.encoded.dpr.curated-test.multi.bm25.json \ + --regex + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.encoded.dpr.curated-test.multi.bm25.json \ + --topk 20 100 \ + --regex + + +## SQuAD with DPR-Multi + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dpr-multi \ + --topics dpr-squad-test \ + --encoded-queries dpr_multi-squad-test \ + --output runs/run.encoded.dpr.squad-test.multi.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-squad-test \ + --input runs/run.encoded.dpr.squad-test.multi.trec \ + --output runs/run.encoded.dpr.squad-test.multi.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.encoded.dpr.squad-test.multi.json \ + --topk 20 100 + +python -m pyserini.search.lucene \ + --index wikipedia-dpr-100w \ + --topics dpr-squad-test \ + --output runs/run.encoded.dpr.squad-test.bm25.trec + +python -m 
pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-squad-test \ + --input runs/run.encoded.dpr.squad-test.bm25.trec \ + --output runs/run.encoded.dpr.squad-test.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.encoded.dpr.squad-test.bm25.json \ + --topk 20 100 + +python -m pyserini.search.hybrid \ + dense --index wikipedia-dpr-100w.dpr-multi \ + --encoded-queries dpr_multi-squad-test \ + sparse --index wikipedia-dpr-100w \ + fusion --alpha 2.00 \ + run --topics dpr-squad-test \ + --output runs/run.encoded.dpr.squad-test.multi.bm25.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-squad-test \ + --input runs/run.encoded.dpr.squad-test.multi.bm25.trec \ + --output runs/run.encoded.dpr.squad-test.multi.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.encoded.dpr.squad-test.multi.bm25.json \ + --topk 20 100 + + +## Natural Questions (NQ) with DPR-Single + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dpr-single-nq \ + --topics dpr-nq-test \ + --encoded-queries dpr_single_nq-nq-test \ + --output runs/run.encoded.dpr.nq-test.single.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-nq-test \ + --input runs/run.encoded.dpr.nq-test.single.trec \ + --output runs/run.encoded.dpr.nq-test.single.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.encoded.dpr.nq-test.single.json \ + --topk 20 100 + +python -m pyserini.search.hybrid \ + dense --index wikipedia-dpr-100w.dpr-single-nq \ + --encoded-queries dpr_single_nq-nq-test \ + sparse --index wikipedia-dpr-100w \ + fusion --alpha 1.2 \ + run --topics dpr-nq-test \ + --output runs/run.encoded.dpr.nq-test.single.bm25.trec \ + --batch-size 512 --threads 16 + +python 
-m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --topics dpr-nq-test \ + --index wikipedia-dpr-100w \ + --input runs/run.encoded.dpr.nq-test.single.bm25.trec \ + --output runs/run.encoded.dpr.nq-test.single.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.encoded.dpr.nq-test.single.bm25.json \ + --topk 20 100 + +date diff --git a/bin/run-dpr-otf.sh b/bin/run-dpr-otf.sh new file mode 100755 index 000000000..abeae5a02 --- /dev/null +++ b/bin/run-dpr-otf.sh @@ -0,0 +1,319 @@ +#!/bin/sh + +date + +## Natural Questions (NQ) with DPR-Multi + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dpr-multi \ + --topics dpr-nq-test \ + --encoder facebook/dpr-question_encoder-multiset-base \ + --output runs/run.otf.dpr.nq-test.multi.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-nq-test \ + --input runs/run.otf.dpr.nq-test.multi.trec \ + --output runs/run.otf.dpr.nq-test.multi.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.otf.dpr.nq-test.multi.json \ + --topk 20 100 + +python -m pyserini.search.lucene \ + --index wikipedia-dpr-100w \ + --topics dpr-nq-test \ + --output runs/run.otf.dpr.nq-test.bm25.trec + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-nq-test \ + --input runs/run.otf.dpr.nq-test.bm25.trec \ + --output runs/run.otf.dpr.nq-test.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.otf.dpr.nq-test.bm25.json \ + --topk 20 100 + +python -m pyserini.search.hybrid \ + dense --index wikipedia-dpr-100w.dpr-multi \ + --encoder facebook/dpr-question_encoder-multiset-base \ + sparse --index wikipedia-dpr-100w \ + fusion --alpha 1.3 \ + run --topics dpr-nq-test \ + --output runs/run.otf.dpr.nq-test.multi.bm25.trec \ + --batch-size 512 --threads 16 + +python -m 
pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-nq-test \ + --input runs/run.otf.dpr.nq-test.multi.bm25.trec \ + --output runs/run.otf.dpr.nq-test.multi.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.otf.dpr.nq-test.multi.bm25.json \ + --topk 20 100 + + +## TriviaQA with DPR-Multi + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dpr-multi \ + --topics dpr-trivia-test \ + --encoder facebook/dpr-question_encoder-multiset-base \ + --output runs/run.otf.dpr.trivia-test.multi.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-trivia-test \ + --input runs/run.otf.dpr.trivia-test.multi.trec \ + --output runs/run.otf.dpr.trivia-test.multi.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.otf.dpr.trivia-test.multi.json \ + --topk 20 100 + +python -m pyserini.search.lucene \ + --index wikipedia-dpr-100w \ + --topics dpr-trivia-test \ + --output runs/run.otf.dpr.trivia-test.bm25.trec + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-trivia-test \ + --input runs/run.otf.dpr.trivia-test.bm25.trec \ + --output runs/run.otf.dpr.trivia-test.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.otf.dpr.trivia-test.bm25.json \ + --topk 20 100 + +python -m pyserini.search.hybrid \ + dense --index wikipedia-dpr-100w.dpr-multi \ + --encoder facebook/dpr-question_encoder-multiset-base \ + sparse --index wikipedia-dpr-100w \ + fusion --alpha 0.95 \ + run --topics dpr-trivia-test \ + --output runs/run.otf.dpr.trivia-test.multi.bm25.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-trivia-test \ + --input runs/run.otf.dpr.trivia-test.multi.bm25.trec \ + --output 
runs/run.otf.dpr.trivia-test.multi.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.otf.dpr.trivia-test.multi.bm25.json \ + --topk 20 100 + + +## WebQuestions (WQ) with DPR-Multi + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dpr-multi \ + --topics dpr-wq-test \ + --encoder facebook/dpr-question_encoder-multiset-base \ + --output runs/run.otf.dpr.wq-test.multi.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-wq-test \ + --input runs/run.otf.dpr.wq-test.multi.trec \ + --output runs/run.otf.dpr.wq-test.multi.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.otf.dpr.wq-test.multi.json \ + --topk 20 100 + +python -m pyserini.search.lucene \ + --index wikipedia-dpr-100w \ + --topics dpr-wq-test \ + --output runs/run.otf.dpr.wq-test.bm25.trec + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-wq-test \ + --input runs/run.otf.dpr.wq-test.bm25.trec \ + --output runs/run.otf.dpr.wq-test.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.otf.dpr.wq-test.bm25.json \ + --topk 20 100 + +python -m pyserini.search.hybrid \ + dense --index wikipedia-dpr-100w.dpr-multi \ + --encoder facebook/dpr-question_encoder-multiset-base \ + sparse --index wikipedia-dpr-100w \ + fusion --alpha 0.95 \ + run --topics dpr-wq-test \ + --output runs/run.otf.dpr.wq-test.multi.bm25.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-wq-test \ + --input runs/run.otf.dpr.wq-test.multi.bm25.trec \ + --output runs/run.otf.dpr.wq-test.multi.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.otf.dpr.wq-test.multi.bm25.json \ + --topk 20 100 + + +## CuratedTREC with DPR-Multi + +python -m 
pyserini.search.faiss \ + --index wikipedia-dpr-100w.dpr-multi \ + --topics dpr-curated-test \ + --encoder facebook/dpr-question_encoder-multiset-base \ + --output runs/run.otf.dpr.curated-test.multi.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-curated-test \ + --input runs/run.otf.dpr.curated-test.multi.trec \ + --output runs/run.otf.dpr.curated-test.multi.json \ + --regex + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.otf.dpr.curated-test.multi.json \ + --topk 20 100 \ + --regex + +python -m pyserini.search.lucene \ + --index wikipedia-dpr-100w \ + --topics dpr-curated-test \ + --output runs/run.otf.dpr.curated-test.bm25.trec + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-curated-test \ + --input runs/run.otf.dpr.curated-test.bm25.trec \ + --output runs/run.otf.dpr.curated-test.bm25.json \ + --regex + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.otf.dpr.curated-test.bm25.json \ + --topk 20 100 \ + --regex + +python -m pyserini.search.hybrid \ + dense --index wikipedia-dpr-100w.dpr-multi \ + --encoder facebook/dpr-question_encoder-multiset-base \ + sparse --index wikipedia-dpr-100w \ + fusion --alpha 1.05 \ + run --topics dpr-curated-test \ + --output runs/run.otf.dpr.curated-test.multi.bm25.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-curated-test \ + --input runs/run.otf.dpr.curated-test.multi.bm25.trec \ + --output runs/run.otf.dpr.curated-test.multi.bm25.json \ + --regex + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.otf.dpr.curated-test.multi.bm25.json \ + --topk 20 100 \ + --regex + + +## SQuAD with DPR-Multi + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dpr-multi \ + --topics 
dpr-squad-test \ + --encoder facebook/dpr-question_encoder-multiset-base \ + --output runs/run.otf.dpr.squad-test.multi.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-squad-test \ + --input runs/run.otf.dpr.squad-test.multi.trec \ + --output runs/run.otf.dpr.squad-test.multi.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.otf.dpr.squad-test.multi.json \ + --topk 20 100 + +python -m pyserini.search.lucene \ + --index wikipedia-dpr-100w \ + --topics dpr-squad-test \ + --output runs/run.otf.dpr.squad-test.bm25.trec + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-squad-test \ + --input runs/run.otf.dpr.squad-test.bm25.trec \ + --output runs/run.otf.dpr.squad-test.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.otf.dpr.squad-test.bm25.json \ + --topk 20 100 + +python -m pyserini.search.hybrid \ + dense --index wikipedia-dpr-100w.dpr-multi \ + --encoder facebook/dpr-question_encoder-multiset-base \ + sparse --index wikipedia-dpr-100w \ + fusion --alpha 2.00 \ + run --topics dpr-squad-test \ + --output runs/run.otf.dpr.squad-test.multi.bm25.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-squad-test \ + --input runs/run.otf.dpr.squad-test.multi.bm25.trec \ + --output runs/run.otf.dpr.squad-test.multi.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.otf.dpr.squad-test.multi.bm25.json \ + --topk 20 100 + + +## Natural Questions (NQ) with DPR-Single + +python -m pyserini.search.faiss \ + --index wikipedia-dpr-100w.dpr-single-nq \ + --topics dpr-nq-test \ + --encoder facebook/dpr-question_encoder-single-nq-base \ + --output runs/run.otf.dpr.nq-test.single.trec \ + --batch-size 512 --threads 16 + +python 
-m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --index wikipedia-dpr-100w \ + --topics dpr-nq-test \ + --input runs/run.otf.dpr.nq-test.single.trec \ + --output runs/run.otf.dpr.nq-test.single.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.otf.dpr.nq-test.single.json \ + --topk 20 100 + +python -m pyserini.search.hybrid \ + dense --index wikipedia-dpr-100w.dpr-single-nq \ + --encoder facebook/dpr-question_encoder-single-nq-base \ + sparse --index wikipedia-dpr-100w \ + fusion --alpha 1.2 \ + run --topics dpr-nq-test \ + --output runs/run.otf.dpr.nq-test.single.bm25.trec \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \ + --topics dpr-nq-test \ + --index wikipedia-dpr-100w \ + --input runs/run.otf.dpr.nq-test.single.bm25.trec \ + --output runs/run.otf.dpr.nq-test.single.bm25.json + +python -m pyserini.eval.evaluate_dpr_retrieval \ + --retrieval runs/run.otf.dpr.nq-test.single.bm25.json \ + --topk 20 100 + +date diff --git a/bin/run-sbert.sh b/bin/run-sbert.sh new file mode 100755 index 000000000..b3f4b3b59 --- /dev/null +++ b/bin/run-sbert.sh @@ -0,0 +1,85 @@ +#!/bin/sh + +date + +python -m pyserini.search.faiss \ + --index msmarco-v1-passage.sbert \ + --topics msmarco-passage-dev-subset \ + --encoded-queries sbert-msmarco-passage-dev-subset \ + --output runs/run.msmarco-passage.sbert.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval \ + msmarco-passage-dev-subset runs/run.msmarco-passage.sbert.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.sbert.tsv \ + --output runs/run.msmarco-passage.sbert.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.sbert.trec + +python -m pyserini.search.hybrid \ + dense --index msmarco-v1-passage.sbert \ + --encoded-queries sbert-msmarco-passage-dev-subset \ 
+ sparse --index msmarco-passage \ + fusion --alpha 0.015 \ + run --topics msmarco-passage-dev-subset \ + --output runs/run.msmarco-passage.sbert.bm25.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval \ + msmarco-passage-dev-subset runs/run.msmarco-passage.sbert.bm25.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.sbert.bm25.tsv \ + --output runs/run.msmarco-passage.sbert.bm25.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.sbert.bm25.trec + +## +## Everything again, except with on-the-fly encoding +## + +python -m pyserini.search.faiss \ + --index msmarco-v1-passage.sbert \ + --topics msmarco-passage-dev-subset \ + --encoder sentence-transformers/msmarco-distilbert-base-v3 \ + --output runs/run.msmarco-passage.sbert.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval \ + msmarco-passage-dev-subset runs/run.msmarco-passage.sbert.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.sbert.tsv \ + --output runs/run.msmarco-passage.sbert.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.sbert.trec + +python -m pyserini.search.hybrid \ + dense --index msmarco-v1-passage.sbert \ + --encoder sentence-transformers/msmarco-distilbert-base-v3 \ + sparse --index msmarco-passage \ + fusion --alpha 0.015 \ + run --topics msmarco-passage-dev-subset \ + --output runs/run.msmarco-passage.sbert.bm25.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval \ + msmarco-passage-dev-subset runs/run.msmarco-passage.sbert.bm25.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.sbert.bm25.tsv \ + --output 
runs/run.msmarco-passage.sbert.bm25.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.sbert.bm25.trec + +date diff --git a/bin/run-tct-encoded.sh b/bin/run-tct-encoded.sh new file mode 100755 index 000000000..59f9b7896 --- /dev/null +++ b/bin/run-tct-encoded.sh @@ -0,0 +1,150 @@ +#!/bin/sh + +date + +## MS MARCO Passage Ranking + +python -m pyserini.search.faiss \ + --index msmarco-v1-passage.tct_colbert \ + --topics msmarco-passage-dev-subset \ + --encoded-queries tct_colbert-msmarco-passage-dev-subset \ + --output runs/run.msmarco-passage.tct_colbert.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.tct_colbert.tsv \ + --output runs/run.msmarco-passage.tct_colbert.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert.trec + +python -m pyserini.search.faiss \ + --index msmarco-v1-passage.tct_colbert.hnsw \ + --topics msmarco-passage-dev-subset \ + --encoded-queries tct_colbert-msmarco-passage-dev-subset \ + --output runs/run.msmarco-passage.tct_colbert.hnsw.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert.hnsw.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.tct_colbert.hnsw.tsv \ + --output runs/run.msmarco-passage.tct_colbert.hnsw.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert.hnsw.trec + +python -m pyserini.search.hybrid \ + dense --index msmarco-v1-passage.tct_colbert \ + --encoded-queries 
tct_colbert-msmarco-passage-dev-subset \ + sparse --index msmarco-v1-passage \ + fusion --alpha 0.12 \ + run --topics msmarco-passage-dev-subset \ + --output runs/run.msmarco-passage.tct_colbert.bm25.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert.bm25.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.tct_colbert.bm25.tsv \ + --output runs/run.msmarco-passage.tct_colbert.bm25.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert.bm25.trec + +python -m pyserini.search.hybrid \ + dense --index msmarco-v1-passage.tct_colbert \ + --encoded-queries tct_colbert-msmarco-passage-dev-subset \ + sparse --index msmarco-v1-passage.d2q-t5 \ + fusion --alpha 0.22 \ + run --topics msmarco-passage-dev-subset \ + --output runs/run.msmarco-passage.tct_colbert.d2q-t5.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert.d2q-t5.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.tct_colbert.d2q-t5.tsv \ + --output runs/run.msmarco-passage.tct_colbert.d2q-t5.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert.d2q-t5.trec + + +## MS MARCO Document Ranking + +python -m pyserini.search.faiss \ + --index msmarco-v1-doc.tct_colbert \ + --topics msmarco-doc-dev \ + --encoded-queries tct_colbert-msmarco-doc-dev \ + --output runs/run.msmarco-doc.passage.tct_colbert.txt \ + --output-format msmarco \ + --batch-size 512 --threads 16 \ + --hits 1000 --max-passage --max-passage-hits 100 + +python -m pyserini.eval.msmarco_doc_eval \ + --judgments msmarco-doc-dev \ + --run 
runs/run.msmarco-doc.passage.tct_colbert.txt + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-doc.passage.tct_colbert.txt \ + --output runs/run.msmarco-doc.passage.tct_colbert.trec + +python -m pyserini.eval.trec_eval -c -mrecall.100 -mmap msmarco-doc-dev \ + runs/run.msmarco-doc.passage.tct_colbert.trec + +python -m pyserini.search.hybrid \ + dense --index msmarco-v1-doc.tct_colbert \ + --encoded-queries tct_colbert-msmarco-doc-dev \ + sparse --index msmarco-v1-doc-segmented \ + fusion --alpha 0.25 \ + run --topics msmarco-doc-dev \ + --output runs/run.msmarco-doc.tct_colbert.bm25.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 \ + --hits 1000 --max-passage --max-passage-hits 100 + +python -m pyserini.eval.msmarco_doc_eval \ + --judgments msmarco-doc-dev \ + --run runs/run.msmarco-doc.tct_colbert.bm25.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-doc.tct_colbert.bm25.tsv \ + --output runs/run.msmarco-doc.tct_colbert.bm25.trec + +python -m pyserini.eval.trec_eval -c -mrecall.100 -mmap msmarco-doc-dev \ + runs/run.msmarco-doc.tct_colbert.bm25.trec + +python -m pyserini.search.hybrid \ + dense --index msmarco-v1-doc.tct_colbert \ + --encoded-queries tct_colbert-msmarco-doc-dev \ + sparse --index msmarco-v1-doc-segmented.d2q-t5 \ + fusion --alpha 0.32 \ + run --topics msmarco-doc-dev \ + --output runs/run.msmarco-doc.tct_colbert.d2q-t5.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 \ + --hits 1000 --max-passage --max-passage-hits 100 + +python -m pyserini.eval.msmarco_doc_eval \ + --judgments msmarco-doc-dev \ + --run runs/run.msmarco-doc.tct_colbert.d2q-t5.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-doc.tct_colbert.d2q-t5.tsv \ + --output runs/run.msmarco-doc.tct_colbert.d2q-t5.trec + +python -m pyserini.eval.trec_eval -c -mrecall.100 -mmap msmarco-doc-dev \ + 
runs/run.msmarco-doc.tct_colbert.d2q-t5.trec + +date diff --git a/bin/run-tct-otf.sh b/bin/run-tct-otf.sh new file mode 100755 index 000000000..b08c6ac1c --- /dev/null +++ b/bin/run-tct-otf.sh @@ -0,0 +1,150 @@ +#!/bin/sh + +date + +## MS MARCO Passage Ranking + +python -m pyserini.search.faiss \ + --index msmarco-v1-passage.tct_colbert \ + --topics msmarco-passage-dev-subset \ + --encoder castorini/tct_colbert-msmarco \ + --output runs/run.msmarco-passage.tct_colbert.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.tct_colbert.tsv \ + --output runs/run.msmarco-passage.tct_colbert.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert.trec + +python -m pyserini.search.faiss \ + --index msmarco-v1-passage.tct_colbert.hnsw \ + --topics msmarco-passage-dev-subset \ + --encoder castorini/tct_colbert-msmarco \ + --output runs/run.msmarco-passage.tct_colbert.hnsw.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert.hnsw.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.tct_colbert.hnsw.tsv \ + --output runs/run.msmarco-passage.tct_colbert.hnsw.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert.hnsw.trec + +python -m pyserini.search.hybrid \ + dense --index msmarco-v1-passage.tct_colbert \ + --encoder castorini/tct_colbert-msmarco \ + sparse --index msmarco-v1-passage \ + fusion --alpha 0.12 \ + run --topics msmarco-passage-dev-subset \ + --output runs/run.msmarco-passage.tct_colbert.bm25.tsv \ + 
--output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert.bm25.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.tct_colbert.bm25.tsv \ + --output runs/run.msmarco-passage.tct_colbert.bm25.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert.bm25.trec + +python -m pyserini.search.hybrid \ + dense --index msmarco-v1-passage.tct_colbert \ + --encoder castorini/tct_colbert-msmarco \ + sparse --index msmarco-v1-passage.d2q-t5 \ + fusion --alpha 0.22 \ + run --topics msmarco-passage-dev-subset \ + --output runs/run.msmarco-passage.tct_colbert.d2q-t5.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert.d2q-t5.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.tct_colbert.d2q-t5.tsv \ + --output runs/run.msmarco-passage.tct_colbert.d2q-t5.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert.d2q-t5.trec + + +## MS MARCO Document Ranking + +python -m pyserini.search.faiss \ + --index msmarco-v1-doc.tct_colbert \ + --topics msmarco-doc-dev \ + --encoder castorini/tct_colbert-msmarco \ + --output runs/run.msmarco-doc.passage.tct_colbert.txt \ + --output-format msmarco \ + --batch-size 512 --threads 16 \ + --hits 1000 --max-passage --max-passage-hits 100 + +python -m pyserini.eval.msmarco_doc_eval \ + --judgments msmarco-doc-dev \ + --run runs/run.msmarco-doc.passage.tct_colbert.txt + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-doc.passage.tct_colbert.txt \ + --output runs/run.msmarco-doc.passage.tct_colbert.trec + +python -m 
pyserini.eval.trec_eval -c -mrecall.100 -mmap msmarco-doc-dev \ + runs/run.msmarco-doc.passage.tct_colbert.trec + +python -m pyserini.search.hybrid \ + dense --index msmarco-v1-doc.tct_colbert \ + --encoder castorini/tct_colbert-msmarco \ + sparse --index msmarco-v1-doc-segmented \ + fusion --alpha 0.25 \ + run --topics msmarco-doc-dev \ + --output runs/run.msmarco-doc.tct_colbert.bm25.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 \ + --hits 1000 --max-passage --max-passage-hits 100 + +python -m pyserini.eval.msmarco_doc_eval \ + --judgments msmarco-doc-dev \ + --run runs/run.msmarco-doc.tct_colbert.bm25.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-doc.tct_colbert.bm25.tsv \ + --output runs/run.msmarco-doc.tct_colbert.bm25.trec + +python -m pyserini.eval.trec_eval -c -mrecall.100 -mmap msmarco-doc-dev \ + runs/run.msmarco-doc.tct_colbert.bm25.trec + +python -m pyserini.search.hybrid \ + dense --index msmarco-v1-doc.tct_colbert \ + --encoder castorini/tct_colbert-msmarco \ + sparse --index msmarco-v1-doc-segmented.d2q-t5 \ + fusion --alpha 0.32 \ + run --topics msmarco-doc-dev \ + --output runs/run.msmarco-doc.tct_colbert.d2q-t5.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 \ + --hits 1000 --max-passage --max-passage-hits 100 + +python -m pyserini.eval.msmarco_doc_eval \ + --judgments msmarco-doc-dev \ + --run runs/run.msmarco-doc.tct_colbert.d2q-t5.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-doc.tct_colbert.d2q-t5.tsv \ + --output runs/run.msmarco-doc.tct_colbert.d2q-t5.trec + +python -m pyserini.eval.trec_eval -c -mrecall.100 -mmap msmarco-doc-dev \ + runs/run.msmarco-doc.tct_colbert.d2q-t5.trec + +date diff --git a/bin/run-tct2-encoded.sh b/bin/run-tct2-encoded.sh new file mode 100755 index 000000000..eee97ad2f --- /dev/null +++ b/bin/run-tct2-encoded.sh @@ -0,0 +1,105 @@ +#!/bin/sh + +date + +## MS MARCO Passage + +python -m 
pyserini.search.faiss \ + --index msmarco-v1-passage.tct_colbert-v2 \ + --topics msmarco-passage-dev-subset \ + --encoded-queries tct_colbert-v2-msmarco-passage-dev-subset \ + --output runs/run.msmarco-passage.tct_colbert-v2.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.tct_colbert-v2.tsv \ + --output runs/run.msmarco-passage.tct_colbert-v2.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2.trec + + +python -m pyserini.search.faiss \ + --index msmarco-v1-passage.tct_colbert-v2-hn \ + --topics msmarco-passage-dev-subset \ + --encoded-queries tct_colbert-v2-hn-msmarco-passage-dev-subset \ + --output runs/run.msmarco-passage.tct_colbert-v2-hn.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2-hn.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.tct_colbert-v2-hn.tsv \ + --output runs/run.msmarco-passage.tct_colbert-v2-hn.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2-hn.trec + + +python -m pyserini.search.faiss \ + --index msmarco-v1-passage.tct_colbert-v2-hnp \ + --topics msmarco-passage-dev-subset \ + --encoded-queries tct_colbert-v2-hnp-msmarco-passage-dev-subset \ + --output runs/run.msmarco-passage.tct_colbert-v2-hnp.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2-hnp.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input 
runs/run.msmarco-passage.tct_colbert-v2-hnp.tsv \ + --output runs/run.msmarco-passage.tct_colbert-v2-hnp.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2-hnp.trec + + +python -m pyserini.search.hybrid \ + dense --index msmarco-v1-passage.tct_colbert-v2-hnp \ + --encoded-queries tct_colbert-v2-hnp-msmarco-passage-dev-subset \ + sparse --index msmarco-v1-passage \ + fusion --alpha 0.06 \ + run --topics msmarco-passage-dev-subset \ + --output-format msmarco \ + --output runs/run.msmarco-passage.tct_colbert-v2-hnp.bm25.tsv \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2-hnp.bm25.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.tct_colbert-v2-hnp.bm25.tsv \ + --output runs/run.msmarco-passage.tct_colbert-v2-hnp.bm25.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2-hnp.bm25.trec + + +python -m pyserini.search.hybrid \ + dense --index msmarco-v1-passage.tct_colbert-v2-hnp \ + --encoded-queries tct_colbert-v2-hnp-msmarco-passage-dev-subset \ + sparse --index msmarco-v1-passage.d2q-t5 \ + fusion --alpha 0.1 \ + run --topics msmarco-passage-dev-subset \ + --output runs/run.msmarco-passage.tct_colbert-v2-hnp.doc2queryT5.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2-hnp.doc2queryT5.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.tct_colbert-v2-hnp.doc2queryT5.tsv \ + --output runs/run.msmarco-passage.tct_colbert-v2-hnp.doc2queryT5.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + 
runs/run.msmarco-passage.tct_colbert-v2-hnp.doc2queryT5.trec + +date diff --git a/bin/run-tct2-otf.sh b/bin/run-tct2-otf.sh new file mode 100755 index 000000000..352173ec5 --- /dev/null +++ b/bin/run-tct2-otf.sh @@ -0,0 +1,158 @@ +#!/bin/sh + +date + +## MS MARCO Passage + +python -m pyserini.search.faiss \ + --index msmarco-v1-passage.tct_colbert-v2 \ + --topics msmarco-passage-dev-subset \ + --encoder castorini/tct_colbert-v2-msmarco \ + --output runs/run.msmarco-passage.tct_colbert-v2.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.tct_colbert-v2.tsv \ + --output runs/run.msmarco-passage.tct_colbert-v2.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2.trec + + +python -m pyserini.search.faiss \ + --index msmarco-v1-passage.tct_colbert-v2-hn \ + --topics msmarco-passage-dev-subset \ + --encoder castorini/tct_colbert-v2-hn-msmarco \ + --output runs/run.msmarco-passage.tct_colbert-v2-hn.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2-hn.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.tct_colbert-v2-hn.tsv \ + --output runs/run.msmarco-passage.tct_colbert-v2-hn.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2-hn.trec + + +python -m pyserini.search.faiss \ + --index msmarco-v1-passage.tct_colbert-v2-hnp \ + --topics msmarco-passage-dev-subset \ + --encoder castorini/tct_colbert-v2-hnp-msmarco \ + --output runs/run.msmarco-passage.tct_colbert-v2-hnp.tsv \ + --output-format msmarco 
\ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2-hnp.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.tct_colbert-v2-hnp.tsv \ + --output runs/run.msmarco-passage.tct_colbert-v2-hnp.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2-hnp.trec + + +python -m pyserini.search.hybrid \ + dense --index msmarco-v1-passage.tct_colbert-v2-hnp \ + --encoder castorini/tct_colbert-v2-hnp-msmarco \ + sparse --index msmarco-v1-passage \ + fusion --alpha 0.06 \ + run --topics msmarco-passage-dev-subset \ + --output-format msmarco \ + --output runs/run.msmarco-passage.tct_colbert-v2-hnp.bm25.tsv \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2-hnp.bm25.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.tct_colbert-v2-hnp.bm25.tsv \ + --output runs/run.msmarco-passage.tct_colbert-v2-hnp.bm25.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2-hnp.bm25.trec + + +python -m pyserini.search.hybrid \ + dense --index msmarco-v1-passage.tct_colbert-v2-hnp \ + --encoder castorini/tct_colbert-v2-hnp-msmarco \ + sparse --index msmarco-v1-passage.d2q-t5 \ + fusion --alpha 0.1 \ + run --topics msmarco-passage-dev-subset \ + --output runs/run.msmarco-passage.tct_colbert-v2-hnp.doc2queryT5.tsv \ + --output-format msmarco \ + --batch-size 512 --threads 16 + +python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2-hnp.doc2queryT5.tsv + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-passage.tct_colbert-v2-hnp.doc2queryT5.tsv \ + 
--output runs/run.msmarco-passage.tct_colbert-v2-hnp.doc2queryT5.trec + +python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap msmarco-passage-dev-subset \ + runs/run.msmarco-passage.tct_colbert-v2-hnp.doc2queryT5.trec + + +## MS MARCO Doc + +python -m pyserini.search.faiss \ + --index msmarco-v1-doc-segmented.tct_colbert-v2-hnp \ + --topics msmarco-doc-dev \ + --encoder castorini/tct_colbert-v2-hnp-msmarco \ + --output runs/run.msmarco-doc.passage.tct_colbert-v2-hnp-maxp.txt \ + --output-format msmarco \ + --hits 1000 \ + --max-passage \ + --max-passage-hits 100 \ + --batch-size 512 --threads 16 + +python -m pyserini.search.faiss \ + --index msmarco-v1-doc-segmented.tct_colbert-v2-hnp \ + --topics dl19-doc \ + --encoder castorini/tct_colbert-v2-hnp-msmarco \ + --output runs/run.dl19-doc.passage.tct_colbert-v2-hnp-maxp.txt \ + --hits 1000 \ + --max-passage \ + --max-passage-hits 100 \ + --batch-size 512 --threads 16 + +python -m pyserini.search.faiss \ + --index msmarco-v1-doc-segmented.tct_colbert-v2-hnp \ + --topics dl20 \ + --encoder castorini/tct_colbert-v2-hnp-msmarco \ + --output runs/run.dl20-doc.passage.tct_colbert-v2-hnp-maxp.txt \ + --hits 1000 \ + --max-passage \ + --max-passage-hits 100 \ + --batch-size 512 --threads 16 + + +python -m pyserini.eval.msmarco_doc_eval \ + --judgments msmarco-doc-dev \ + --run runs/run.msmarco-doc.passage.tct_colbert-v2-hnp-maxp.txt + +python -m pyserini.eval.convert_msmarco_run_to_trec_run \ + --input runs/run.msmarco-doc.passage.tct_colbert-v2-hnp-maxp.txt \ + --output runs/run.msmarco-doc.passage.tct_colbert-v2-hnp-maxp.trec + +python -m pyserini.eval.trec_eval -c -m recall.100 -m map -m ndcg_cut.10 \ + msmarco-doc-dev runs/run.msmarco-doc.passage.tct_colbert-v2-hnp-maxp.trec + + +python -m pyserini.eval.trec_eval -c -mrecall.100 -mmap -mndcg_cut.10 dl19-doc \ + runs/run.dl19-doc.passage.tct_colbert-v2-hnp-maxp.txt + +python -m pyserini.eval.trec_eval -c -mrecall.100 -mmap -mndcg_cut.10 dl20-doc \ + 
runs/run.dl20-doc.passage.tct_colbert-v2-hnp-maxp.txt + +date diff --git a/bin/run-vector-prf.sh b/bin/run-vector-prf.sh new file mode 100755 index 000000000..3217dd051 --- /dev/null +++ b/bin/run-vector-prf.sh @@ -0,0 +1,696 @@ +#!/bin/sh + +date + +# DL19, ANCE + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-ance-bf \ + --encoder castorini/ance-msmarco-passage \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.ance.dl19.base + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-ance-bf \ + --encoder castorini/ance-msmarco-passage \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.ance.dl19.average_prf3.trec \ + --prf-depth 3 \ + --prf-method avg + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-ance-bf \ + --encoder castorini/ance-msmarco-passage \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.ance.dl19.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.ance.dl19.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.ance.dl19.average_prf3.trec +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.ance.dl19.rocchio_prf5_a0.4_b0.6.trec + +# DL19, TCT v1 + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-tct_colbert-bf \ + --encoder castorini/tct_colbert-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.tct.dl19.base + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-tct_colbert-bf \ + --encoder castorini/tct_colbert-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.tct.dl19.average_prf3.trec \ + --prf-depth 3 \ + 
--prf-method avg + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-tct_colbert-bf \ + --encoder castorini/tct_colbert-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.tct.dl19.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.tct.dl19.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.tct.dl19.average_prf3.trec +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.tct.dl19.rocchio_prf5_a0.4_b0.6.trec + +# DL19, TCT v2 + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-tct_colbert-v2-hnp-bf \ + --encoder castorini/tct_colbert-v2-hnp-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.tct2.dl19.base + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-tct_colbert-v2-hnp-bf \ + --encoder castorini/tct_colbert-v2-hnp-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.tct2.dl19.average_prf3.trec \ + --prf-depth 3 \ + --prf-method avg + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-tct_colbert-v2-hnp-bf \ + --encoder castorini/tct_colbert-v2-hnp-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.tct2.dl19.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.tct2.dl19.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.tct2.dl19.average_prf3.trec +python -m pyserini.eval.trec_eval -c -m map -m 
ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.tct2.dl19.rocchio_prf5_a0.4_b0.6.trec + +# DL19, DistilBERT KD + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-distilbert-dot-margin_mse-T2-bf \ + --encoder sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-kd.dl19.base + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-distilbert-dot-margin_mse-T2-bf \ + --encoder sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-kd.dl19.average_prf3.trec \ + --prf-depth 3 \ + --prf-method avg + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-distilbert-dot-margin_mse-T2-bf \ + --encoder sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-kd.dl19.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.distilbert-kd.dl19.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.distilbert-kd.dl19.average_prf3.trec +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.distilbert-kd.dl19.rocchio_prf5_a0.4_b0.6.trec + +# DL19, DistilBERT TASB + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-distilbert-dot-tas_b-b256-bf \ + --encoder sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-tasb.dl19.base + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-distilbert-dot-tas_b-b256-bf \ + --encoder
sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-tasb.dl19.average_prf3.trec \ + --prf-depth 3 \ + --prf-method avg + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-distilbert-dot-tas_b-b256-bf \ + --encoder sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-tasb.dl19.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.distilbert-tasb.dl19.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.distilbert-tasb.dl19.average_prf3.trec +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.distilbert-tasb.dl19.rocchio_prf5_a0.4_b0.6.trec + +# DL19, SBERT + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-sbert-bf \ + --encoder sentence-transformers/msmarco-distilbert-base-v3 \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.sbert.dl19.base + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-sbert-bf \ + --encoder sentence-transformers/msmarco-distilbert-base-v3 \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.sbert.dl19.average_prf3.trec \ + --prf-depth 3 \ + --prf-method avg + +python -m pyserini.search.faiss \ + --topics dl19-passage \ + --index msmarco-passage-sbert-bf \ + --encoder sentence-transformers/msmarco-distilbert-base-v3 \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.sbert.dl19.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m 
ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.sbert.dl19.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.sbert.dl19.average_prf3.trec +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl19-passage runs/run.sbert.dl19.rocchio_prf5_a0.4_b0.6.trec + +# DL20, ANCE + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-ance-bf \ + --encoder castorini/ance-msmarco-passage \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.ance.dl20.base + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-ance-bf \ + --encoder castorini/ance-msmarco-passage \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.ance.dl20.average_prf3.trec \ + --prf-depth 3 \ + --prf-method avg + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-ance-bf \ + --encoder castorini/ance-msmarco-passage \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.ance.dl20.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl20-passage runs/run.ance.dl20.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl20-passage runs/run.ance.dl20.average_prf3.trec +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl20-passage runs/run.ance.dl20.rocchio_prf5_a0.4_b0.6.trec + +# DL20, TCT v1 + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-tct_colbert-bf \ + --encoder castorini/tct_colbert-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.tct.dl20.base + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-tct_colbert-bf \ + --encoder castorini/tct_colbert-msmarco \ + --batch-size 64 \ + --threads 12 \ + 
--output runs/run.tct.dl20.average_prf3.trec \ + --prf-depth 3 \ + --prf-method avg + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-tct_colbert-bf \ + --encoder castorini/tct_colbert-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.tct.dl20.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl20-passage runs/run.tct.dl20.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl20-passage runs/run.tct.dl20.average_prf3.trec +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl20-passage runs/run.tct.dl20.rocchio_prf5_a0.4_b0.6.trec + +# DL20, TCT v2 + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-tct_colbert-v2-hnp-bf \ + --encoder castorini/tct_colbert-v2-hnp-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.tct2.dl20.base + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-tct_colbert-v2-hnp-bf \ + --encoder castorini/tct_colbert-v2-hnp-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.tct2.dl20.average_prf3.trec \ + --prf-depth 3 \ + --prf-method avg + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-tct_colbert-v2-hnp-bf \ + --encoder castorini/tct_colbert-v2-hnp-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.tct2.dl20.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl20-passage runs/run.tct2.dl20.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl20-passage runs/run.tct2.dl20.average_prf3.trec +python -m 
pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl20-passage runs/run.tct2.dl20.rocchio_prf5_a0.4_b0.6.trec + +# DL20, DistilBERT KD + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-distilbert-dot-margin_mse-T2-bf \ + --encoder sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-kd.dl20.base + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-distilbert-dot-margin_mse-T2-bf \ + --encoder sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-kd.dl20.average_prf3.trec \ + --prf-depth 3 \ + --prf-method avg + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-distilbert-dot-margin_mse-T2-bf \ + --encoder sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-kd.dl20.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl20-passage runs/run.distilbert-kd.dl20.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl20-passage runs/run.distilbert-kd.dl20.average_prf3.trec +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl20-passage runs/run.distilbert-kd.dl20.rocchio_prf5_a0.4_b0.6.trec + +# DL20, DistilBERT TASB + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-distilbert-dot-tas_b-b256-bf \ + --encoder sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-tasb.dl20.base + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-distilbert-dot-tas_b-b256-bf \ + --encoder
sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-tasb.dl20.average_prf3.trec \ + --prf-depth 3 \ + --prf-method avg + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-distilbert-dot-tas_b-b256-bf \ + --encoder sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-tasb.dl20.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl20-passage runs/run.distilbert-tasb.dl20.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl20-passage runs/run.distilbert-tasb.dl20.average_prf3.trec +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl20-passage runs/run.distilbert-tasb.dl20.rocchio_prf5_a0.4_b0.6.trec + +# DL20, SBERT + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-sbert-bf \ + --encoder sentence-transformers/msmarco-distilbert-base-v3 \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.sbert.dl20.base + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-sbert-bf \ + --encoder sentence-transformers/msmarco-distilbert-base-v3 \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.sbert.dl20.average_prf3.trec \ + --prf-depth 3 \ + --prf-method avg + +python -m pyserini.search.faiss \ + --topics dl20 \ + --index msmarco-passage-sbert-bf \ + --encoder sentence-transformers/msmarco-distilbert-base-v3 \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.sbert.dl20.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 
dl20-passage runs/run.sbert.dl20.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl20-passage runs/run.sbert.dl20.average_prf3.trec +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.10,100 -m recall.1000 -l 2 dl20-passage runs/run.sbert.dl20.rocchio_prf5_a0.4_b0.6.trec + +# MS MARCO, ANCE + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index msmarco-passage-ance-bf \ + --encoder castorini/ance-msmarco-passage \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.ance.msmarco.base + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index msmarco-passage-ance-bf \ + --encoder castorini/ance-msmarco-passage \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.ance.msmarco.average_prf3.trec \ + --prf-depth 3 \ + --prf-method avg + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index msmarco-passage-ance-bf \ + --encoder castorini/ance-msmarco-passage \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.ance.msmarco.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.ance.msmarco.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.ance.msmarco.average_prf3.trec +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.ance.msmarco.rocchio_prf5_a0.4_b0.6.trec + +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset runs/run.ance.msmarco.base +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset runs/run.ance.msmarco.average_prf3.trec +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset 
runs/run.ance.msmarco.rocchio_prf5_a0.4_b0.6.trec + + +# MS MARCO, TCT v1 + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index msmarco-passage-tct_colbert-bf \ + --encoder castorini/tct_colbert-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.tct.msmarco.base + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index msmarco-passage-tct_colbert-bf \ + --encoder castorini/tct_colbert-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.tct.msmarco.average_prf3.trec \ + --prf-depth 3 \ + --prf-method avg + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index msmarco-passage-tct_colbert-bf \ + --encoder castorini/tct_colbert-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.tct.msmarco.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.tct.msmarco.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.tct.msmarco.average_prf3.trec +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.tct.msmarco.rocchio_prf5_a0.4_b0.6.trec + +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset runs/run.tct.msmarco.base +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset runs/run.tct.msmarco.average_prf3.trec +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset runs/run.tct.msmarco.rocchio_prf5_a0.4_b0.6.trec + +# MS MARCO, TCT v2 + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index msmarco-passage-tct_colbert-v2-hnp-bf \ + --encoder castorini/tct_colbert-v2-hnp-msmarco \ + --batch-size 64 \ + 
--threads 12 \ + --output runs/run.tct2.msmarco.base + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index msmarco-passage-tct_colbert-v2-hnp-bf \ + --encoder castorini/tct_colbert-v2-hnp-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.tct2.msmarco.average_prf3.trec \ + --prf-depth 3 \ + --prf-method avg + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index msmarco-passage-tct_colbert-v2-hnp-bf \ + --encoder castorini/tct_colbert-v2-hnp-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.tct2.msmarco.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.tct2.msmarco.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.tct2.msmarco.average_prf3.trec +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.tct2.msmarco.rocchio_prf5_a0.4_b0.6.trec + +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset runs/run.tct2.msmarco.base +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset runs/run.tct2.msmarco.average_prf3.trec +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset runs/run.tct2.msmarco.rocchio_prf5_a0.4_b0.6.trec + +# MS MARCO, DistilBERT KD + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index msmarco-passage-distilbert-dot-margin_mse-T2-bf \ + --encoder sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-kd.msmarco.base + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index
msmarco-passage-distilbert-dot-margin_mse-T2-bf \ + --encoder sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-kd.msmarco.average_prf3.trec \ + --prf-depth 3 \ + --prf-method avg + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index msmarco-passage-distilbert-dot-margin_mse-T2-bf \ + --encoder sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-kd.msmarco.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.distilbert-kd.msmarco.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.distilbert-kd.msmarco.average_prf3.trec +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.distilbert-kd.msmarco.rocchio_prf5_a0.4_b0.6.trec + +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset runs/run.distilbert-kd.msmarco.base +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset runs/run.distilbert-kd.msmarco.average_prf3.trec +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset runs/run.distilbert-kd.msmarco.rocchio_prf5_a0.4_b0.6.trec + +# MS MARCO, DistilBERT TASB + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index msmarco-passage-distilbert-dot-tas_b-b256-bf \ + --encoder sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-tasb.msmarco.base + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index msmarco-passage-distilbert-dot-tas_b-b256-bf \
+ --encoder sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-tasb.msmarco.average_prf3.trec \ + --prf-depth 3 \ + --prf-method avg + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index msmarco-passage-distilbert-dot-tas_b-b256-bf \ + --encoder sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.distilbert-tasb.msmarco.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.distilbert-tasb.msmarco.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.distilbert-tasb.msmarco.average_prf3.trec +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.distilbert-tasb.msmarco.rocchio_prf5_a0.4_b0.6.trec + +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset runs/run.distilbert-tasb.msmarco.base +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset runs/run.distilbert-tasb.msmarco.average_prf3.trec +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset runs/run.distilbert-tasb.msmarco.rocchio_prf5_a0.4_b0.6.trec + +# MS MARCO, SBERT + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index msmarco-passage-sbert-bf \ + --encoder sentence-transformers/msmarco-distilbert-base-v3 \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.sbert.msmarco.base + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index msmarco-passage-sbert-bf \ + --encoder sentence-transformers/msmarco-distilbert-base-v3 \ + --batch-size 64 \ + --threads 12 \ + --output 
runs/run.sbert.msmarco.average_prf3.trec \ + --prf-depth 3 \ + --prf-method avg + +python -m pyserini.search.faiss \ + --topics msmarco-passage-dev-subset \ + --index msmarco-passage-sbert-bf \ + --encoder sentence-transformers/msmarco-distilbert-base-v3 \ + --batch-size 64 \ + --threads 12 \ + --output runs/run.sbert.msmarco.rocchio_prf5_a0.4_b0.6.trec \ + --prf-depth 5 \ + --prf-method rocchio \ + --rocchio-topk 5 \ + --rocchio-alpha 0.4 \ + --rocchio-beta 0.6 + +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.sbert.msmarco.base +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.sbert.msmarco.average_prf3.trec +python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.100 -m recall.1000 msmarco-passage-dev-subset runs/run.sbert.msmarco.rocchio_prf5_a0.4_b0.6.trec + +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset runs/run.sbert.msmarco.base +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset runs/run.sbert.msmarco.average_prf3.trec +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset runs/run.sbert.msmarco.rocchio_prf5_a0.4_b0.6.trec + +date diff --git a/pyserini/encode/__init__.py b/pyserini/encode/__init__.py index 9ed24eb9e..0df8c7302 100644 --- a/pyserini/encode/__init__.py +++ b/pyserini/encode/__init__.py @@ -34,3 +34,34 @@ from ._tct_colbert import TctColBertDocumentEncoder, TctColBertQueryEncoder from ._tok_freq import TokFreqQueryEncoder from ._unicoil import UniCoilEncoder, UniCoilDocumentEncoder, UniCoilQueryEncoder + +document_encoder_class_map = { + "dpr": DprDocumentEncoder, + "tct_colbert": TctColBertDocumentEncoder, + "aggretriever": AggretrieverDocumentEncoder, + "ance": AnceDocumentEncoder, + "sentence-transformers": AutoDocumentEncoder, + "unicoil": UniCoilDocumentEncoder, + "openai-api": OpenAiDocumentEncoder, + "cosdpr": 
CosDprDocumentEncoder, + "auto": AutoDocumentEncoder, + "clip": ClipDocumentEncoder, + "contriever": AutoDocumentEncoder, + "arctic": ArcticDocumentEncoder, +} + +query_encoder_class_map = { + "dkrr": DkrrDprQueryEncoder, + "cosdpr": CosDprQueryEncoder, + "dpr": DprQueryEncoder, + "bpr": BprQueryEncoder, + "tct_colbert": TctColBertQueryEncoder, + "ance": AnceQueryEncoder, + "sentence": AutoQueryEncoder, + "contriever": AutoQueryEncoder, + "aggretriever": AggretrieverQueryEncoder, + "openai-api": OpenAiQueryEncoder, + "auto": AutoQueryEncoder, + "clip": ClipQueryEncoder, + "arctic": ArcticQueryEncoder, +} diff --git a/pyserini/encode/__main__.py b/pyserini/encode/__main__.py index 74895c240..3435fcb69 100644 --- a/pyserini/encode/__main__.py +++ b/pyserini/encode/__main__.py @@ -18,40 +18,24 @@ import sys from pyserini.encode import AutoDocumentEncoder -from pyserini.encode import ArcticDocumentEncoder, AggretrieverDocumentEncoder, AnceDocumentEncoder, \ - ClipDocumentEncoder, CosDprDocumentEncoder, DprDocumentEncoder, TctColBertDocumentEncoder, UniCoilDocumentEncoder -from pyserini.encode import OpenAiDocumentEncoder, OPENAI_API_RETRY_DELAY +from pyserini.encode import document_encoder_class_map +from pyserini.encode import OPENAI_API_RETRY_DELAY from pyserini.encode import JsonlRepresentationWriter, JsonlCollectionIterator from pyserini.encode.optional import FaissRepresentationWriter -encoder_class_map = { - "dpr": DprDocumentEncoder, - "tct_colbert": TctColBertDocumentEncoder, - "aggretriever": AggretrieverDocumentEncoder, - "ance": AnceDocumentEncoder, - "sentence-transformers": AutoDocumentEncoder, - "unicoil": UniCoilDocumentEncoder, - "openai-api": OpenAiDocumentEncoder, - "cosdpr": CosDprDocumentEncoder, - "auto": AutoDocumentEncoder, - "clip": ClipDocumentEncoder, - "contriever": AutoDocumentEncoder, - "arctic": ArcticDocumentEncoder, -} - def init_encoder(encoder, encoder_class, device, pooling, l2_norm, prefix, multimodal): _encoder_class = encoder_class 
# determine encoder_class if encoder_class is not None: - encoder_class = encoder_class_map[encoder_class] + encoder_class = document_encoder_class_map[encoder_class] else: # if any class keyword was matched in the given encoder name, # use that encoder class - for class_keyword in encoder_class_map: + for class_keyword in document_encoder_class_map: if class_keyword in encoder.lower(): - encoder_class = encoder_class_map[class_keyword] + encoder_class = document_encoder_class_map[class_keyword] break # if none of the class keyword was matched, @@ -62,13 +46,13 @@ def init_encoder(encoder, encoder_class, device, pooling, l2_norm, prefix, multi # prepare arguments to encoder class kwargs = dict(model_name=encoder, device=device) - if (_encoder_class == "sentence-transformers") or ("sentence-transformers" in encoder): + if _encoder_class == 'sentence-transformers' or 'sentence-transformers' in encoder: kwargs.update(dict(pooling='mean', l2_norm=True)) - if (_encoder_class == "contriever") or ("contriever" in encoder): + if _encoder_class == 'contriever' or 'contriever' in encoder: kwargs.update(dict(pooling='mean', l2_norm=False)) - if (_encoder_class == "auto"): + if _encoder_class == 'auto': kwargs.update(dict(pooling=pooling, l2_norm=l2_norm, prefix=prefix)) - if (_encoder_class == "clip") or ("clip" in encoder): + if _encoder_class == 'clip' or 'clip' in encoder: kwargs.update(dict(l2_norm=True, prefix=prefix, multimodal=multimodal)) return encoder_class(**kwargs) diff --git a/pyserini/encode/_aggretriever.py b/pyserini/encode/_aggretriever.py index c16c510d6..2eb04b114 100644 --- a/pyserini/encode/_aggretriever.py +++ b/pyserini/encode/_aggretriever.py @@ -135,7 +135,8 @@ def __init__(self, model_name: str, tokenizer_name=None, device='cuda:0'): else: self.model = BertAggretrieverEncoder.from_pretrained(model_name) self.model.to(self.device) - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name) + self.tokenizer = 
AutoTokenizer.from_pretrained(tokenizer_name or model_name, + clean_up_tokenization_spaces=True) def encode(self, texts, titles=None, fp16=False, max_length=512, **kwargs): if titles is not None: @@ -170,7 +171,8 @@ def __init__(self, encoder_dir: str = None, tokenizer_name: str = None, else: self.model = BertAggretrieverEncoder.from_pretrained(encoder_dir) self.model.to(self.device) - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or encoder_dir) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or encoder_dir, + clean_up_tokenization_spaces=True) self.has_model = True if (not self.has_model) and (not self.has_encoded_query): raise Exception('Neither query encoder model nor encoded queries provided. Please provide at least one') diff --git a/pyserini/encode/_ance.py b/pyserini/encode/_ance.py index f51f4e16d..0f63af050 100644 --- a/pyserini/encode/_ance.py +++ b/pyserini/encode/_ance.py @@ -26,8 +26,6 @@ class AnceEncoder(PreTrainedModel): config_class = RobertaConfig base_model_prefix = 'ance_encoder' load_tf_weights = None - _keys_to_ignore_on_load_missing = [r'position_ids'] - _keys_to_ignore_on_load_unexpected = [r'pooler', r'classifier'] def __init__(self, config: RobertaConfig): requires_backends(self, 'torch') @@ -77,7 +75,8 @@ def __init__(self, model_name, tokenizer_name=None, device='cuda:0'): self.device = device self.model = AnceEncoder.from_pretrained(model_name) self.model.to(self.device) - self.tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name or model_name) + self.tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name or model_name, + clean_up_tokenization_spaces=True) def encode(self, texts, titles=None, max_length=256, **kwargs): if titles is not None: @@ -102,7 +101,8 @@ def __init__(self, encoder_dir: str = None, tokenizer_name: str = None, self.device = device self.model = AnceEncoder.from_pretrained(encoder_dir) self.model.to(self.device) - self.tokenizer = 
RobertaTokenizer.from_pretrained(tokenizer_name or encoder_dir) + self.tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name or encoder_dir, + clean_up_tokenization_spaces=True) self.has_model = True self.tokenizer.do_lower_case = True if (not self.has_model) and (not self.has_encoded_query): diff --git a/pyserini/encode/_arctic.py b/pyserini/encode/_arctic.py index b94fc20aa..92aa6fc5a 100644 --- a/pyserini/encode/_arctic.py +++ b/pyserini/encode/_arctic.py @@ -20,12 +20,14 @@ from pyserini.encode import DocumentEncoder, QueryEncoder + class ArcticDocumentEncoder(DocumentEncoder): def __init__(self, model_name, device='cuda:0', truncate_to_256=False, tokenizer_name=None): # Truncate to output embedding to 256 for faster encoding self.device = device self.truncate_to_256 = truncate_to_256 self.model = AutoModel.from_pretrained(model_name, add_pooling_layer=False).to(self.device) - self.tokenizer = AutoTokenizer.from_pretrained(model_name or tokenizer_name) + self.tokenizer = AutoTokenizer.from_pretrained(model_name or tokenizer_name, + clean_up_tokenization_spaces=True) def encode(self, texts, max_length=512, **kwargs): document_tokens = self.tokenizer( @@ -56,7 +58,8 @@ def __init__(self, encoder_dir: str, query_prefix: str = 'Represent this sentenc self.device = device self.query_prefix = query_prefix self.model = AutoModel.from_pretrained(encoder_dir, add_pooling_layer=False).to(self.device) - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or encoder_dir) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or encoder_dir, + clean_up_tokenization_spaces=True) self.has_model = True if (not self.has_model) and (not self.has_encoded_query): diff --git a/pyserini/encode/_auto.py b/pyserini/encode/_auto.py index 3d2e24251..209ad1985 100644 --- a/pyserini/encode/_auto.py +++ b/pyserini/encode/_auto.py @@ -27,9 +27,12 @@ def __init__(self, model_name, tokenizer_name=None, device='cuda:0', pooling='cl self.model = 
AutoModel.from_pretrained(model_name) self.model.to(self.device) try: - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name, + clean_up_tokenization_spaces=True) except: - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name, use_fast=False) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name, + use_fast=False, + clean_up_tokenization_spaces=True) self.has_model = True self.pooling = pooling self.l2_norm = l2_norm @@ -79,9 +82,12 @@ def __init__(self, encoder_dir: str = None, tokenizer_name: str = None, self.model = AutoModel.from_pretrained(encoder_dir) self.model.to(self.device) try: - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or encoder_dir) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or encoder_dir, + clean_up_tokenization_spaces=True) except: - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or encoder_dir, use_fast=False) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or encoder_dir, + use_fast=False, + clean_up_tokenization_spaces=True) self.has_model = True self.pooling = pooling self.l2_norm = l2_norm diff --git a/pyserini/encode/_bpr.py b/pyserini/encode/_bpr.py index 978d71dbd..58195b6c7 100644 --- a/pyserini/encode/_bpr.py +++ b/pyserini/encode/_bpr.py @@ -37,7 +37,8 @@ def __init__(self, encoder_dir: str = None, tokenizer_name: str = None, self.device = device self.model = DPRQuestionEncoder.from_pretrained(encoder_dir) self.model.to(self.device) - self.tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(tokenizer_name or encoder_dir) + self.tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(tokenizer_name or encoder_dir, + clean_up_tokenization_spaces=True) self.has_model = True if (not self.has_model) and (not self.has_encoded_query): diff --git a/pyserini/encode/_clip.py b/pyserini/encode/_clip.py index 
279277ac3..b75e4082b 100644 --- a/pyserini/encode/_clip.py +++ b/pyserini/encode/_clip.py @@ -52,7 +52,7 @@ class BaseClipEncoder: def __init__(self, model_name, device='cuda:0', l2_norm=True): self.device = device self.model = CLIPModel.from_pretrained(model_name).to(device) - self.processor = CLIPProcessor.from_pretrained(model_name) + self.processor = CLIPProcessor.from_pretrained(model_name, clean_up_tokenization_spaces=True) self.l2_norm = l2_norm def normalize_embeddings(self, embeddings): @@ -96,11 +96,11 @@ def encode(self, texts, max_length=77, **kwargs): texts = [texts] if self.prefix: - texts = [f"{self.prefix} {text}" for text in texts] + texts = [f'{self.prefix} {text}' for text in texts] inputs = self.processor( text=texts, - return_tensors="pt", + return_tensors='pt', padding='max_length', max_length=max_length, truncation='longest_first', diff --git a/pyserini/encode/_cosdpr.py b/pyserini/encode/_cosdpr.py index de2b6cf1f..624ba4164 100644 --- a/pyserini/encode/_cosdpr.py +++ b/pyserini/encode/_cosdpr.py @@ -26,8 +26,6 @@ class CosDprEncoder(PreTrainedModel): config_class = BertConfig base_model_prefix = 'bert' load_tf_weights = None - #_keys_to_ignore_on_load_missing = [r'position_ids'] - #_keys_to_ignore_on_load_unexpected = [r'pooler', r'classifier'] def __init__(self, config: BertConfig): super().__init__(config) @@ -77,7 +75,8 @@ def __init__(self, model_name, tokenizer_name=None, device='cuda:0'): self.device = device self.model = CosDprEncoder.from_pretrained(model_name) self.model.to(self.device) - self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name or model_name) + self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name or model_name, + clean_up_tokenization_spaces=True) def encode(self, texts, titles=None, max_length=256, **kwargs): if titles is not None: @@ -100,7 +99,8 @@ def __init__(self, encoder_dir: str, tokenizer_name: str = None, device: str = ' self.device = device self.model = 
CosDprEncoder.from_pretrained(encoder_dir) self.model.to(self.device) - self.tokenizer = BertTokenizer.from_pretrained(encoder_dir or tokenizer_name) + self.tokenizer = BertTokenizer.from_pretrained(encoder_dir or tokenizer_name, + clean_up_tokenization_spaces=True) def encode(self, query: str, **kwargs): inputs = self.tokenizer( diff --git a/pyserini/encode/_dkrr.py b/pyserini/encode/_dkrr.py index 1bf82707e..84b7f7e9b 100644 --- a/pyserini/encode/_dkrr.py +++ b/pyserini/encode/_dkrr.py @@ -28,7 +28,8 @@ def __init__(self, encoder_dir: str = None, encoded_query_dir: str = None, devic self.device = device self.model = BertModel.from_pretrained(encoder_dir) self.model.to(self.device) - self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") + self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', + clean_up_tokenization_spaces=True) self.has_model = True self.prefix = prefix diff --git a/pyserini/encode/_dpr.py b/pyserini/encode/_dpr.py index 9d4930897..613361e91 100644 --- a/pyserini/encode/_dpr.py +++ b/pyserini/encode/_dpr.py @@ -15,16 +15,36 @@ # from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer +from transformers.utils import logging from pyserini.encode import DocumentEncoder, QueryEncoder +# See https://github.com/huggingface/transformers/issues/5421 +# about suppressing warning "Some weights of the model checkpoint ... 
were not used when initializing" +class log_level: + orig_log_level: int + log_level: int + + def __init__(self, log_level: int): + self.log_level = log_level + self.orig_log_level = logging.get_verbosity() + + def __enter__(self): + logging.set_verbosity(self.log_level) + + def __exit__(self, exception_type, exception_value, traceback): + logging.set_verbosity(self.orig_log_level) + + class DprDocumentEncoder(DocumentEncoder): def __init__(self, model_name, tokenizer_name=None, device='cuda:0'): self.device = device - self.model = DPRContextEncoder.from_pretrained(model_name) + with log_level(logging.ERROR): + self.model = DPRContextEncoder.from_pretrained(model_name) self.model.to(self.device) - self.tokenizer = DPRContextEncoderTokenizer.from_pretrained(tokenizer_name or model_name) + self.tokenizer = DPRContextEncoderTokenizer.from_pretrained(tokenizer_name or model_name, + clean_up_tokenization_spaces=True) def encode(self, texts, titles=None, max_length=256, **kwargs): if titles: @@ -56,9 +76,10 @@ def __init__(self, encoder_dir: str = None, tokenizer_name: str = None, super().__init__(encoded_query_dir) if encoder_dir: self.device = device - self.model = DPRQuestionEncoder.from_pretrained(encoder_dir) + with log_level(logging.ERROR): + self.model = DPRQuestionEncoder.from_pretrained(encoder_dir) self.model.to(self.device) - self.tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(tokenizer_name or encoder_dir) + self.tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(tokenizer_name or encoder_dir, clean_up_tokenization_spaces=True) self.has_model = True if (not self.has_model) and (not self.has_encoded_query): raise Exception('Neither query encoder model nor encoded queries provided. 
Please provide at least one') diff --git a/pyserini/encode/_slim.py b/pyserini/encode/_slim.py index 213f41be9..1b7d35cc5 100644 --- a/pyserini/encode/_slim.py +++ b/pyserini/encode/_slim.py @@ -27,7 +27,8 @@ def __init__(self, model_name_or_path, tokenizer_name=None, fusion_weight=.99, d self.fusion_weight = fusion_weight self.model = AutoModelForMaskedLM.from_pretrained(model_name_or_path) self.model.to(self.device) - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name_or_path) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name_or_path, + clean_up_tokenization_spaces=True) self.reverse_vocab = {v: k for k, v in self.tokenizer.vocab.items()} self.weight_range = 5 self.quant_range = 256 diff --git a/pyserini/encode/_splade.py b/pyserini/encode/_splade.py index a156ceb73..09fe1a3bb 100644 --- a/pyserini/encode/_splade.py +++ b/pyserini/encode/_splade.py @@ -26,7 +26,8 @@ def __init__(self, model_name_or_path, tokenizer_name=None, device='cpu'): self.device = device self.model = AutoModelForMaskedLM.from_pretrained(model_name_or_path) self.model.to(self.device) - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name_or_path) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name_or_path, + clean_up_tokenization_spaces=True) self.reverse_voc = {v: k for k, v in self.tokenizer.vocab.items()} self.weight_range = 5 self.quant_range = 256 diff --git a/pyserini/encode/_tct_colbert.py b/pyserini/encode/_tct_colbert.py index 1f2d16549..d56efab68 100644 --- a/pyserini/encode/_tct_colbert.py +++ b/pyserini/encode/_tct_colbert.py @@ -34,11 +34,13 @@ def __init__(self, model_name: str, tokenizer_name=None, device='cuda:0'): options = SessionOptions() self.session = InferenceSession(model_name, options) self.onnx = True - self.tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name or model_name[:-5]) + self.tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name or 
model_name[:-5], + clean_up_tokenization_spaces=True) else: self.model = BertModel.from_pretrained(model_name) self.model.to(self.device) - self.tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name or model_name) + self.tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name or model_name, + clean_up_tokenization_spaces=True) def encode(self, texts, titles=None, fp16=False, max_length=512, **kwargs): if titles is not None: @@ -79,7 +81,8 @@ def __init__(self, encoder_dir: str = None, tokenizer_name: str = None, self.device = device self.model = BertModel.from_pretrained(encoder_dir) self.model.to(self.device) - self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name or encoder_dir) + self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name or encoder_dir, + clean_up_tokenization_spaces=True) self.has_model = True if (not self.has_model) and (not self.has_encoded_query): raise Exception('Neither query encoder model nor encoded queries provided. Please provide at least one') diff --git a/pyserini/encode/_tok_freq.py b/pyserini/encode/_tok_freq.py index 3574c3412..81ceb026f 100644 --- a/pyserini/encode/_tok_freq.py +++ b/pyserini/encode/_tok_freq.py @@ -21,7 +21,8 @@ class TokFreqQueryEncoder(QueryEncoder): def __init__(self, model_name_or_path=None): - self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) if model_name_or_path else None + self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, clean_up_tokenization_spaces=True) \ + if model_name_or_path else None def encode(self, text, **kwargs): vector = {} diff --git a/pyserini/encode/_unicoil.py b/pyserini/encode/_unicoil.py index 94abde56e..8952515fb 100644 --- a/pyserini/encode/_unicoil.py +++ b/pyserini/encode/_unicoil.py @@ -79,7 +79,7 @@ def __init__(self, model_name, tokenizer_name=None, device='cuda:0'): self.device = device self.model = UniCoilEncoder.from_pretrained(model_name) self.model.to(self.device) - self.tokenizer = 
BertTokenizer.from_pretrained(tokenizer_name or model_name) + self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name or model_name, clean_up_tokenization_spaces=True) def encode(self, texts, titles=None, expands=None, fp16=False, max_length=512, **kwargs): if titles: diff --git a/pyserini/search/faiss/__main__.py b/pyserini/search/faiss/__main__.py index f56bbf295..9056069eb 100644 --- a/pyserini/search/faiss/__main__.py +++ b/pyserini/search/faiss/__main__.py @@ -21,8 +21,8 @@ from tqdm import tqdm from pyserini.encode import QueryEncoder, AutoQueryEncoder -from pyserini.encode import ArcticQueryEncoder, AggretrieverQueryEncoder, AnceQueryEncoder, BprQueryEncoder, \ - CosDprQueryEncoder, DkrrDprQueryEncoder, DprQueryEncoder, OpenAiQueryEncoder, ClipQueryEncoder,TctColBertQueryEncoder +from pyserini.encode import AnceQueryEncoder, BprQueryEncoder +from pyserini.encode import query_encoder_class_map from pyserini.encode.optional import PcaEncoder from pyserini.output_writer import get_output_writer, OutputFormat from pyserini.query_iterator import get_query_iterator, TopicsFormat @@ -101,34 +101,19 @@ def init_query_encoder(encoder, encoder_class, tokenizer_name, topics_name, enco 'dpr-squad-test': 'dpr_multi-squad-test', 'dpr-curated-test': 'dpr_multi-curated-test' } - encoder_class_map = { - "dkrr": DkrrDprQueryEncoder, - "cosdpr": CosDprQueryEncoder, - "dpr": DprQueryEncoder, - "bpr": BprQueryEncoder, - "tct_colbert": TctColBertQueryEncoder, - "ance": AnceQueryEncoder, - "sentence": AutoQueryEncoder, - "contriever": AutoQueryEncoder, - "aggretriever": AggretrieverQueryEncoder, - "openai-api": OpenAiQueryEncoder, - "auto": AutoQueryEncoder, - "clip": ClipQueryEncoder, - "arctic": ArcticQueryEncoder, - } if encoder: _encoder_class = encoder_class # determine encoder_class if encoder_class is not None: - encoder_class = encoder_class_map[encoder_class] + encoder_class = query_encoder_class_map[encoder_class] else: # if any class keyword was matched in the 
given encoder name, # use that encoder class - for class_keyword in encoder_class_map: + for class_keyword in query_encoder_class_map: if class_keyword in encoder.lower(): - encoder_class = encoder_class_map[class_keyword] + encoder_class = query_encoder_class_map[class_keyword] break # if none of the class keyword was matched, diff --git a/pyserini/search/faiss/_searcher.py b/pyserini/search/faiss/_searcher.py index 7e895f2ea..45db63fd3 100644 --- a/pyserini/search/faiss/_searcher.py +++ b/pyserini/search/faiss/_searcher.py @@ -27,12 +27,12 @@ import numpy as np from transformers.file_utils import requires_backends -from pyserini.encode import AnceQueryEncoder, BprQueryEncoder, DprQueryEncoder, TctColBertQueryEncoder from pyserini.encode import QueryEncoder, AutoQueryEncoder +from pyserini.encode import AnceQueryEncoder, BprQueryEncoder, DprQueryEncoder, TctColBertQueryEncoder from pyserini.index import Document -from pyserini.search.faiss._prf import PrfDenseSearchResult from pyserini.search.lucene import LuceneSearcher from pyserini.util import download_prebuilt_index, get_dense_indexes_info, get_sparse_index +from ._prf import PrfDenseSearchResult @dataclass diff --git a/pyserini/search/lucene/__main__.py b/pyserini/search/lucene/__main__.py index ff68531e0..666d59e19 100644 --- a/pyserini/search/lucene/__main__.py +++ b/pyserini/search/lucene/__main__.py @@ -258,7 +258,8 @@ def define_search_args(parser): analyzer = JWhiteSpaceAnalyzer() searcher.set_analyzer(analyzer) print(f'Using whitespace analyzer because of pretokenized topics') - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, + clean_up_tokenization_spaces=True) print(f'Using {args.tokenizer} to preprocess topics') if args.stopwords: diff --git a/pyserini/search/lucene/irst/__main__.py b/pyserini/search/lucene/irst/__main__.py index c294a2ca8..c21ef2923 100644 --- a/pyserini/search/lucene/irst/__main__.py +++ 
b/pyserini/search/lucene/irst/__main__.py @@ -34,7 +34,7 @@ def normalize(scores: List[float]): def query_loader(topic: str): queries = {} - bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', clean_up_tokenization_spaces=True) topics_dic = get_topics(topic) line_num = 0 for topic_id in topics_dic: @@ -42,12 +42,12 @@ def query_loader(topic: str): query_text = topics_dic[topic_id]['title'] text_bert_tok = bert_tokenizer.tokenize(query_text.lower()) if len(text_bert_tok) >= 0: - query = {"raw": query_text, - "contents": ' '.join(text_bert_tok)} + query = {'raw': query_text, + 'contents': ' '.join(text_bert_tok)} queries[topic_id] = query if line_num % 10000 == 0: - print(f"Processed {line_num} queries") - print(f"Processed {line_num} queries") + print(f'Processed {line_num} queries') + print(f'Processed {line_num} queries') return queries diff --git a/pyserini/search/lucene/irst/_searcher.py b/pyserini/search/lucene/irst/_searcher.py index 299647626..7c6d96055 100644 --- a/pyserini/search/lucene/irst/_searcher.py +++ b/pyserini/search/lucene/irst/_searcher.py @@ -70,10 +70,11 @@ def __init__(self, index: str, k1: int, b: int, num_threads: int): TF_INDEX_INFO['msmarco-v1-doc-segmented']['filename'][:-6] + TF_INDEX_INFO['msmarco-v1-doc-segmented']['md5']) else: - print("We currently only support three indexes: msmarco-passage, msmarco-v1-doc and msmarco-v1-doc-segmented but the index you inserted is not one of those") + print('We currently only support three indexes: msmarco-passage, msmarco-v1-doc and msmarco-v1-doc-segmented but the index you inserted is not one of those') self.object = JLuceneSearcher(index_path) self.source_lookup, self.target_lookup, self.tran = self.load_tranprobs_table() - self.bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + self.bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', + clean_up_tokenization_spaces=True) 
self.pool = ThreadPool(num_threads) diff --git a/pyserini/search/lucene/ltr/__main__.py b/pyserini/search/lucene/ltr/__main__.py index 462407c93..62a2f3953 100644 --- a/pyserini/search/lucene/ltr/__main__.py +++ b/pyserini/search/lucene/ltr/__main__.py @@ -113,7 +113,8 @@ def query_loader(topic): queries = {} nlp = SpacyTextParser('en_core_web_sm', keep_only_alpha_num=True, lower_case=True) analyzer = Analyzer(get_lucene_analyzer()) - bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', + clean_up_tokenization_spaces=True) inp_file = open(topic) ln = 0 for line in tqdm(inp_file): diff --git a/pyserini/tokenize_json_collection.py b/pyserini/tokenize_json_collection.py index 7824d21dd..522f90015 100644 --- a/pyserini/tokenize_json_collection.py +++ b/pyserini/tokenize_json_collection.py @@ -38,9 +38,9 @@ def write_to_file(tokenizer, input, output): def main(args): if ('bert' in args.tokenizer): - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', clean_up_tokenization_spaces=True) else: - tokenizer = T5Tokenizer.from_pretrained('castorini/doc2query-t5-base-msmarco', legacy=True) + tokenizer = T5Tokenizer.from_pretrained('castorini/doc2query-t5-base-msmarco', legacy=True, clean_up_tokenization_spaces=True) if (os.path.isdir(args.input)): for i, inf in enumerate(sorted(os.listdir(args.input))): if not os.path.isdir(args.output): diff --git a/scripts/jobs.docs-all.txt b/scripts/jobs.docs-all.txt new file mode 100644 index 000000000..e60b219a7 --- /dev/null +++ b/scripts/jobs.docs-all.txt @@ -0,0 +1,12 @@ +bin/run-ance.sh > logs/log.ance 2>&1 +bin/run-bpr.sh > logs/log.bpr 2>&1 +bin/run-distillbert-kd.sh > logs/log.distillbert-kd 2>&1 +bin/run-distillbert-tasb.sh > logs/log.distillbert-tasb 2>&1 +bin/run-dpr-encoded.sh > logs/log.dpr-encoded 2>&1 +bin/run-dkrr.sh > logs/log.dkrr 2>&1 +bin/run-sbert.sh > 
logs/log.sbert 2>&1 +bin/run-tct-encoded.sh > logs/log.tct-encoded 2>&1 +bin/run-tct2-encoded.sh > logs/log.tct2-encoded 2>&1 +bin/run-dpr-otf.sh > logs/log.dpr-otf 2>&1 +bin/run-tct-otf.sh > logs/log.tct-otf 2>&1 +bin/run-tct2-otf.sh > logs/log.tct2-otf 2>&1 diff --git a/tests-optional/test_encoder.py b/tests-optional/test_encode.py similarity index 94% rename from tests-optional/test_encoder.py rename to tests-optional/test_encode.py index 57e8903c2..5e2c0e01b 100644 --- a/tests-optional/test_encoder.py +++ b/tests-optional/test_encode.py @@ -58,7 +58,7 @@ def assertIsFile(path): if not pl.Path(path).resolve().is_file(): raise AssertionError("File does not exist: %s" % str(path)) - def test_tct_colbert_v2_encoder_cmd(self): + def test_tct_colbert_v2_encode_cmd(self): index_dir = 'temp_index' cmd = f'python -m pyserini.encode \ input --corpus {self.test_file} \ @@ -90,7 +90,7 @@ def test_tct_colbert_v2_encoder_cmd(self): shutil.rmtree(index_dir) - def test_tct_colbert_v2_encoder_cmd_shard(self): + def test_tct_colbert_v2_encode_cmd_shard(self): cleanup_list = [] for shard_i in range(2): index_dir = f'temp_index-{shard_i}' @@ -136,7 +136,7 @@ def test_tct_colbert_v2_encoder_cmd_shard(self): for index_dir in cleanup_list: shutil.rmtree(index_dir) - def test_aggretriever_distilbert_encoder_cmd(self): + def test_aggretriever_distilbert_encode_cmd(self): index_dir = 'temp_index' cmd = f'python -m pyserini.encode \ input --corpus {self.test_file} \ @@ -167,7 +167,7 @@ def test_aggretriever_distilbert_encoder_cmd(self): shutil.rmtree(index_dir) - def test_aggretriever_cocondenser_encoder_cmd(self): + def test_aggretriever_cocondenser_encode_cmd(self): index_dir = 'temp_index' cmd = f'python -m pyserini.encode \ input --corpus {self.test_file} \ @@ -199,27 +199,27 @@ def test_aggretriever_cocondenser_encoder_cmd(self): shutil.rmtree(index_dir) def test_onnx_encode_unicoil(self): - temp_object = LuceneImpactSearcher(f'{self.index_dir}lucene9-index.cacm', 
'SpladePlusPlusEnsembleDistil', encoder_type='onnx') + searcher1 = LuceneImpactSearcher(f'{self.index_dir}lucene9-index.cacm', 'SpladePlusPlusEnsembleDistil', encoder_type='onnx') # this function will never be called in _impact_searcher, here to check quantization correctness - results = temp_object.encode("here is a test") + results = searcher1.encode("here is a test") self.assertEqual(results.get("here"), 156) self.assertEqual(results.get("a"), 31) self.assertEqual(results.get("test"), 149) - temp_object.close() - del temp_object + searcher1.close() + del searcher1 - temp_object1 = LuceneImpactSearcher(f'{self.index_dir}lucene9-index.cacm', 'naver/splade-cocondenser-ensembledistil') + searcher2 = LuceneImpactSearcher(f'{self.index_dir}lucene9-index.cacm', 'naver/splade-cocondenser-ensembledistil') # this function will never be called in _impact_searcher, here to check quantization correctness - results = temp_object1.encode("here is a test") + results = searcher2.encode("here is a test") self.assertEqual(results.get("here"), 156) self.assertEqual(results.get("a"), 31) self.assertEqual(results.get("test"), 149) - temp_object1.close() - del temp_object1 + searcher2.close() + del searcher2 def test_clip_encoder_cmd_text(self): index_dir = 'temp_index' diff --git a/tests/test_tokenization.py b/tests/test_tokenization.py index 9fd57bfbe..14a638504 100644 --- a/tests/test_tokenization.py +++ b/tests/test_tokenization.py @@ -27,13 +27,13 @@ def setUp(self): def test_bert_base_uncased_demo(self): # https://huggingface.co/transformers/tokenizer_summary.html - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('I have a new GPU!') self.assertEqual(['i', 'have', 'a', 'new', 'gp', '##u', '!'], tokens) def test_bert_base_uncased_en_book_examples(self): # These are examples used in the ptr4tr book - tokenizer = 
BertTokenizer.from_pretrained('bert-base-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('walking talking balking biking hiking rolling scrolling') self.assertEqual(['walking', 'talking', 'bal', '##king', 'biking', 'hiking', 'rolling', 'scrolling'], tokens) @@ -44,7 +44,7 @@ def test_bert_base_uncased_en_book_examples(self): tokens = tokenizer.tokenize('adversarial') self.assertEqual(['ad', '##vers', '##aria', '##l'], tokens) - tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + tokenizer = BertTokenizer.from_pretrained('bert-base-cased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('walking talking balking biking hiking') self.assertEqual(['walking', 'talking', 'b', '##alk', '##ing', 'bi', '##king', 'hiking'], tokens) @@ -59,7 +59,7 @@ def test_bert_base_uncased_en_book_examples(self): def test_xlm_roberta_base_en_book_examples(self): # These are examples used in the ptr4tr book - tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base') + tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('walking talking balking biking hiking rolling scrolling') self.assertEqual(['▁walking', '▁talking', '▁bal', 'king', '▁bi', 'king', '▁hi', 'king', '▁roll', 'ing', '▁scroll', 'ing'], tokens) @@ -74,7 +74,7 @@ def test_xlm_roberta_base_en_book_examples(self): self.assertEqual(['▁adversari', 'al'], tokens) def test_bert_base_multilingual_en_book_examples(self): - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('walking talking balking biking hiking rolling scrolling') self.assertEqual(['walking', 'talking', 'bal', '##king', 'bi', '##king', 'hi', '##king', 'rolling', 'sc', '##roll', '##ing'], tokens) @@ -88,7 +88,7 @@ def 
test_bert_base_multilingual_en_book_examples(self): tokens = tokenizer.tokenize('adversarial') self.assertEqual(['ad', '##versari', '##al'], tokens) - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('walking talking balking biking hiking') self.assertEqual(['walking', 'talking', 'bal', '##king', 'bi', '##king', 'hi', '##king'], tokens) @@ -117,7 +117,7 @@ def test_lucene_analyzer_en_book_examples(self): self.assertEqual(['adversari'], tokens) def test_bert_base_multilingual_fr_book_examples(self): - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', clean_up_tokenization_spaces=True) # walking talking biking hiking rolling scrolling tokens = tokenizer.tokenize('marche parler vélo randonnée rouler défilement') @@ -135,7 +135,7 @@ def test_bert_base_multilingual_fr_book_examples(self): tokens = tokenizer.tokenize('antagoniste') self.assertEqual(['ant', '##ago', '##niste'], tokens) - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', clean_up_tokenization_spaces=True) # walking talking biking hiking rolling scrolling tokens = tokenizer.tokenize('marche parler vélo randonnée rouler défilement') @@ -169,7 +169,7 @@ def test_lucene_analyzer_fr_book_examples(self): self.assertEqual(['antagonist'], tokens) def test_bert_base_multilingual_zh_book_examples(self): - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', clean_up_tokenization_spaces=True) # walking talking biking hiking rolling scrolling tokens = tokenizer.tokenize('走路说话骑自行车远足滚动滚动') @@ -187,7 +187,7 @@ def 
test_bert_base_multilingual_zh_book_examples(self): tokens = tokenizer.tokenize('对抗的') self.assertEqual(['对', '抗', '的'], tokens) - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', clean_up_tokenization_spaces=True) # walking talking biking hiking rolling scrolling tokens = tokenizer.tokenize('走路说话骑自行车远足滚动滚动') @@ -221,7 +221,7 @@ def test_lucene_analyzer_zh_book_examples(self): self.assertEqual(['对抗', '抗的'], tokens) def test_bert_base_multilingual_ar_book_examples(self): - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', clean_up_tokenization_spaces=True) # walking talking biking hiking rolling scrolling tokens = tokenizer.tokenize('المشي الحديث ركوب الدراجات المشي لمسافات طويلة المتداول التمرير') @@ -239,7 +239,7 @@ def test_bert_base_multilingual_ar_book_examples(self): tokens = tokenizer.tokenize('عدائي') self.assertEqual(['ع', '##دا', '##يي'], tokens) - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', clean_up_tokenization_spaces=True) # walking talking biking hiking rolling scrolling tokens = tokenizer.tokenize('المشي الحديث ركوب الدراجات المشي لمسافات طويلة المتداول التمرير') @@ -258,7 +258,7 @@ def test_bert_base_multilingual_ar_book_examples(self): self.assertEqual(['ع', '##دا', '##ئي'], tokens) def test_bert_base_multilingual_hi_book_examples(self): - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', clean_up_tokenization_spaces=True) # walking talking biking hiking rolling scrolling tokens = tokenizer.tokenize('चलने की बात करते हुए बाइक चलाना लंबी पैदल यात्रा स्क्रॉल') @@ -276,7 +276,7 @@ def test_bert_base_multilingual_hi_book_examples(self): tokens = 
tokenizer.tokenize('विरोधात्मक') self.assertEqual(['वि', '##रो', '##धा', '##तमक'], tokens) - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', clean_up_tokenization_spaces=True) # walking talking biking hiking rolling scrolling tokens = tokenizer.tokenize('चलने की बात करते हुए बाइक चलाना लंबी पैदल यात्रा स्क्रॉल') @@ -295,7 +295,7 @@ def test_bert_base_multilingual_hi_book_examples(self): self.assertEqual(['वि', '##रो', '##धा', '##त्मक'], tokens) def test_bert_base_multilingual_bn_book_examples(self): - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', clean_up_tokenization_spaces=True) # walking talking biking hiking rolling scrolling tokens = tokenizer.tokenize('হাঁটাচলা বাইকিং হাইকিং রোলিং স্ক্রোলিং') @@ -313,7 +313,7 @@ def test_bert_base_multilingual_bn_book_examples(self): tokens = tokenizer.tokenize('প্রতিকূল') self.assertEqual(['পরতি', '##ক', '##ল'], tokens) - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', clean_up_tokenization_spaces=True) # walking talking biking hiking rolling scrolling tokens = tokenizer.tokenize('হাঁটাচলা বাইকিং হাইকিং রোলিং স্ক্রোলিং') @@ -330,12 +330,12 @@ def test_bert_base_multilingual_bn_book_examples(self): # adversarial tokens = tokenizer.tokenize('প্রতিকূল') self.assertEqual(['প্রতি', '##ক', '##ূ', '##ল'], tokens) - + def test_bert_base_multilingual_am(self): """ amharic """ - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('የሽፋኑ ርዕሰ ጉዳይ የሞቱ ሰዎች ይነሳሉ') self.assertEqual(['[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]'], tokens) @@ -343,31 +343,31 @@ def 
test_bert_base_multilingual_am(self): tokens = tokenizer.tokenize('የሽፋኑ') self.assertEqual(['[UNK]'], tokens) - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('የሽፋኑ ርዕሰ ጉዳይ የሞቱ ሰዎች ይነሳሉ') self.assertEqual(['[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]'], tokens) tokens = tokenizer.tokenize('የሽፋኑ') self.assertEqual(['[UNK]'], tokens) - + def test_xlmr_base_multilingual_am(self): """ amharic """ - tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base') + tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('የሽፋኑ ርዕሰ ጉዳይ የሞቱ ሰዎች ይነሳሉ') self.assertEqual(['▁የ', 'ሽ', 'ፋ', 'ኑ', '▁ርዕሰ', '▁ጉዳይ', '▁የ', 'ሞቱ', '▁ሰዎች', '▁ይ', 'ነሳ', 'ሉ'], tokens) tokens = tokenizer.tokenize('የሽፋኑ') self.assertEqual(['▁የ', 'ሽ', 'ፋ', 'ኑ'], tokens) - + def test_bert_base_multilingual_ha(self): """ hausa """ - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Ya san kungiyar, ya san komai game da kungiyar') self.assertEqual(['ya', 'san', 'kung', '##iya', '##r', ',', 'ya', 'san', 'koma', '##i', 'game', 'da', 'kung', '##iya', '##r'], tokens) @@ -375,19 +375,19 @@ def test_bert_base_multilingual_ha(self): tokens = tokenizer.tokenize('kungiyar') self.assertEqual(['kung', '##iya', '##r'], tokens) - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Ya san kungiyar, ya san komai game da kungiyar') self.assertEqual(['Ya', 'san', 'kung', '##iya', '##r', ',', 'ya', 'san', 'koma', '##i', 'game', 'da', 'kung', '##iya', '##r'], tokens) tokens 
= tokenizer.tokenize('kungiyar') self.assertEqual(['kung', '##iya', '##r'], tokens) - + def test_xlmr_base_multilingual_ha(self): """ hausa """ - tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base') + tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Ya san kungiyar, ya san komai game da kungiyar') self.assertEqual(['▁Ya', '▁san', '▁kungiyar', ',', '▁ya', '▁san', '▁koma', 'i', '▁game', '▁da', '▁kungiyar'], tokens) @@ -399,7 +399,7 @@ def test_bert_base_multilingual_ig(self): """ igbo """ - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Oke Ọñụ Adaa Dịka Lọọlọ Ezenneka gbàrà Ahọ Otu Narị') self.assertEqual(['ok', '##e', 'onu', 'ada', '##a', 'dik', '##a', 'lo', '##olo', 'ezen', '##nek', '##a', 'gba', '##ra', 'ah', '##o', 'ot', '##u', 'nar', '##i'], tokens) @@ -407,19 +407,19 @@ def test_bert_base_multilingual_ig(self): tokens = tokenizer.tokenize('Ezenneka') self.assertEqual(['ezen', '##nek', '##a'], tokens) - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Oke Ọñụ Adaa Dịka Lọọlọ Ezenneka gbàrà Ahọ Otu Narị') self.assertEqual(['Ok', '##e', 'Ọ', '##ñ', '##ụ', 'Ada', '##a', 'D', '##ị', '##ka', 'L', '##ọ', '##ọ', '##l', '##ọ', 'Ezen', '##nek', '##a', 'g', '##bà', '##rà', 'Ah', '##ọ', 'O', '##tu', 'Na', '##r', '##ị'], tokens) tokens = tokenizer.tokenize('Ezenneka') self.assertEqual(['Ezen', '##nek', '##a'], tokens) - + def test_xlmr_base_multilingual_ig(self): """ igbo """ - tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base') + tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', clean_up_tokenization_spaces=True) tokens = 
tokenizer.tokenize('Oke Ọñụ Adaa Dịka Lọọlọ Ezenneka gbàrà Ahọ Otu Narị') self.assertEqual(['▁O', 'ke', '▁', 'Ọ', 'ñ', 'ụ', '▁Ada', 'a', '▁D', 'ị', 'ka', '▁L', 'ọ', 'ọ', 'l', 'ọ', '▁Ezen', 'nek', 'a', '▁', 'gb', 'à', 'rà', '▁Ah', 'ọ', '▁O', 'tu', '▁Nar', 'ị'], tokens) @@ -431,7 +431,7 @@ def test_bert_base_multilingual_om(self): """ Afaan Oromoo """ - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Ani obbolaa keessan, Abdii Baalee Oromiyaatii') self.assertEqual(['ani', 'ob', '##bola', '##a', 'ke', '##essa', '##n', ',', 'abd', '##ii', 'ba', '##ale', '##e', 'oro', '##mi', '##ya', '##atii'], tokens) @@ -439,19 +439,19 @@ def test_bert_base_multilingual_om(self): tokens = tokenizer.tokenize('Oromiyaatii') self.assertEqual(['oro', '##mi', '##ya', '##atii'], tokens) - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Ani obbolaa keessan, Abdii Baalee Oromiyaatii') self.assertEqual(['Ani', 'ob', '##bola', '##a', 'ke', '##essa', '##n', ',', 'Abd', '##ii', 'Ba', '##ale', '##e', 'Oro', '##mi', '##ya', '##ati', '##i'], tokens) tokens = tokenizer.tokenize('Oromiyaatii') self.assertEqual(['Oro', '##mi', '##ya', '##ati', '##i'], tokens) - + def test_xlmr_base_multilingual_om(self): """ Afaan Oromoo """ - tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base') + tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Ani obbolaa keessan, Abdii Baalee Oromiyaatii') self.assertEqual(['▁Ani', '▁ob', 'bola', 'a', '▁keessa', 'n', ',', '▁Ab', 'dii', '▁Ba', 'ale', 'e', '▁Oromiyaa', 'tii'], tokens) @@ -463,7 +463,7 @@ def test_bert_base_multilingual_pcm(self): """ Nigerian Pidgin """ 
- tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Crude oil dey kill pickin for Nigeria?') self.assertEqual(['cru', '##de', 'oil', 'de', '##y', 'kill', 'pick', '##in', 'for', 'nigeria', '?'], tokens) @@ -471,19 +471,19 @@ def test_bert_base_multilingual_pcm(self): tokens = tokenizer.tokenize('wahala') self.assertEqual(['wah', '##ala'], tokens) - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Crude oil dey kill pickin for Nigeria?') self.assertEqual(['C', '##rude', 'oil', 'de', '##y', 'kill', 'pick', '##in', 'for', 'Nigeria', '?'], tokens) tokens = tokenizer.tokenize('wahala') self.assertEqual(['wa', '##hala'], tokens) - + def test_xlmr_base_multilingual_pcm(self): """ Nigerian Pidgin """ - tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base') + tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Crude oil dey kill pickin for Nigeria?') self.assertEqual(['▁Cru', 'de', '▁oil', '▁de', 'y', '▁kill', '▁pick', 'in', '▁for', '▁Nigeria', '?'], tokens) @@ -495,7 +495,7 @@ def test_bert_base_multilingual_so(self): """ Somali """ - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Rabbigu wuxuu amar ku bixiyey in la dumiyo qalcadaha Kancaan.') self.assertEqual(['rabbi', '##gu', 'wu', '##xu', '##u', 'amar', 'ku', 'bi', '##xi', '##ye', '##y', 'in', 'la', 'dum', '##iy', '##o', 'qal', '##cada', '##ha', 'kan', '##ca', '##an', '.'], tokens) @@ -503,19 +503,19 @@ def test_bert_base_multilingual_so(self): 
tokens = tokenizer.tokenize('bixiyey') self.assertEqual(['bi', '##xi', '##ye', '##y'], tokens) - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Rabbigu wuxuu amar ku bixiyey in la dumiyo qalcadaha Kancaan.') self.assertEqual(['Rabbi', '##gu', 'w', '##ux', '##uu', 'amar', 'ku', 'bi', '##xi', '##ye', '##y', 'in', 'la', 'dum', '##iyo', 'q', '##al', '##cada', '##ha', 'Kan', '##ca', '##an', '.'], tokens) tokens = tokenizer.tokenize('bixiyey') self.assertEqual(['bi', '##xi', '##ye', '##y'], tokens) - + def test_xlmr_base_multilingual_so(self): """ Somali """ - tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base') + tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Rabbigu wuxuu amar ku bixiyey in la dumiyo qalcadaha Kancaan.') self.assertEqual(['▁Rabbi', 'gu', '▁wuxuu', '▁amar', '▁ku', '▁bixi', 'yey', '▁in', '▁la', '▁dum', 'iyo', '▁qal', 'cada', 'ha', '▁Kan', 'ca', 'an', '.'], tokens) @@ -527,7 +527,7 @@ def test_bert_base_multilingual_sw(self): """ Swahili """ - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Huduma ya upasuaji mkubwa na mdogo') self.assertEqual(['hu', '##dum', '##a', 'ya', 'up', '##asu', '##aji', 'mk', '##ubwa', 'na', 'md', '##ogo'], tokens) @@ -535,19 +535,19 @@ def test_bert_base_multilingual_sw(self): tokens = tokenizer.tokenize('upasuaji') self.assertEqual(['up', '##asu', '##aji'], tokens) - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Huduma ya upasuaji mkubwa na 
mdogo') self.assertEqual(['Hu', '##dum', '##a', 'ya', 'up', '##asu', '##aji', 'mk', '##ub', '##wa', 'na', 'm', '##dogo'], tokens) tokens = tokenizer.tokenize('upasuaji') self.assertEqual(['up', '##asu', '##aji'], tokens) - + def test_xlmr_base_multilingual_sw(self): """ Swahili """ - tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base') + tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Huduma ya upasuaji mkubwa na mdogo') self.assertEqual(['▁Huduma', '▁ya', '▁up', 'asu', 'aji', '▁mkubwa', '▁na', '▁mdogo'], tokens) @@ -559,7 +559,7 @@ def test_bert_base_multilingual_ti(self): """ Tigrinya """ - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('ስርዓተ ቀብሪ ኢንጂነር ስመኘው በቀለ ትማሊ ተፈፂሙ') self.assertEqual(['[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]'], tokens) @@ -567,19 +567,19 @@ def test_bert_base_multilingual_ti(self): tokens = tokenizer.tokenize('ኢንጂነር') self.assertEqual(['[UNK]'], tokens) - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('ስርዓተ ቀብሪ ኢንጂነር ስመኘው በቀለ ትማሊ ተፈፂሙ') self.assertEqual(['[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]'], tokens) tokens = tokenizer.tokenize('ኢንጂነር') self.assertEqual(['[UNK]'], tokens) - + def test_xlmr_base_multilingual_ti(self): """ Tigrinya """ - tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base') + tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('ስርዓተ ቀብሪ ኢንጂነር ስመኘው በቀለ ትማሊ ተፈፂሙ') self.assertEqual(['▁ስር', 'ዓ', 'ተ', '▁ቀ', 'ብሪ', '▁ኢን', 'ጂ', 'ነ', 'ር', '▁ስ', 'መ', 'ኘ', 'ው', '▁በቀለ', '▁ት', 'ማ', 'ሊ', '▁ተፈ', 'ፂ', 
'ሙ'], tokens) @@ -591,7 +591,7 @@ def test_bert_base_multilingual_yo(self): """ Yoruba """ - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Orúkọ ọmọbinrin rẹ̀ àgbà ni Merabu, ti èyí àbúrò ni Mikali.') self.assertEqual(['oru', '##ko', 'omo', '##bin', '##rin', 're', 'ag', '##ba', 'ni', 'mera', '##bu', ',', 'ti', 'e', '##yi', 'abu', '##ro', 'ni', 'mika', '##li', '.'], tokens) @@ -599,19 +599,19 @@ def test_bert_base_multilingual_yo(self): tokens = tokenizer.tokenize('ọmọbinrin') self.assertEqual(['omo', '##bin', '##rin'], tokens) - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Orúkọ ọmọbinrin rẹ̀ àgbà ni Merabu, ti èyí àbúrò ni Mikali.') self.assertEqual(['Or', '##ú', '##k', '##ọ', 'ọ', '##m', '##ọ', '##bin', '##rin', 'r', '##ẹ̀', 'à', '##g', '##bà', 'ni', 'Mer', '##abu', ',', 'ti', 'è', '##y', '##í', 'à', '##b', '##úr', '##ò', 'ni', 'Mika', '##li', '.'], tokens) tokens = tokenizer.tokenize('ọmọbinrin') self.assertEqual(['ọ', '##m', '##ọ', '##bin', '##rin'], tokens) - + def test_xlmr_base_multilingual_yo(self): """ Yoruba """ - tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base') + tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('Orúkọ ọmọbinrin rẹ̀ àgbà ni Merabu, ti èyí àbúrò ni Mikali.') self.assertEqual(['▁O', 'rú', 'k', 'ọ', '▁', 'ọ', 'm', 'ọ', 'bin', 'rin', '▁r', 'ẹ', '̀', '▁à', 'gb', 'à', '▁ni', '▁Mera', 'bu', ',', '▁ti', '▁è', 'y', 'í', '▁à', 'bú', 'rò', '▁ni', '▁Mi', 'kali', '.'], tokens) @@ -620,11 +620,11 @@ def test_xlmr_base_multilingual_yo(self): self.assertEqual(['▁', 'ọ', 'm', 'ọ', 'bin', 'rin'], tokens) def test_doc2query(self): - 
tokenizer = T5Tokenizer.from_pretrained('castorini/doc2query-t5-base-msmarco', legacy=True) + tokenizer = T5Tokenizer.from_pretrained('castorini/doc2query-t5-base-msmarco', legacy=True, clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('I have a new GPU!') self.assertEqual(['▁I', '▁have', '▁', 'a', '▁new', '▁GPU', '!'], tokens) - tokenizer = T5Tokenizer.from_pretrained('castorini/doc2query-t5-base-msmarco', legacy=True) + tokenizer = T5Tokenizer.from_pretrained('castorini/doc2query-t5-base-msmarco', legacy=True, clean_up_tokenization_spaces=True) tokens = tokenizer.tokenize('walking talking biking scrolling') self.assertEqual(['▁walking', '▁talking', '▁biking', '▁scroll', 'ing'], tokens)