Skip to content

Commit

Permalink
Refactor scripts for generating TREC-COVID baselines (rounds 3, 4, 5) (
Browse files Browse the repository at this point in the history
  • Loading branch information
lintool authored Aug 5, 2020
1 parent bd8fff8 commit 99092a8
Show file tree
Hide file tree
Showing 4 changed files with 163 additions and 413 deletions.
140 changes: 140 additions & 0 deletions src/main/python/trec-covid/covid_baseline_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,146 @@
import pyserini.util


def perform_runs(round_number, indexes):
base_topics = f'src/main/resources/topics-and-qrels/topics.covid-round{round_number}.xml'
udel_topics = f'src/main/resources/topics-and-qrels/topics.covid-round{round_number}-udel.xml'

# Use cumulative qrels from previous round for relevance feedback runs
cumulative_qrels = f'src/main/resources/topics-and-qrels/qrels.covid-round{round_number - 1}-cumulative.txt'

print('')
print('## Running on abstract index...')
print('')

abstract_index = indexes[0]
abstract_prefix = f'anserini.covid-r{round_number}.abstract'
os.system(f'target/appassembler/bin/SearchCollection -index {abstract_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question ' +
f'-removedups -bm25 -hits 10000 ' +
f'-output runs/{abstract_prefix}.qq.bm25.txt -runtag {abstract_prefix}.qq.bm25.txt')

os.system(f'target/appassembler/bin/SearchCollection -index {abstract_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query ' +
f'-removedups -bm25 -hits 10000 ' +
f'-output runs/{abstract_prefix}.qdel.bm25.txt -runtag {abstract_prefix}.qdel.bm25.txt')

os.system(f'target/appassembler/bin/SearchCollection -index {abstract_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query -removedups ' +
f'-bm25 -rm3 -rm3.fbTerms 100 -hits 10000 ' +
f'-rf.qrels {cumulative_qrels} ' +
f'-output runs/{abstract_prefix}.qdel.bm25+rm3Rf.txt -runtag {abstract_prefix}.qdel.bm25+rm3Rf.txt')

print('')
print('## Running on full-text index...')
print('')

full_text_index = indexes[1]
full_text_prefix = f'anserini.covid-r{round_number}.full-text'
os.system(f'target/appassembler/bin/SearchCollection -index {full_text_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question ' +
f'-removedups -bm25 -hits 10000 ' +
f'-output runs/{full_text_prefix}.qq.bm25.txt -runtag {full_text_prefix}.qq.bm25.txt')

os.system(f'target/appassembler/bin/SearchCollection -index {full_text_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query ' +
f'-removedups -bm25 -hits 10000 ' +
f'-output runs/{full_text_prefix}.qdel.bm25.txt -runtag {full_text_prefix}.qdel.bm25.txt')

print('')
print('## Running on paragraph index...')
print('')

paragraph_index = indexes[2]
paragraph_prefix = f'anserini.covid-r{round_number}.paragraph'
os.system(f'target/appassembler/bin/SearchCollection -index {paragraph_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question ' +
f'-removedups -strip_segment_id -bm25 -hits 50000 ' +
f'-output runs/{paragraph_prefix}.qq.bm25.txt -runtag {paragraph_prefix}.qq.bm25.txt')

os.system(f'target/appassembler/bin/SearchCollection -index {paragraph_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query ' +
f'-removedups -strip_segment_id -bm25 -hits 50000 ' +
f'-output runs/{paragraph_prefix}.qdel.bm25.txt -runtag {paragraph_prefix}.qdel.bm25.txt')


def perform_fusion(round_number, run_checksums, check_md5=True):
print('')
print('## Performing fusion...')
print('')

fusion_run1 = f'anserini.covid-r{round_number}.fusion1.txt'
set1 = [f'anserini.covid-r{round_number}.abstract.qq.bm25.txt',
f'anserini.covid-r{round_number}.full-text.qq.bm25.txt',
f'anserini.covid-r{round_number}.paragraph.qq.bm25.txt']

print(f'Performing fusion to create {fusion_run1}')
os.system('PYTHONPATH=../pyserini ' +
'python -m pyserini.fusion --method rrf --runtag reciprocal_rank_fusion_k=60 --k 10000 '
f'--out runs/{fusion_run1} --runs runs/{set1[0]} runs/{set1[1]} runs/{set1[2]}')

if check_md5:
assert pyserini.util.compute_md5(f'runs/{fusion_run1}') == run_checksums[fusion_run1],\
f'Error in producing {fusion_run1}!'

fusion_run2 = f'anserini.covid-r{round_number}.fusion2.txt'
set2 = [f'anserini.covid-r{round_number}.abstract.qdel.bm25.txt',
f'anserini.covid-r{round_number}.full-text.qdel.bm25.txt',
f'anserini.covid-r{round_number}.paragraph.qdel.bm25.txt']

print(f'Performing fusion to create {fusion_run2}')
os.system('PYTHONPATH=../pyserini ' +
'python -m pyserini.fusion --method rrf --runtag reciprocal_rank_fusion_k=60 --k 10000 ' +
f'--out runs/{fusion_run2} --runs runs/{set2[0]} runs/{set2[1]} runs/{set2[2]}')

if check_md5:
assert pyserini.util.compute_md5(f'runs/{fusion_run2}') == run_checksums[fusion_run2],\
f'Error in producing {fusion_run2}!'


def prepare_final_submissions(round_number, run_checksums, check_md5=True):
# Remove teh cumulative qrels from the previous round.
qrels = f'src/main/resources/topics-and-qrels/qrels.covid-round{round_number - 1}-cumulative.txt'

print('')
print('## Preparing final submission files by removing qrels...')
print('')

run1 = f'anserini.final-r{round_number}.fusion1.txt'
print(f'Generating {run1}')
os.system(f'python tools/scripts/filter_run_with_qrels.py --discard --qrels {qrels} ' +
f'--input runs/anserini.covid-r{round_number}.fusion1.txt --output runs/{run1} ' +
f'--runtag r{round_number}.fusion1')
run1_md5 = pyserini.util.compute_md5(f'runs/{run1}')

if check_md5:
assert run1_md5 == run_checksums[run1], f'Error in producing {run1}!'

run2 = f'anserini.final-r{round_number}.fusion2.txt'
print(f'Generating {run2}')
os.system(f'python tools/scripts/filter_run_with_qrels.py --discard --qrels {qrels} ' +
f'--input runs/anserini.covid-r{round_number}.fusion2.txt --output runs/{run2} ' +
f'--runtag r{round_number}.fusion2')
run2_md5 = pyserini.util.compute_md5(f'runs/{run2}')

if check_md5:
assert run2_md5 == run_checksums[run2], f'Error in producing {run2}!'

run3 = f'anserini.final-r{round_number}.rf.txt'
print(f'Generating {run3}')
os.system(f'python tools/scripts/filter_run_with_qrels.py --discard --qrels {qrels} ' +
f'--input runs/anserini.covid-r{round_number}.abstract.qdel.bm25+rm3Rf.txt ' +
f'--output runs/{run3} --runtag r{round_number}.rf')
run3_md5 = pyserini.util.compute_md5(f'runs/{run3}')

if check_md5:
assert run3_md5 == run_checksums[run3], f'Error in producing {run3}!'

print('')
print(f'{run1:<35}{run1_md5}')
print(f'{run2:<35}{run2_md5}')
print(f'{run3:<35}{run3_md5}')


def evaluate_run(run, qrels):
metrics = {}
output = subprocess.check_output(
Expand Down
141 changes: 7 additions & 134 deletions src/main/python/trec-covid/generate_round3_baselines.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,11 @@

"""Perform Anserini baseline runs for TREC-COVID Round 3."""

import hashlib
import os
import sys

from covid_baseline_tools import evaluate_runs, verify_stored_runs

sys.path.insert(0, './')
sys.path.insert(0, '../pyserini/')

from pyserini.util import compute_md5, download_url

from covid_baseline_tools import perform_runs, perform_fusion, prepare_final_submissions,\
evaluate_runs, verify_stored_runs

# This makes errors more readable,
# see https://stackoverflow.com/questions/27674602/hide-traceback-unless-a-debug-flag-is-set
Expand Down Expand Up @@ -91,143 +85,22 @@
}


def perform_runs():
base_topics = 'src/main/resources/topics-and-qrels/topics.covid-round3.xml'
udel_topics = 'src/main/resources/topics-and-qrels/topics.covid-round3-udel.xml'

print('')
print('## Running on abstract index...')
print('')

abstract_index = indexes[0]
abstract_prefix = 'anserini.covid-r3.abstract'
os.system(f'target/appassembler/bin/SearchCollection -index {abstract_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question ' +
f'-removedups -bm25 -hits 10000 ' +
f'-output runs/{abstract_prefix}.qq.bm25.txt -runtag {abstract_prefix}.qq.bm25.txt')

os.system(f'target/appassembler/bin/SearchCollection -index {abstract_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query ' +
f'-removedups -bm25 -hits 10000 ' +
f'-output runs/{abstract_prefix}.qdel.bm25.txt -runtag {abstract_prefix}.qdel.bm25.txt')

os.system(f'target/appassembler/bin/SearchCollection -index {abstract_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query -removedups ' +
f'-bm25 -rm3 -rm3.fbTerms 100 -hits 10000 ' +
f'-rf.qrels src/main/resources/topics-and-qrels/qrels.covid-round12.txt ' +
f'-output runs/{abstract_prefix}.qdel.bm25+rm3Rf.txt -runtag {abstract_prefix}.qdel.bm25+rm3Rf.txt')

print('')
print('## Running on full-text index...')
print('')

full_text_index = indexes[1]
full_text_prefix = 'anserini.covid-r3.full-text'
os.system(f'target/appassembler/bin/SearchCollection -index {full_text_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question ' +
f'-removedups -bm25 -hits 10000 ' +
f'-output runs/{full_text_prefix}.qq.bm25.txt -runtag {full_text_prefix}.qq.bm25.txt')

os.system(f'target/appassembler/bin/SearchCollection -index {full_text_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query ' +
f'-removedups -bm25 -hits 10000 ' +
f'-output runs/{full_text_prefix}.qdel.bm25.txt -runtag {full_text_prefix}.qdel.bm25.txt')

print('')
print('## Running on paragraph index...')
print('')

paragraph_index = indexes[2]
paragraph_prefix = 'anserini.covid-r3.paragraph'
os.system(f'target/appassembler/bin/SearchCollection -index {paragraph_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question ' +
f'-removedups -strip_segment_id -bm25 -hits 50000 ' +
f'-output runs/{paragraph_prefix}.qq.bm25.txt -runtag {paragraph_prefix}.qq.bm25.txt')

os.system(f'target/appassembler/bin/SearchCollection -index {paragraph_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query ' +
f'-removedups -strip_segment_id -bm25 -hits 50000 ' +
f'-output runs/{paragraph_prefix}.qdel.bm25.txt -runtag {paragraph_prefix}.qdel.bm25.txt')


def perform_fusion():
print('')
print('## Performing fusion...')
print('')

fusion_run1 = 'anserini.covid-r3.fusion1.txt'
set1 = ['anserini.covid-r3.abstract.qq.bm25.txt',
'anserini.covid-r3.full-text.qq.bm25.txt',
'anserini.covid-r3.paragraph.qq.bm25.txt']

print(f'Performing fusion to create {fusion_run1}')
os.system('PYTHONPATH=../pyserini ' +
'python -m pyserini.fusion --method rrf --runtag reciprocal_rank_fusion_k=60 --k 10000 ' +
f'--out runs/{fusion_run1} --runs runs/{set1[0]} runs/{set1[1]} runs/{set1[2]}')

assert compute_md5(f'runs/{fusion_run1}') == cumulative_runs[fusion_run1], f'Error in producing {fusion_run1}!'

fusion_run2 = 'anserini.covid-r3.fusion2.txt'
set2 = ['anserini.covid-r3.abstract.qdel.bm25.txt',
'anserini.covid-r3.full-text.qdel.bm25.txt',
'anserini.covid-r3.paragraph.qdel.bm25.txt']

print(f'Performing fusion to create {fusion_run2}')
os.system('PYTHONPATH=../pyserini ' +
'python -m pyserini.fusion --method rrf --runtag reciprocal_rank_fusion_k=60 --k 10000 ' +
f'--out runs/{fusion_run2} --runs runs/{set2[0]} runs/{set2[1]} runs/{set2[2]}')

assert compute_md5(f'runs/{fusion_run2}') == cumulative_runs[fusion_run2], f'Error in producing {fusion_run2}!'


def prepare_final_submissions(qrels):
print('')
print('## Preparing final submission files by removing qrels...')
print('')

run1 = 'anserini.final-r3.fusion1.txt'
print(f'Generating {run1}')
os.system(f'python tools/scripts/filter_run_with_qrels.py --discard --qrels {qrels} ' +
f'--input runs/anserini.covid-r3.fusion1.txt --output runs/{run1} --runtag r3.fusion1')
run1_md5 = compute_md5(f'runs/{run1}')
assert run1_md5 == final_runs[run1], f'Error in producing {run1}!'

run2 = 'anserini.final-r3.fusion2.txt'
print(f'Generating {run2}')
os.system(f'python tools/scripts/filter_run_with_qrels.py --discard --qrels {qrels} ' +
f'--input runs/anserini.covid-r3.fusion2.txt --output runs/{run2} --runtag r3.fusion2')
run2_md5 = compute_md5(f'runs/{run2}')
assert run2_md5 == final_runs[run2], f'Error in producing {run2}!'

run3 = 'anserini.final-r3.rf.txt'
print(f'Generating {run3}')
os.system(f'python tools/scripts/filter_run_with_qrels.py --discard --qrels {qrels} ' +
f'--input runs/anserini.covid-r3.abstract.qdel.bm25+rm3Rf.txt --output runs/{run3} --runtag r3.rf')
run3_md5 = compute_md5(f'runs/{run3}')
assert run3_md5 == final_runs[run3], f'Error in producing {run3}!'

print('')
print(f'{run1:<35}{run1_md5}')
print(f'{run2:<35}{run2_md5}')
print(f'{run3:<35}{run3_md5}')


def main():
if not (os.path.isdir(indexes[0]) and os.path.isdir(indexes[1]) and os.path.isdir(indexes[2])):
print('Required indexes do not exist. Please download first.')

os.system('cat src/main/resources/topics-and-qrels/qrels.covid-round1.txt ' +
'src/main/resources/topics-and-qrels/qrels.covid-round2.txt ' +
'> src/main/resources/topics-and-qrels/qrels.covid-round12.txt')
'> src/main/resources/topics-and-qrels/qrels.covid-round2-cumulative.txt')

round2_cumulative_qrels = 'src/main/resources/topics-and-qrels/qrels.covid-round12.txt'
round2_cumulative_qrels = 'src/main/resources/topics-and-qrels/qrels.covid-round2-cumulative.txt'
round3_qrels = 'src/main/resources/topics-and-qrels/qrels.covid-round3.txt'
round3_cumulative_qrels = 'src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt'

verify_stored_runs(stored_runs)
perform_runs()
perform_fusion()
prepare_final_submissions(round2_cumulative_qrels)
perform_runs(3, indexes)
perform_fusion(3, cumulative_runs, check_md5=True)
prepare_final_submissions(3, final_runs)

evaluate_runs(round2_cumulative_qrels, cumulative_runs)
evaluate_runs(round3_cumulative_qrels, cumulative_runs)
Expand Down
Loading

0 comments on commit 99092a8

Please sign in to comment.