Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add scripts to create MS MARCO v1 Document Jsonl Collection #905

Merged
merged 9 commits into from
Dec 14, 2021
Previous commit
Next commit
update ms marco doc segmented collection creation script
  • Loading branch information
ronakice committed Dec 8, 2021
commit 8da7f2fefcc055173454f02f50de8e786272947d
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ def generate_output_dict(doc):
f_corpus = gzip.open(args.original_docs_path, mode='rt')
f_out = open(args.output_docs_path, 'w')

print('Appending predictions...')
for line in tqdm(f_corpus):
output_dict = generate_output_dict(line.split('\t'))
f_out.write(json.dumps(output_dict) + '\n')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def generate_output_dicts(doc, nlp, max_length, stride):
segment = ' '.join(sentences[i:i + max_length])
doc_text = f'{doc_url}\n{doc_title}\n{segment}'
output_dicts.append({'id': f'{doc_id}#{i}', 'contents': doc_text})
- return output_dict
+ return output_dicts


if __name__ == '__main__':
Expand All @@ -38,7 +38,7 @@ def generate_output_dicts(doc, nlp, max_length, stride):
f_corpus = gzip.open(args.original_docs_path, mode='rt')
f_out = open(args.output_docs_path, 'w')

- print('Appending predictions...')
+ print('Creating collection...')
for line in tqdm(f_corpus):
output_dicts = generate_output_dicts(line.split('\t'), nlp, args.max_length, args.stride)
for output_dict in output_dicts:
Expand Down