Run anserini+BM25 baseline on PubMed and DBLP (#765)

* add script to convert and run pubmed and dblp * updated openresearch docs to add comparison table; update key terms result
castorini · Aug 10, 2019 · 5b29d16 · 5b29d16
1 parent c429e29
commit 5b29d16
Show file tree

Hide file tree

Showing 7 changed files with 502 additions and 15 deletions.
diff --git a/docs/experiments-openresearch.md b/docs/experiments-openresearch.md
@@ -124,11 +124,11 @@ recall_1000           	all	0.3628
 The output of using key terms in title and abstract as query should be:
 
 ```
-map                   	all	0.0412
-recip_rank            	all	0.2521
-P_20                  	all	0.0546
-recall_20             	all	0.0790
-recall_1000           	all	0.2818
+map                     all 0.0528
+recip_rank              all 0.2202
+P_20                    all 0.0428
+recall_20               all 0.1022
+recall_1000             all 0.3344
 ```
 
 
@@ -139,6 +139,103 @@ The table below compares our BM25 results against Bhagavatula's et. al (2018):
 | BM25 (Bhagavatula et. al, 2018) | 0.058 | 0.218 |
 | BM25 (Anserini, Ours, title)    | 0.063 | 0.244 |
 | BM25 (Anserini, Ours, title+abstract)| 0.095 | 0.351 |
-| BM25 (Anserini, Ours, key terms)| 0.065 | 0.251 |
+| BM25 (Anserini, Ours, key terms)| 0.060 | 0.220 |
 
 
+## Extra Baseline on PubMed and DBLP
+
+### PubMed and DBLP dataset
+
+Follow [citeomatic's repo](https://github.com/allenai/citeomatic/tree/44dc210c82515b5d4c5a96f5aafcb9b6e48206af) to download the necessary data.
+
+The steps are similar to those for running the baseline on OpenResearch. To run all three experiments on PubMed and DBLP quickly, run:
+
+`./src/main/python/openresearch/run_pubmed_dblp.sh -citeomatic_data <YOUR CITEOMATIC_DATA_ROOT> -output_folder <YOUR_OUTPUT_FOLDER>`
+
+The results are as follows:
+
+The output of using PubMed title as query
+
+```
+map                     all     0.1615
+recip_rank              all     0.5844
+P_20                    all     0.2034
+recall_20               all     0.1954
+recall_1000             all     0.6536
+f1_20                   all     0.199
+```
+
+The output of using PubMed key terms from title and abstract as query
+
+```
+map                     all     0.1637
+recip_rank              all     0.5953
+P_20                    all     0.2058
+recall_20               all     0.1969
+recall_1000             all     0.6041
+f1_20                   all     0.201
+```
+
+The output of using PubMed title + abstract as query
+
+```
+map                     all     0.2361
+recip_rank              all     0.7208
+P_20                    all     0.2726
+recall_20               all     0.2632
+recall_1000             all     0.7649
+f1_20                   all     0.268
+```
+
+The output of using DBLP title as query
+
+```
+map                     all     0.1056
+recip_rank              all     0.4244
+P_20                    all     0.1090
+recall_20               all     0.1721
+recall_1000             all     0.5511
+f1_20                   all     0.133
+```
+
+The output of using DBLP key terms from title and abstract as query
+
+```
+map                     all     0.1015
+recip_rank              all     0.4254
+P_20                    all     0.1059
+recall_20               all     0.1669
+recall_1000             all     0.5099
+f1_20                   all     0.130
+```
+
+The output of using DBLP title + abstract as query
+
+```
+map                     all     0.1687
+recip_rank              all     0.5851
+P_20                    all     0.1586
+recall_20               all     0.2511
+recall_1000             all     0.6913
+f1_20                   all     0.194
+```
+
+The table below compares our BM25 results against those of Bhagavatula et al. (2018):
+
+**PubMed**
+
+|                                 | F1@20 |  MRR  |
+|----------|:-------------:|------:|
+| BM25 (Bhagavatula et. al, 2018) | 0.209 | 0.574 |
+| BM25 (Anserini, Ours, title)    | 0.199 | 0.584 |
+| BM25 (Anserini, Ours, key terms)| 0.201 | 0.595 |
+| BM25 (Anserini, Ours, title+abstract)| 0.268 | 0.721 |
+
+**DBLP**
+
+|                                 | F1@20 |  MRR  |
+|----------|:-------------:|------:|
+| BM25 (Bhagavatula et. al, 2018) | 0.119 | 0.425 |
+| BM25 (Anserini, Ours, title)    | 0.133 | 0.424 |
+| BM25 (Anserini, Ours, key terms)| 0.130 | 0.425 |
+| BM25 (Anserini, Ours, title+abstract)| 0.194 | 0.585 |
diff --git a/src/main/python/openresearch/convert_openresearch_to_whoosh_index.py b/src/main/python/openresearch/convert_openresearch_to_whoosh_index.py
@@ -9,6 +9,29 @@
 from whoosh.fields import *
 
 
+def get_id_years(file_paths):
+  """Map every paper id that has a publication year to that year.
+
+  Args:
+    file_paths: paths of gzip-compressed files holding one JSON object per
+      line; each object has an 'id' field and optionally a 'year' field.
+
+  Returns:
+    A dict from paper id to year (as int), built from the (id, year) pairs
+    sorted by ascending year. Papers without a 'year' field are left out.
+  """
+  print('Collecting paper ids and their publication years...')
+  pairs = []
+  for path in file_paths:
+    with gzip.open(path) as gz_file:
+      for line_idx, raw_line in enumerate(gz_file):
+        paper = json.loads(raw_line.strip())
+        paper_id = paper['id']
+        if 'year' in paper:
+          pairs.append((paper_id, int(paper['year'])))
+          # Progress report; line_idx restarts at 0 for every input file.
+          if line_idx % 100000 == 0:
+            print('Processed {} lines. Collected {} docs.'.format(
+                line_idx + 1, len(pairs)))
+
+  print('Sorting papers by year...')
+  # sorted() is stable, so papers of the same year keep their file order.
+  return dict(sorted(pairs, key=lambda pair: pair[1]))
+
+
 def create_dataset(args):
     print('Converting data...')
 
@@ -35,14 +58,15 @@ def create_dataset(args):
     whoosh_index = create_in(args.whoosh_index, schema)
     writer = whoosh_index.writer()
 
+    id_years = get_id_years(file_paths)
+    doc_ids = set(id_years.keys())
     line_num = 0
     start_time = time.time()
     for file_num, file_path in enumerate(file_paths):
         with gzip.open(file_path) as f:
             for line in f:
                 obj = json.loads(line.strip())
                 doc_id = obj['id']
-
                 writer.add_document(id=doc_id, title=obj['title'], abstract=obj['paperAbstract'])
                 line_num += 1
                 if line_num % 100000 == 0:

diff --git a/src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py b/src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py
@@ -0,0 +1,196 @@
+import argparse
+import gzip
+import json
+import os
+import time
+from collections import defaultdict
+
+
+def clean(text):
+    """Return `text` with every newline and tab replaced by one space."""
+    # One translate() pass maps both characters; all others are kept as-is.
+    return text.translate({ord('\n'): ' ', ord('\t'): ' '})
+
+
+def get_ids(start, end, year_ids):
+    """Collect the ids of all papers published from `start` to `end`.
+
+    Args:
+        start: first year of the range (inclusive).
+        end: last year of the range (inclusive).
+        year_ids: mapping from year to a list of paper ids; it is indexed
+            with every year of the range.
+
+    Returns:
+        The set of paper ids listed under any year of the range.
+    """
+    return {
+        paper_id
+        for year in range(start, end + 1)
+        for paper_id in year_ids[year]
+    }
+
+
+def get_id_years(file_name, data_type):
+    """Group paper ids by publication year and split them into train/dev/test.
+
+    Args:
+        file_name: path of a text file with one JSON object per line; each
+            object has an 'id' field and optionally a 'year' field.
+        data_type: 'dblp' or 'pubmed'; selects the year ranges of the splits.
+
+    Returns:
+        A tuple (train_ids, dev_ids, test_ids, year_ids): three sets of paper
+        ids and the year -> list-of-ids mapping they were built from. Papers
+        without a 'year' field appear in none of them.
+
+    Raises:
+        KeyError: if `data_type` is neither 'dblp' nor 'pubmed'.
+    """
+    # Inclusive (first_year, last_year) range of each split per dataset.
+    train_ranges = {'dblp': (1966, 2007), 'pubmed': (1966, 2008)}
+    dev_ranges = {'dblp': (2008, 2008), 'pubmed': (2009, 2009)}
+    test_ranges = {'dblp': (2009, 2011), 'pubmed': (2010, 2013)}
+
+    # Look the ranges up before reading the corpus so that an unknown
+    # data_type fails immediately instead of after a full pass over the file.
+    train_start, train_end = train_ranges[data_type]
+    dev_start, dev_end = dev_ranges[data_type]
+    test_start, test_end = test_ranges[data_type]
+
+    print('Collecting paper ids and their publication years...')
+    year_ids = defaultdict(list)
+    # len(year_ids) is the number of distinct years, so papers are counted
+    # separately for the progress message.
+    num_docs = 0
+    with open(file_name) as f:
+        for line_num, line in enumerate(f):
+            obj = json.loads(line.strip())
+            doc_id = obj['id']
+            if 'year' not in obj:
+                continue
+            year = int(obj['year'])
+
+            year_ids[year].append(doc_id)
+            num_docs += 1
+            if line_num % 1000000 == 0:
+                print('Processed {} lines. Collected {} docs.'.format(
+                    line_num + 1, num_docs))
+
+    train_ids = get_ids(train_start, train_end, year_ids)
+    dev_ids = get_ids(dev_start, dev_end, year_ids)
+    test_ids = get_ids(test_start, test_end, year_ids)
+
+    print('Collected {}, {}, {} papers for training, dev, and test sets.'.format(
+        len(train_ids), len(dev_ids), len(test_ids)))
+
+    return train_ids, dev_ids, test_ids, year_ids
+
+
+def _select_citations(out_citations, doc_id, doc_year, id_years, train_ids):
+    """Return the set of citations of a paper that count as relevant documents.
+
+    Every id in `train_ids` must be a key of `id_years`.
+    """
+    # Candidates are restricted to train_ids, following Bhagavatula's setting;
+    # this also drops citations that are not part of the converted corpus.
+    return {
+        cited_id for cited_id in out_citations
+        if cited_id in train_ids
+        and cited_id != doc_id  # no self citations
+        and id_years[cited_id] <= doc_year  # cited paper must not be newer
+    }
+
+
+def create_dataset(args):
+    """Convert a citeomatic corpus into Anserini documents, queries and qrels.
+
+    Reads the file corpus.json (one JSON paper per line) found in
+    `args.collection_path` and writes into `args.output_folder`:
+      * corpus/docsNN.json: JSONL documents with 'id' and 'contents', at most
+        `args.max_docs_per_file` documents per file;
+      * queries.{train,dev,test}.tsv: one tab-separated (paper id, query)
+        line per query paper;
+      * qrels.{train,dev,test}: one '<paper id> 0 <cited id> 1' line per
+        relevant citation;
+      * candidates.txt: the ids of the training-split papers.
+
+    Only papers whose year falls into one of the splits are converted, and
+    only papers with at least 10 relevant citations become queries.
+
+    Args:
+        args: namespace with the attributes collection_path, output_folder,
+            data_type, max_docs_per_file and use_abstract_in_query.
+    """
+    print('Converting data...')
+
+    # The documents go into a sub-folder that may not exist yet.
+    corpus_folder = os.path.join(args.output_folder, 'corpus')
+    if not os.path.isdir(corpus_folder):
+        os.makedirs(corpus_folder)
+
+    set_names = ['train', 'dev', 'test']
+    queries_files = {}
+    qrels_files = {}
+    output_jsonl_file = None
+    try:
+        for set_name in set_names:
+            queries_filepath = os.path.join(
+                args.output_folder, 'queries.{}.tsv'.format(set_name))
+            qrels_filepath = os.path.join(
+                args.output_folder, 'qrels.{}'.format(set_name))
+            queries_files[set_name] = open(queries_filepath, 'w')
+            qrels_files[set_name] = open(qrels_filepath, 'w')
+
+        file_name = os.path.join(args.collection_path, 'corpus.json')
+
+        train_ids, dev_ids, test_ids, year_ids = get_id_years(
+            file_name=file_name, data_type=args.data_type)
+
+        doc_ids = train_ids | dev_ids | test_ids
+
+        # Write train_ids to file for future use.
+        candidates_path = os.path.join(args.output_folder, 'candidates.txt')
+        with open(candidates_path, 'w') as candidates_file:
+            for train_id in train_ids:
+                candidates_file.write(train_id + '\n')
+
+        # Invert year -> ids into id -> year (as int) for the year filter.
+        id_years = {}
+        for year, ids_of_year in year_ids.items():
+            for paper_id in ids_of_year:
+                id_years[paper_id] = year
+
+        n_docs = 0
+        file_index = 0
+        num_examples = {set_name: 0 for set_name in set_names}
+
+        with open(file_name) as f:
+            for line in f:
+                obj = json.loads(line.strip())
+                doc_id = obj['id']
+                if doc_id not in doc_ids:
+                    continue
+
+                # Start a new jsonl file every max_docs_per_file documents.
+                if n_docs % args.max_docs_per_file == 0:
+                    if output_jsonl_file is not None:
+                        output_jsonl_file.close()
+                    output_path = os.path.join(
+                        corpus_folder, 'docs{:02d}.json'.format(file_index))
+                    output_jsonl_file = open(output_path, 'w')
+                    file_index += 1
+                doc_text = clean('[Title]: {} [Abstract]: {}'.format(
+                    obj['title'], obj['abstract']))
+                output_dict = {'id': doc_id, 'contents': doc_text}
+                output_jsonl_file.write(json.dumps(output_dict) + '\n')
+                n_docs += 1
+
+                # Compare against the parsed int year rather than the raw
+                # obj['year'], which may be stored as a string.
+                out_citations = _select_citations(
+                    obj['out_citations'], doc_id, id_years[doc_id],
+                    id_years, train_ids)
+
+                # Skip papers that have fewer than 10 relevant citations.
+                if len(out_citations) < 10:
+                    continue
+
+                # doc_id is in doc_ids, so it belongs to one of the splits.
+                if doc_id in train_ids:
+                    set_name = 'train'
+                elif doc_id in dev_ids:
+                    set_name = 'dev'
+                else:
+                    set_name = 'test'
+                num_examples[set_name] += 1
+
+                doc_title = clean(obj['title'])
+                if args.use_abstract_in_query:
+                    doc_abstract = clean(obj['abstract'])
+                    query = '[Title]: ' + doc_title + ' [Abstract]: ' + doc_abstract
+                else:
+                    query = doc_title
+                queries_files[set_name].write('{}\t{}\n'.format(doc_id, query))
+                qrels_file = qrels_files[set_name]
+                for out_citation in out_citations:
+                    qrels_file.write('{} 0 {} 1\n'.format(doc_id, out_citation))
+
+        print('Examples: {} train, {} valid, {} test'.format(
+            num_examples['train'], num_examples['dev'], num_examples['test']))
+    finally:
+        # Close every output file, also when the conversion fails midway.
+        if output_jsonl_file is not None:
+            output_jsonl_file.close()
+        for queries_file in queries_files.values():
+            queries_file.close()
+        for qrels_file in qrels_files.values():
+            qrels_file.close()
+
+
+if __name__ == '__main__':
+    # Command-line entry point: parse the options, make sure the output
+    # folders exist, then run the conversion.
+    parser = argparse.ArgumentParser(
+        description='Converts a DBLP or PubMed json collection to '
+        'Anserini\'s jsonl files.')
+    parser.add_argument('--collection_path', required=True,
+        help='folder that contains the corpus.json collection file')
+    parser.add_argument('--output_folder', required=True, help='output folder')
+    parser.add_argument('--max_docs_per_file', default=1000000, type=int,
+        help='maximum number of documents in each jsonl file.')
+    # A default is never used on a required option, so none is given; choices
+    # rejects unsupported datasets before any file is read.
+    parser.add_argument('--data_type', required=True, choices=['dblp', 'pubmed'],
+        help='dblp or pubmed')
+    parser.add_argument('--use_abstract_in_query', action='store_true',
+        help='If set, use title and abstract as query. Otherwise, '
+        'use only title.')
+
+    args = parser.parse_args()
+
+    # makedirs also creates the output folder itself; checking the corpus
+    # sub-folder covers the case where the output folder already exists.
+    corpus_folder = os.path.join(args.output_folder, 'corpus')
+    if not os.path.exists(corpus_folder):
+        os.makedirs(corpus_folder)
+
+    create_dataset(args)
+    print('Done!')