-
Notifications
You must be signed in to change notification settings - Fork 4
/
pooledout_to_train.py
45 lines (32 loc) · 1.27 KB
/
pooledout_to_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import os
import argparse
import random
import jsonlines
import json
import re
# Seed the global `random` generator with a fixed value so that any random
# draws are reproducible between runs.
random.seed(42)

#
# config
#
# The only command-line option is the (mandatory) directory that holds the
# training files; it is exposed to the rest of the script as args.train_dir.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--train-dir',
    dest='train_dir',
    action='store',
    required=True,
    help='training file directory location',
)
args = parser.parse_args()
#train_dir = '/mnt/c/Users/sophi/Documents/phd/data/coliee2019/task1/task1_train/train.json'
#train_org_dir = '/mnt/c/Users/sophi/Documents/phd/data/coliee2019/task1/task1_train/train_org.json'
# Build `match`, a guid -> label lookup table, from train_org.json.
# The file is read as JSON Lines: one JSON object per line, each expected to
# carry a 'guid' and a 'label' key (a missing key yields None via .get()).
# If the same guid occurs more than once, the last occurrence wins.
match = {}
# JSON text is UTF-8; decode it explicitly instead of relying on the
# platform's locale default.
with open(os.path.join(args.train_dir, 'train_org.json'),
          encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Blank lines (e.g. a trailing empty line) are not JSON records
            # and would make json.loads raise.
            continue
        record = json.loads(line)
        match[record.get('guid')] = record.get('label')
# Attach a label to every record of train_poolout.json and write the result
# to train_poolout_totrain.json (JSON Lines) in the same directory.
# Each output record has exactly the keys 'guid', 'res' and 'label'.
poolout_path = os.path.join(args.train_dir, 'train_poolout.json')
totrain_path = os.path.join(args.train_dir, 'train_poolout_totrain.json')

# Open the input first: if train_poolout.json is missing, the script fails
# before the output file is created or truncated.
# JSON text is UTF-8; decode it explicitly instead of relying on the
# platform's locale default.
with open(poolout_path, 'r', encoding='utf-8') as f:
    with jsonlines.open(totrain_path, mode='w') as writer:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing empty line) are not JSON
                # records and would make json.loads raise.
                continue
            sample = json.loads(line)
            guid = sample.get('guid')
            # A guid without an entry in `match` is written with
            # 'label': None rather than being dropped.
            writer.write({'guid': guid,
                          'res': sample.get('res'),
                          'label': match.get(guid)})