This repository has been archived by the owner on Sep 1, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 49
/
Copy pathget_multinli.py
51 lines (40 loc) · 1.66 KB
/
get_multinli.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
from tqdm import tqdm
import jsonlines
import spacy
def main():
root_path = os.path.join('data', 'datasets')
if not os.path.exists(root_path):
os.makedirs(root_path)
dir_name = 'multinli_1.0'
url = 'https://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip'
filename = url.split('/')[-1]
os.chdir(root_path)
os.system('wget %s' % url)
os.system('unzip %s' % filename)
os.system('rm %s' % filename)
filenames = ['multinli_1.0_dev_mismatched', 'multinli_1.0_dev_matched', 'multinli_1.0_train']
spacy_nlp = spacy.load('en')
for filename in filenames:
p = os.path.join(dir_name, filename)
print(p)
with jsonlines.open(p + '.jsonl', mode='r') as f_r, jsonlines.open(p + '.tokenized.jsonl', mode='w') as f_w:
progress = tqdm(f_r, mininterval=1, leave=False)
for line in progress:
sentence1, sentence2 = line['sentence1'], line['sentence2']
sentence1_spacy = ' '.join([t.text for t in spacy_nlp(sentence1)])
sentence2_spacy = ' '.join([t.text for t in spacy_nlp(sentence2)])
f_w.write({'sentence1': sentence1_spacy, 'sentence2': sentence2_spacy, 'label': line['gold_label'],
'genre': line['genre']})
if __name__ == '__main__':
main()