datasets.py
"""Download and store datasets."""

import collections
import io
import itertools
import logging
import os
import pathlib
import tarfile
import urllib.request

import tokenizer as tokenizr

_LOGGER = logging.getLogger(__name__)


def obtain(dataset_directory):
  """Download, store and encode the datasets.

  Args:
    dataset_directory: The local directory where the downloaded files and the
      encoded outputs are stored.

  Returns:
    A dictionary mapping each split name ('train', 'eval') to a generator of
    encoded samples.
  """
for dataset in itertools.chain(*_DATASETS_.values()):
download_if_not_there(dataset, dataset_directory)
tokenizer = create_tokenizer(_DATASETS_['train'], dataset_directory)
encoded_datasets = {}
for label, datasets in _DATASETS_.items():
full_corpus = read_corpus(datasets, dataset_directory)
encoded_datasets[label] = encode_corpus(
full_corpus, tokenizer, dataset_directory, prefix=label)
return encoded_datasets


Dataset = collections.namedtuple(
    'Dataset', ('url', 'language_1_filename', 'language_2_filename'))
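# Each Dataset names one downloadable archive and the two parallel files
# inside it (for the datasets below, language 1 is English and language 2 is
# German).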

_DATASETS_ = {
'train': (
Dataset(url='http://data.statmt.org/wmt18/translation-task/' +
'training-parallel-nc-v13.tgz',
language_1_filename='training-parallel-nc-v13/' +
'news-commentary-v13.de-en.en',
language_2_filename='training-parallel-nc-v13/' +
'news-commentary-v13.de-en.de'),
Dataset(
url='http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
language_1_filename='commoncrawl.de-en.en',
language_2_filename='commoncrawl.de-en.de'),
Dataset(
url='http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz',
language_1_filename='training/europarl-v7.de-en.en',
language_2_filename='training/europarl-v7.de-en.de'),
),
'eval':
(Dataset(url='http://data.statmt.org/wmt17/translation-task/dev.tgz',
language_1_filename='dev/newstest2013.en',
language_2_filename='dev/newstest2013.de'), ),
}


def download_if_not_there(dataset, local_directory):
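  """Download and extract `dataset` unless both of its files already exist."""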
def does_exist(filename):
return (pathlib.Path(local_directory) / filename).exists()
if not (does_exist(dataset.language_1_filename) and
does_exist(dataset.language_2_filename)):
_LOGGER.info('Downloading %s', dataset.url)
download_and_extract(dataset, local_directory)


def download_and_extract(dataset, destination_directory):
  """Download a tar archive for the dataset and extract it to the local disk.

  Args:
    dataset: The dataset that is going to be downloaded. Only the archive
      members named by `dataset.language_1_filename` and
      `dataset.language_2_filename` are going to be extracted.
    destination_directory: The directory where the archive is going to be
      extracted. Extraction preserves the member paths, so a file stored as
      `dir/file1` inside the archive ends up at
      `destination_directory/dir/file1`.
  """
with urllib.request.urlopen(dataset.url) as response:
download = response.read()
in_memory_file = io.BytesIO(download)
archive = tarfile.open(fileobj=in_memory_file)
def needed_files_only(tarinfo):
return tarinfo.name in (dataset.language_1_filename,
dataset.language_2_filename)
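  # Extract only the two files named by this dataset; everything else in the
  # archive is skipped.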
for untarred_file in filter(needed_files_only, archive.getmembers()):
archive.extract(untarred_file, path=destination_directory)


def create_tokenizer(datasets, directory):
"""Builds a new vocabulary if needed and returns a Tokenizer."""
vocabulary_file = pathlib.Path(directory) / tokenizr.Tokenizer.VOCAB_FILENAME
if vocabulary_file.exists():
_LOGGER.info('Re-using the vocabulary at %s', vocabulary_file)
return tokenizr.Builder.from_file(vocabulary_file)
samples = sample_corpus(datasets, directory,
sample_within_byte_budget=1e6)
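  # 2**15 (= 32768) is presumably the target vocabulary size that the subword
  # Builder aims for; see the `tokenizer` module for the exact meaning of this
  # argument.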
tokenizer = tokenizr.Builder(2**15).from_corpus(samples)
tokenizer.store_to_file(vocabulary_file)
return tokenizer


def sample_corpus(datasets, directory, sample_within_byte_budget=1e6):
  """Sample lines from the datasets.

  Args:
    datasets: Dataset objects that describe what to read.
    directory: The local directory where the datasets are stored.
    sample_within_byte_budget: Approximately how many bytes to read from each
      file, sampled from across the file rather than just its beginning.

  Yields:
    Sampled text lines, one at a time.
  """
def read_lines(filename):
_LOGGER.info('Sampling from %s', filename)
with open(filename, 'r') as opened_file:
counter = 0
remaining_byte_budget = sample_within_byte_budget
file_size = os.path.getsize(filename)
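      # Between samples, skip roughly file_size / (2 * budget) lines so the
      # bytes we do keep are drawn from across the file instead of only its
      # beginning; stop once the byte budget is exhausted.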
for line in opened_file:
if counter < int(file_size / sample_within_byte_budget / 2):
counter += 1
else:
if remaining_byte_budget <= 0:
break
line = line.strip()
remaining_byte_budget -= len(line)
counter = 0
yield line
for dataset in datasets:
local_path = pathlib.Path(directory)
language_1_path = local_path / dataset.language_1_filename
language_2_path = local_path / dataset.language_2_filename
for path in [language_1_path, language_2_path]:
for line in read_lines(path):
yield line


def encode_corpus(corpus, tokenizer, dataset_directory, prefix=None):
  """Encode the whole corpus and store it locally.

  Args:
    corpus: Iterable of dict{inputs,targets} that represents the full
      corpus.
    tokenizer: Tokenizer instance that is going to encode each sample.
    dataset_directory: The local path where the encoded files are going to be
      stored. There is going to be a separate file for inputs and targets.
      If the encoded files are already present then the function is not
      going to repeat the process.
    prefix: An optional prefix for the generated filenames, e.g. 'train' or
      'eval'.

  Returns:
    A generator that reads from all produced files for further processing.
  """
dataset_directory = pathlib.Path(dataset_directory)
inputs_file_path = dataset_directory / (prefix + '-inputs')
targets_file_path = dataset_directory / (prefix + '-targets')
if inputs_file_path.exists() and targets_file_path.exists():
_LOGGER.info('Re-using encoded dataset files in %s: %s and %s.',
dataset_directory, inputs_file_path, targets_file_path)
else: # We are going to have to re-encode the whole corpus.
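    # Treat the EOS marker's index within the tokenizer's reserved tokens as
    # its token id; it is appended to every encoded sequence.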
eos_id = tokenizer.RESERVED_TOKENS.index(tokenizer.EOS)
with open(inputs_file_path, 'w') as inputs_file, \
open(targets_file_path, 'w') as targets_file:
samples_written = 0
for sample in corpus:
inputs = tokenizer.encode(sample['inputs']) + [eos_id]
targets = tokenizer.encode(sample['targets']) + [eos_id]
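        # One line per sample: whitespace-separated token ids, ending in EOS.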
inputs_file.write(' '.join([str(i) for i in inputs]) + '\n')
targets_file.write(' '.join([str(t) for t in targets]) + '\n')
samples_written += 1
if samples_written % 100000 == 0:
_LOGGER.info('Encoded %d samples so far...', samples_written)
_LOGGER.info('Encoded %d %s samples total.', samples_written, prefix)
for inputs, targets in zip(_read_lines(inputs_file_path),
_read_lines(targets_file_path)):
yield {'inputs': inputs, 'targets': targets}


def read_corpus(datasets, directory):
  """Read the datasets.

  Args:
    datasets: Dataset objects that describe what to read.
    directory: The local directory where the datasets are stored.

  Yields:
    A dictionary with 'inputs' and 'targets' text for each sentence pair.
  """
for dataset in datasets:
local_path = pathlib.Path(directory)
language_1_path = local_path / dataset.language_1_filename
language_2_path = local_path / dataset.language_2_filename
for inputs, targets in zip(_read_lines(language_1_path),
_read_lines(language_2_path)):
yield {'inputs': inputs, 'targets': targets}


def _read_lines(filename):
_LOGGER.info('Reading from %s', filename)
with open(filename, 'r') as opened_file:
for line in opened_file:
yield line.strip()
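

# Usage sketch: assuming the `tokenizer` module provides the Builder/Tokenizer
# API used above and the WMT servers are reachable, the module can be driven
# like this (the directory name below is only an example):
if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)
  splits = obtain('wmt_data')
  for split_name, samples in splits.items():
    # Each sample is a dict of whitespace-separated token-id strings.
    first = next(samples)
    _LOGGER.info('First %s sample: inputs=%s targets=%s',
                 split_name, first['inputs'], first['targets'])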