Skip to content

Commit

Permalink
Browse files (browse the repository at this point in the history)
  • Loading branch information
StephAO committed Mar 4, 2020
2 parents 989b546 + b38f5e9 commit 0e81a05
Show file tree
Hide file tree
Showing 7 changed files with 205 additions and 122 deletions.
34 changes: 22 additions & 12 deletions arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,13 @@ def add_training_args(parser):
'with larger models and sequences')
group.add_argument('--clip-grad', type=float, default=1.0,
help='gradient clipping')
group.add_argument('--epochs', type=int, default=8,
group.add_argument('--epochs', type=int, default=10,
help='upper epoch limit')
group.add_argument('--log-interval', type=int, default=100000,
group.add_argument('--log-interval', type=int, default=1000000,
help='report interval')
group.add_argument('--train-iters', type=int, default=1000000,
help='number of iterations per epoch')
group.add_argument('--train-tokens', type=int, default=500000000,
group.add_argument('--train-tokens', type=int, default=1000000000,
help='number of tokens per epoch')
group.add_argument('--seed', type=int, default=1234,
help='random seed')
Expand All @@ -112,23 +112,23 @@ def add_training_args(parser):
help='Output directory to save checkpoints to.')
group.add_argument('--save-iters', type=int, default=None,
help='Save every so often iterations.')
group.add_argument('--save-optim', action='store_true',
group.add_argument('--save-optim', default=True,
help='Save current optimizer.')
group.add_argument('--save-rng', action='store_true',
group.add_argument('--save-rng', default=True,
help='Save current rng state.')
group.add_argument('--save-all-rng', action='store_true',
group.add_argument('--save-all-rng', default=True,
help='Save current rng state of each rank in '
'distributed training.')
group.add_argument('--load', type=str, default=None,
help='Path to a particular model checkpoint. \
(ex. `savedir/model.1000.pt`)')
group.add_argument('--load-optim', action='store_true',
group.add_argument('--load-optim', default=True,
help='Load most recent optimizer corresponding '
'to `--load`.')
group.add_argument('--load-rng', action='store_true',
group.add_argument('--load-rng', default=True,
help='Load most recent rng state corresponding '
'to `--load`.')
group.add_argument('--load-all-rng', action='store_true',
group.add_argument('--load-all-rng', default=True,
help='Load most recent rng state of each rank in '
'distributed training corresponding to `--load`('
'complementary to `--save-all-rng`).')
Expand All @@ -151,7 +151,10 @@ def add_training_args(parser):
group.add_argument('--incremental', type=str2bool, nargs='?',
const=True, default=False,
help='If true, each epoch add a new loss. If false, all losses are enabled from the start.')
group.add_argument('--new-old', type=str2bool, nargs='?',
group.add_argument('--continual-learning', type=str2bool, nargs='?',
const=True, default=False,
help='If true, train new and old losses separately.')
group.add_argument('--always-mlm', type=str2bool, nargs='?',
const=True, default=False,
help='If true, train new and old losses separately.')
group.add_argument('--no-aux', action='store_true',
Expand All @@ -170,7 +173,7 @@ def add_evaluation_args(parser):
group.add_argument('--eval-iters', type=int, default=2000,
help='number of iterations per epoch to run '
'validation/test for')
group.add_argument('--eval-tokens', type=int, default=5000000, #00,
group.add_argument('--eval-tokens', type=int, default=1000000, #00,
help='number of tokens per epoch to run '
'validation/test for')
group.add_argument('--eval-seq-length', type=int, default=None,
Expand Down Expand Up @@ -286,17 +289,24 @@ def get_args():
m = re.search(r'(?m)^Cpus_allowed:\s*(.*)$',
open('/proc/self/status').read())
nw = bin(int(m.group(1).replace(',', ''), 16)).count('1')
args.num_workers = int(0.85 * nw) # leave 1 cpu for main process
args.num_workers = int(0.80 * nw) # leave cpu for main process

assert not ((args.continual_learning and args.alternating) or (args.continual_learning and args.incremental))

args.model_type += '_inc' if args.incremental else ''
args.model_type += '_alt' if args.alternating else ''
args.model_type += '_cmt' if args.continual_learning else ''
args.model_type += '+mlm' if args.always_mlm else ''
if args.save is None:
args.save = os.path.join(pretrained_path, args.model_type)

args.cuda = torch.cuda.is_available()
args.rank = int(os.getenv('RANK', '0'))
args.world_size = int(os.getenv("WORLD_SIZE", '1'))

args.save_all_rng = args.save_all_rng and args.world_size > 1
args.load_all_rng = args.load_all_rng and args.world_size > 1

args.dynamic_loss_scale = True

args.fp32_embedding = False
Expand Down
20 changes: 13 additions & 7 deletions data_utils/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,19 +500,18 @@ def __init__(self, ds, max_seq_len=512, mask_lm_prob=.15, max_preds_per_seq=None
def __len__(self):
return self.ds_len

def set_args(self, modes, past_iters):
def set_args(self, modes):
# TODO: full training defined by number of tokens seen - not by number of iterations
print("setting up args, modes:", modes)
self.modes = modes
self.past_iters = past_iters
self.split_percent = 1.0
self.corruption_rate = 0.
self.num_sent_per_seq = 1
self.concat = False
self.shuffle = False
# Assert that at most 1 sentence distance loss exists
self.sentence_distances = ["nsp", "rg", "sd", "so"]
assert [x in self.sentence_distances for x in self.modes].count(True) <= 1
self.sentence_tasks = ["nsp", "psp", "sc", "rg", "sd", "so"]
assert [x in self.sentence_tasks for x in self.modes].count(True) <= 1
# Masked Language Data (Default)
self.mask_lm_prob = 0.15
self.task_id = self.task_dict[self.modes[-1]]
Expand Down Expand Up @@ -545,7 +544,7 @@ def __getitem__(self, idx):
# get rng state corresponding to index (allows deterministic random pair)
if idx >= self.ds_len:
raise StopIteration
rng = random.Random(idx) #idx + self.past_iters)
rng = random.Random(idx)
self.idx = idx
# get sentence pair and label
sentence_labels = None
Expand Down Expand Up @@ -830,7 +829,14 @@ def get_sentence(self, target_seq_length, num_sents, rng, non_contiguous=False,
idx = rng.randint(0, self.ds_len - 1)
doc = self.sentence_split(self.get_doc(idx))

print(doc)
# Get enough sentences for target length
if len(doc) < 2:
print(idx, doc, "YIKES")
print(self.ds.split_inds[idx])
self.ds.wrapped_data.set_flag()
print(self.ds.wrapped_data[self.ds.split_inds[idx]])
doc = self.sentence_split(self.get_doc(rng.randint(0, self.ds_len - 1)))
end_idx = rng.randint(0, len(doc) - 1)
start_idx = end_idx - 1
total_length = 0
Expand Down Expand Up @@ -859,8 +865,8 @@ def get_sentence(self, target_seq_length, num_sents, rng, non_contiguous=False,


if len(sentences) < num_sent_required:
print(doc)
print(len(sentences), num_sent_required)
print(idx, doc)
#print(len(sentences), num_sent_required)
# TODO get rid of this
#print(doc)
sentences = [self.sentence_tokenize("Data processing is hard."), self.sentence_tokenize("Sorry about the mistakes.")]
Expand Down
51 changes: 27 additions & 24 deletions data_utils/lazy_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import pickle as pkl
import time
from itertools import accumulate
from threading import Lock
from filelock import FileLock

import torch

Expand Down Expand Up @@ -109,22 +109,19 @@ class lazy_array_loader(object):
"""
def __init__(self, path, data_type='data', mem_map=False, map_fn=None):
lazypath = get_lazy_path(path)
datapath = os.path.join(lazypath, data_type)
#get file where array entries are concatenated into one big string
self._file = open(datapath, 'rb')
self.file = self._file
#memory map file if necessary
self.mem_map = mem_map
if self.mem_map:
self.file = mmap.mmap(self.file.fileno(), 0, prot=mmap.PROT_READ)
self.datapath = os.path.join(lazypath, data_type)
lenpath = os.path.join(lazypath, data_type+'.len.pkl')
self.lens = pkl.load(open(lenpath, 'rb'))
self.ends = list(accumulate(self.lens))
self.dumb_ends = list(self.ends)
self.read_lock = Lock()
self.read_lock = FileLock("bert_data_filelock.txt")
self.process_fn = map_fn
self.map_fn = map_fn
self._tokenizer = None
self.flag = False

def set_flag(self):
self.flag = True

def SetTokenizer(self, tokenizer):
"""
Expand Down Expand Up @@ -156,7 +153,10 @@ def __getitem__(self, index):
except OSError as e:
print(e)
print(index, start, end)
return None
return None
if self.flag:
print("------>", index, start, end, rtn)
self.flag = False
if self.map_fn is not None:
return self.map_fn(rtn)
else:
Expand All @@ -176,25 +176,28 @@ def __getitem__(self, index):
def __len__(self):
return len(self.ends)

def file_read(self, start=0, end=None):
def file_read(self, start=0, end=None, flag=False):
"""read specified portion of file"""

# atomic reads to avoid race conditions with multiprocess dataloader
self.read_lock.acquire()
# seek to start of file read
self.file.seek(start)
# read to end of file if no end point provided
if end is None:
rtn = self.file.read()
#else read amount needed to reach end point
else:
rtn = self.file.read(end-start)
self.read_lock.release()
with self.read_lock:
with open(self.datapath, 'rb') as f:
offset = 0
# seek to start of file read
while f.tell() != end or offset != start:
f.seek(start)
offset = f.tell()
rtn = f.read(end-start)
if f.tell() != end or offset != start:
print("Locking isn't working. Expected ({}, {}), Received: ({}, {}), Off by ({}, {})".format(start, end, offset, f.tell(), offset - start, f.tell() - end), flush=True)
if self.flag:
print("???", rtn)
print(f.tell(), f.read(25))
#TODO: @raulp figure out mem map byte string bug
#if mem map'd need to decode byte string to string
rtn = rtn.decode('utf-8')
# rtn = str(rtn)
if self.mem_map:
rtn = rtn.decode('unicode_escape')
#if self.mem_map:
# rtn = rtn.decode('unicode_escape')
return rtn

6 changes: 3 additions & 3 deletions evaluate/config/test_bert.conf
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,15 @@ dropout = 0.1 // following BERT paper
optimizer = bert_adam
batch_size = 32
max_epochs = 3
lr = .00001
lr = .00002
min_lr = .0000001
lr_patience = 4
patience = 20
max_vals = 10000

// Tasks
pretrain_tasks = "qqp,rte,sst,sts-b" // glue
target_tasks = "qqp,rte,sst,sts-b" // glue
pretrain_tasks = glue
target_tasks = glue

// Control-flow stuff
do_pretrain = 0
Expand Down
Binary file modified idf.p
Binary file not shown.
Loading

0 comments on commit 0e81a05

Please sign in to comment.