added imagenet code, deleted transformer finetuning, deleted old GAN code
jxbz committed Jun 9, 2020
1 parent 8594416 commit 34166ec
Showing 430 changed files with 484 additions and 87,711 deletions.
@@ -1,3 +1,5 @@
Copyright (c) 2018, Hanxiao Liu.
All rights reserved.

Apache License
Version 2.0, January 2004
@@ -199,4 +201,4 @@
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
28 changes: 27 additions & 1 deletion classify-imagenet/README.md
100755 → 100644
@@ -1 +1,27 @@
Coming soon! 🧀
## Requirements
- [PyTorch](http://pytorch.org)
- [NVIDIA APEX](https://github.com/NVIDIA/apex#quick-start)

## Data Preparation
Download the ImageNet 2012 dataset and structure the dataset under
train and val subfloders. You can follow [this page](https://github.com/pytorch/examples/tree/master/imagenet#requirements)
to structure the dataset. The data directory should be in the form:

```
data/
├── train/
│   ├── n01440764/
│   ├── n01443537/
│   └── ...
└── val/
    ├── n01440764/
    ├── n01443537/
    └── ...
```
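
As a quick sanity check (a minimal sketch, not part of this repo; it assumes the layout above and the script's default data path), `torchvision.datasets.ImageFolder` should index both splits without error:

```
import os
import torchvision.datasets as dset

data_dir = './imagenet_data/'  # adjust to your dataset directory

for split in ('train', 'val'):
    # ImageFolder treats each class subfolder (n01440764, ...) as one label.
    folder = dset.ImageFolder(os.path.join(data_dir, split))
    print(split, len(folder.classes), 'classes,', len(folder), 'images')
```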

## Commands
```
cd classify-imagenet
python -m torch.distributed.launch --nproc_per_node=8 train_imagenet.py --data $DATA_DIR --results_dir $RESULTS_DIR \
--save $EXPR_NAME --optimizer fromage --learning_rate 1e-2 --seed 0
```
Above, `$DATA_DIR` is the dataset directory path, `$RESULTS_DIR` is the results directory, and `$EXPR_NAME` names the experiment.
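
To try the Fromage optimizer in your own training loop, here is a minimal sketch (assuming the `fromage` package is importable, as `train_imagenet.py` arranges via `sys.path`; the model and data are toy placeholders):

```
import torch
from fromage import Fromage

model = torch.nn.Linear(10, 2)                 # toy model
optimizer = Fromage(model.parameters(), 0.01)  # lr passed as in train_imagenet.py

x, y = torch.randn(8, 10), torch.randint(0, 2, (8,))
optimizer.zero_grad()
loss = torch.nn.functional.cross_entropy(model(x), y)
loss.backward()
optimizer.step()
```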
314 changes: 314 additions & 0 deletions classify-imagenet/train_imagenet.py
@@ -0,0 +1,314 @@
"""
Copyright 2020 NVIDIA Corporation.
This file has been modified from a file with the same name in the
DARTS library (licensed under the Apache License, Version 2.0):
https://github.com/quark0/darts
The Apache License for the original version of this file can be
found in this directory. The modifications to this file are subject
to the CC BY-NC-SA 4.0 license located at the root directory.
"""

import argparse
import logging
import os
import sys

from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import to_python_float
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.utils
import torchvision.datasets as dset
import torchvision.transforms as transforms
import utils
sys.path.append("..")  # Make the fromage package (in the repository root) importable.
from fromage import Fromage


parser = argparse.ArgumentParser('imagenet')
# Dataset choices.
parser.add_argument('--data', type=str,
default='./imagenet_data/',
help='location of the data corpus')
# Optimization choices.
parser.add_argument('--batch_size', type=int, default=128,
help='batch size')
parser.add_argument('--optimizer', type=str, default='fromage',
choices=['fromage', 'SGD', 'adam'],
help='optimizer used for training.')
parser.add_argument('--learning_rate', type=float, default=1e-2,
help='init learning rate')
parser.add_argument('--momentum', type=float, default=0.0,
help='momentum')
parser.add_argument('--weight_decay', type=float, default=0.0,
help='weight decay')
parser.add_argument('--epochs', type=int, default=90,
help='num of training epochs')
parser.add_argument('--grad_clip', type=float,
default=5., help='gradient clipping')
parser.add_argument('--label_smooth', type=float,
default=0.1, help='label smoothing')
# Logging choices.
parser.add_argument('--report_freq', type=float, default=100,
help='report frequency')
parser.add_argument('--save', type=str, default='EXP',
help='experiment name')
parser.add_argument('--results_dir', type=str, default='./results/',
help='results directory')
# Misc.
parser.add_argument('--seed', type=int, default=0,
help='random seed')
# DDP.
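# torch.distributed.launch passes --local_rank to each spawned process.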
parser.add_argument('--local_rank', type=int, default=0,
help='rank of process')

args = parser.parse_args()

# Set up DDP.
args.distributed = True
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')
args.world_size = torch.distributed.get_world_size()


# Set up logging.
assert args.results_dir
args.save = args.results_dir + '/{}'.format(args.save)
if args.local_rank == 0:
utils.create_exp_dir(args.save)
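# Note: the next line rebinds the name `logging` (the stdlib module imported
# above) to the project's Logger object.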
logging = utils.Logger(args.local_rank, args.save)
writer = utils.Writer(args.local_rank, args.save)

CLASSES = 1000


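# Label smoothing: each smoothed target puts (1 - epsilon) + epsilon/num_classes
# on the true class and epsilon/num_classes on every other class; the loss is
# the cross entropy against this softened distribution.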
class CrossEntropyLabelSmooth(nn.Module):
def __init__(self, num_classes, epsilon):
super(CrossEntropyLabelSmooth, self).__init__()
self.num_classes = num_classes
self.epsilon = epsilon
self.logsoftmax = nn.LogSoftmax(dim=1)

def forward(self, inputs, targets):
log_probs = self.logsoftmax(inputs)
targets = torch.zeros_like(log_probs).scatter_(
1, targets.unsqueeze(1), 1)
targets = (1 - self.epsilon) * targets + \
self.epsilon / self.num_classes
loss = (-targets * log_probs).mean(0).sum()
return loss


def main():
if not torch.cuda.is_available():
logging.info('no gpu device available')
sys.exit(1)

    # Set random seeds and cuDNN flags.
np.random.seed(args.seed)
cudnn.benchmark = True
torch.manual_seed(args.seed)
cudnn.enabled = True
torch.cuda.manual_seed(args.seed)
logging.info('args = %s', args)

# Get data loaders.
traindir = os.path.join(args.data, 'train')
validdir = os.path.join(args.data, 'val')

    # Data augmentation.
normalize = transforms.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
train_transform = transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ColorJitter(
brightness=0.4,
contrast=0.4,
saturation=0.4,
hue=0.2),
transforms.ToTensor(),
normalize,
])
val_transform = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize,
])

train_data = dset.ImageFolder(traindir, transform=train_transform)
valid_data = dset.ImageFolder(validdir, transform=val_transform)

    # Split the validation data into validation and test sets.
valid_data, test_data = utils.dataset_split(valid_data, len(valid_data))

train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
train_queue = torch.utils.data.DataLoader(
train_data, batch_size=args.batch_size, shuffle=False,
pin_memory=True, num_workers=8, sampler=train_sampler)

valid_queue = torch.utils.data.DataLoader(
valid_data, batch_size=args.batch_size, shuffle=False,
pin_memory=True, num_workers=8)

test_queue = torch.utils.data.DataLoader(
test_data, batch_size=args.batch_size, shuffle=False,
pin_memory=True, num_workers=8)

# Create model and loss.
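    # Give each rank its own torch.hub cache dir so concurrent downloads do not collide.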
torch.hub.set_dir('/tmp/hub_cache_%d' % args.local_rank)
model = torch.hub.load('pytorch/vision:v0.4.2', 'resnet50', pretrained=False)
model = model.cuda()
model = DDP(model, delay_allreduce=True)

criterion = nn.CrossEntropyLoss()
criterion = criterion.cuda()
criterion_smooth = CrossEntropyLabelSmooth(CLASSES, args.label_smooth)
criterion_smooth = criterion_smooth.cuda()

# Set up network weights optimizer.
if args.optimizer == 'SGD':
optimizer = torch.optim.SGD(
model.parameters(), args.learning_rate, momentum=args.momentum,
weight_decay=args.weight_decay)
elif args.optimizer == 'fromage':
optimizer = Fromage(
model.parameters(), args.learning_rate)
elif args.optimizer == 'adam':
optimizer = torch.optim.Adam(
model.parameters(), args.learning_rate, weight_decay=args.weight_decay)
else:
raise NotImplementedError

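    # Decay the learning rate by 10x every 30 epochs.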
scheduler = torch.optim.lr_scheduler.StepLR(
optimizer, gamma=0.1, step_size=30)

# Train.
global_step = 0
best_acc_top1 = 0
for epoch in range(args.epochs):
# Shuffle the sampler, update lrs.
train_queue.sampler.set_epoch(epoch + args.seed)

# Training.
train_acc_top1, train_acc_top5, train_obj, global_step = train(
train_queue, model, criterion_smooth, optimizer, global_step)
logging.info('epoch %d train_acc %f', epoch, train_acc_top1)
writer.add_scalar('train/loss', train_obj, global_step)
writer.add_scalar('train/acc_top1', train_acc_top1, global_step)
writer.add_scalar('train/acc_top5', train_acc_top5, global_step)
writer.add_scalar('train/lr', optimizer.state_dict()[
'param_groups'][0]['lr'], global_step)

# Validation.
valid_acc_top1, valid_acc_top5, valid_obj = infer(
valid_queue, model, criterion)
logging.info('valid_acc_top1 %f', valid_acc_top1)
logging.info('valid_acc_top5 %f', valid_acc_top5)
writer.add_scalar('val/acc_top1', valid_acc_top1, global_step)
writer.add_scalar('val/acc_top5', valid_acc_top5, global_step)
writer.add_scalar('val/loss', valid_obj, global_step)

# Test
test_acc_top1, test_acc_top5, test_obj = infer(
test_queue, model, criterion)
logging.info('test_acc_top1 %f', test_acc_top1)
logging.info('test_acc_top5 %f', test_acc_top5)
writer.add_scalar('test/acc_top1', test_acc_top1, global_step)
writer.add_scalar('test/acc_top5', test_acc_top5, global_step)
writer.add_scalar('test/loss', test_obj, global_step)

is_best = False
if valid_acc_top1 > best_acc_top1:
best_acc_top1 = valid_acc_top1
is_best = True

if args.local_rank == 0:
utils.save_checkpoint({
'epoch': epoch + 1,
'state_dict': model.state_dict(),
'best_acc_top1': best_acc_top1,
'optimizer': optimizer.state_dict(),
}, is_best, args.save)

# Update LR.
scheduler.step()

writer.flush()


def train(train_queue, model, criterion, optimizer, global_step):
objs = utils.AverageMeter()
top1 = utils.AverageMeter()
top5 = utils.AverageMeter()

model.train()
for step, (data, target) in enumerate(train_queue):
n = data.size(0)
data = data.cuda()
target = target.cuda()

# Forward.
optimizer.zero_grad()
logits = model(data)
loss = criterion(logits, target)

# Backward and step.
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
optimizer.step()

# Calculate the accuracy.
prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
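        # Aggregate the loss and accuracy metrics across all ranks before logging.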
reduced_loss = utils.reduce_tensor(loss.data, args.world_size)
prec1 = utils.reduce_tensor(prec1, args.world_size)
prec5 = utils.reduce_tensor(prec5, args.world_size)

objs.update(to_python_float(reduced_loss), n)
top1.update(to_python_float(prec1), n)
top5.update(to_python_float(prec5), n)

if (step + 1) % args.report_freq == 0:
current_lr = list(optimizer.param_groups)[0]['lr']
logging.info('train %03d %e %f %f lr: %e', step,
objs.avg, top1.avg, top5.avg, current_lr)
global_step += 1

return top1.avg, top5.avg, objs.avg, global_step


def infer(valid_queue, model, criterion):
objs = utils.AverageMeter()
top1 = utils.AverageMeter()
top5 = utils.AverageMeter()

model.eval()
with torch.no_grad():
for step, (data, target) in enumerate(valid_queue):
data = data.cuda()
target = target.cuda()

logits = model(data)
loss = criterion(logits, target)

prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
n = data.size(0)
objs.update(loss.item(), n)
top1.update(prec1.item(), n)
top5.update(prec5.item(), n)

if (step + 1) % args.report_freq == 0:
logging.info('valid %03d %e %f %f', step,
objs.avg, top1.avg, top5.avg)

return top1.avg, top5.avg, objs.avg


if __name__ == '__main__':
main()