diff --git a/dygraph/se_resnext/README.md b/dygraph/se_resnext/README.md
index 78e8870d14..02a8a787e4 100644
--- a/dygraph/se_resnext/README.md
+++ b/dygraph/se_resnext/README.md
@@ -27,6 +27,22 @@ env CUDA_VISIBLE_DEVICES=0 python train.py
 
 Here `CUDA_VISIBLE_DEVICES=0` means training runs on device card 0; adjust this parameter to your own setup.
 
+Multi-card training is also supported:
+```
+python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog train.py --use_data_parallel 1
+```
+Here `--selected_gpus=0,1,2,3` means device cards 0, 1, 2 and 3 are used, i.e. 4 cards in total; adjust this parameter to your own setup.
+In this mode, the program writes each process's output log to the `./mylog` directory:
+```
+.
+├── mylog
+│   ├── workerlog.0
+│   ├── workerlog.1
+│   ├── workerlog.2
+│   └── workerlog.3
+├── README.md
+└── train.py
+```
 
 ## Output
 
diff --git a/dygraph/se_resnext/train.py b/dygraph/se_resnext/train.py
index ab4c4511be..5c9a2e5239 100644
--- a/dygraph/se_resnext/train.py
+++ b/dygraph/se_resnext/train.py
@@ -26,10 +26,16 @@ import sys
 import math
 import argparse
+import ast
 
 parser = argparse.ArgumentParser("Training for Se-ResNeXt.")
 parser.add_argument("-e", "--epoch", default=200, type=int, help="set epoch")
 parser.add_argument("--ce", action="store_true", help="run ce")
+parser.add_argument(
+    "--use_data_parallel",
+    type=ast.literal_eval,
+    default=False,
+    help="The flag indicating whether to use data parallel mode to train the model.")
 args = parser.parse_args()
 batch_size = 64
 train_parameters = {
@@ -361,22 +367,30 @@ def train():
     epoch_num = args.epoch
     batch_size = train_parameters["batch_size"]
 
-    with fluid.dygraph.guard():
+    trainer_count = fluid.dygraph.parallel.Env().nranks
+    place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
+        if args.use_data_parallel else fluid.CUDAPlace(0)
+    with fluid.dygraph.guard(place):
         if args.ce:
             print("ce mode")
             seed = 90
             np.random.seed(seed)
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-
+        if args.use_data_parallel:
+            strategy = fluid.dygraph.parallel.prepare_context()
         se_resnext = SeResNeXt("se_resnext")
         optimizer = optimizer_setting(train_parameters)
+        if args.use_data_parallel:
+            se_resnext = fluid.dygraph.parallel.DataParallel(se_resnext, strategy)
 
         train_reader = paddle.batch(
             paddle.dataset.flowers.train(use_xmap=False),
             batch_size=batch_size,
             drop_last=True
         )
-
+        if args.use_data_parallel:
+            train_reader = fluid.contrib.reader.distributed_batch_reader(
+                train_reader)
         test_reader = paddle.batch(
             paddle.dataset.flowers.test(use_xmap=False), batch_size=32)
 
@@ -407,7 +421,12 @@
             acc_top5 = fluid.layers.accuracy(input=softmax_out, label=label, k=5)
 
             dy_out = avg_loss.numpy()
-            avg_loss.backward()
+            if args.use_data_parallel:
+                avg_loss = se_resnext.scale_loss(avg_loss)
+                avg_loss.backward()
+                se_resnext.apply_collective_grads()
+            else:
+                avg_loss.backward()
 
             optimizer.minimize(avg_loss)
             se_resnext.clear_gradients()
@@ -418,7 +437,6 @@
             total_acc5 += acc_top5.numpy()
             total_sample += 1
             if batch_id % 10 == 0:
-                print(fluid.dygraph.base._print_debug_msg())
                 print(
                     "epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f lr %0.5f" % \
                     ( epoch_id, batch_id, total_loss / total_sample, \
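
A condensed sketch of the data-parallel flow this diff threads through `train()` may help review. It uses only the `fluid.dygraph.parallel` calls visible in the diff, assuming the Paddle 1.x `fluid.dygraph` API; `TinyNet`, `fake_reader`, and the optimizer settings are hypothetical stand-ins, not code from this PR:

```python
# A minimal sketch of the dygraph data-parallel pattern wired in by this PR,
# assuming the Paddle 1.x fluid.dygraph API used in train.py. TinyNet and
# fake_reader are placeholders; only the parallel-specific calls mirror the diff.
import numpy as np
import paddle
import paddle.fluid as fluid


class TinyNet(fluid.dygraph.Layer):
    def __init__(self, name_scope):
        super(TinyNet, self).__init__(name_scope)
        self.fc = fluid.dygraph.FC(self.full_name(), size=10)

    def forward(self, x):
        return self.fc(x)


def fake_reader():
    for _ in range(32):
        yield np.random.random(16).astype("float32"), np.random.randint(10)


# One process per card; each process picks its device from env vars set by
# `python -m paddle.distributed.launch ...`.
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
with fluid.dygraph.guard(place):
    strategy = fluid.dygraph.parallel.prepare_context()  # init the NCCL context
    model = fluid.dygraph.parallel.DataParallel(TinyNet("tiny"), strategy)
    optimizer = fluid.optimizer.SGD(learning_rate=0.01)

    train_reader = paddle.batch(fake_reader, batch_size=4, drop_last=True)
    # Shard batches so each trainer sees a distinct slice of the data.
    train_reader = fluid.contrib.reader.distributed_batch_reader(train_reader)

    for data in train_reader():
        x = fluid.dygraph.to_variable(
            np.array([d[0] for d in data], dtype="float32"))
        label = fluid.dygraph.to_variable(
            np.array([d[1] for d in data], dtype="int64").reshape(-1, 1))
        loss = fluid.layers.mean(
            fluid.layers.softmax_with_cross_entropy(model(x), label))
        # Scale the loss for the trainer count, then all-reduce gradients
        # across trainers before the optimizer step.
        loss = model.scale_loss(loss)
        loss.backward()
        model.apply_collective_grads()
        optimizer.minimize(loss)
        model.clear_gradients()
```

As in the README hunk, this only behaves as multi-card training when started through `paddle.distributed.launch`; run directly, `Env().nranks` stays at 1.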
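
One small note on the new flag itself: `type=ast.literal_eval` makes argparse evaluate the raw string as a Python literal, which is why the README passes `--use_data_parallel 1`. A tiny standalone illustration, not from this PR:

```python
import argparse
import ast

parser = argparse.ArgumentParser()
parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False)

# "1" evaluates to the int 1 and "True" to the bool True (both truthy);
# "0" and "False" evaluate to falsy values.
print(parser.parse_args(["--use_data_parallel", "1"]).use_data_parallel)      # 1
print(parser.parse_args(["--use_data_parallel", "False"]).use_data_parallel)  # False
```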