Skip to content

Commit

Permalink
add dygraph mnist CE (#2453)
Browse files Browse the repository at this point in the history
* add ce for dygraph mnist

* add ce for dygraph mnist

* del mnist_dygraph.py

* change mnist_dygraph to train

* fix print style
  • Loading branch information
DDDivano authored Jun 20, 2019
1 parent dbc27b8 commit 83b367d
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 3 deletions.
8 changes: 8 additions & 0 deletions dygraph/mnist/.run_ce.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash

# This file is only used for continuous evaluation.
# dygraph single card
# NOTE(review): presumably makes cuDNN choose deterministic kernels so that
# CE runs are reproducible -- confirm against the Paddle flag documentation.
export FLAGS_cudnn_deterministic=True
# Expose only GPU 0 to the processes started below.
export CUDA_VISIBLE_DEVICES=0
# Train for one epoch in CE mode and pipe the training log into _ce.py, which
# reads it from stdin and records the tab-separated "kpis" lines.
# NOTE(review): without `set -o pipefail` the pipeline's exit status is that
# of _ce.py, so a train.py failure is not reflected -- confirm this is intended.
python train.py --ce --epoch 1 | python _ce.py

4 changes: 2 additions & 2 deletions dygraph/mnist/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
## 训练
教程中使用`paddle.dataset.mnist`数据集作为训练数据,可以通过如下的方式启动训练:
```
env CUDA_VISIBLE_DEVICES=0 python mnist_dygraph.py
env CUDA_VISIBLE_DEVICES=0 python train.py
```
Paddle动态图支持多进程多卡进行模型训练,启动训练的方式:
```
python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog mnist_dygraph.py --use_data_parallel 1
python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog train.py --use_data_parallel 1
```
此时,程序会将每个进程的输出log导入到`./mylog`路径下:
```
Expand Down
65 changes: 65 additions & 0 deletions dygraph/mnist/_ce.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
####this file is only used for continuous evaluation test!
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi

#### NOTE: kpi.py should be shared across models in some way.

# KPIs tracked for this model. Each KPI name must match the second field of
# the "kpis<TAB>name<TAB>value" lines in the piped log, because log_to_ce
# looks the tracker up by that name.
# NOTE(review): the positional arguments (0.001, 0) are defined by kpi.py,
# which is not visible here -- presumably a tolerance threshold and a
# skip count; confirm against kpi.py.
test_acc = AccKpi('test_acc', 0.001, 0, actived=True, desc="test acc")
test_cost = CostKpi('test_cost', 0.001, 0, actived=True, desc='test cost')
# The training-speed KPI below is currently disabled (left commented out).
#train_speed_kpi = DurationKpi(
# 'train_speed',
# 0.05,
# 0,
# actived=True,
# unit_repr='seconds/image',
# desc='train speed in one GPU card')
# Only the KPIs in this list are registered by log_to_ce.
tracking_kpis = [test_acc, test_cost]

def parse_log(log):
    '''
    Yield (kpi_name, kpi_value) pairs found in a training log.

    A line is treated as a KPI record only when, after stripping surrounding
    whitespace, it consists of exactly three tab-separated fields and the
    first field is the literal marker "kpis", for example:
    "
    kpis<TAB>test_acc<TAB>0.97
    kpis<TAB>test_cost<TAB>0.05
    "
    Every other line is ignored (but still echoed, see below).

    Args:
        log: the complete log text as a single string.

    Yields:
        (kpi_name, kpi_value) tuples, where kpi_name is the second field
        (a string) and kpi_value is the third field converted to float.

    Raises:
        ValueError: if the third field of a KPI record is not a valid float.
    '''
    for line in log.split('\n'):
        fields = line.strip().split('\t')
        # Debug echo of every parsed line; kept so the CE console output
        # stays the same as before.
        print(fields)
        if len(fields) == 3 and fields[0] == 'kpis':
            print("-----%s" % fields)
            kpi_name = fields[1]
            try:
                kpi_value = float(fields[2])
            except ValueError:
                # Same exception type as a bare float() failure, but with
                # enough context to locate the offending log line.
                raise ValueError(
                    "invalid value %r for kpi %r in log line %r" %
                    (fields[2], kpi_name, line))
            yield kpi_name, kpi_value


def log_to_ce(log):
    '''
    Feed every KPI record found in `log` into its matching tracker.

    The trackers listed in `tracking_kpis` are indexed by their `name`
    attribute. For each (name, value) pair produced by `parse_log`, the pair
    is printed, the value is added to the tracker of that name with
    `add_record`, and the tracker's `persist` method is called.

    Args:
        log: the complete log text as a single string.

    Raises:
        KeyError: if the log contains a KPI name that is not in
            `tracking_kpis`.
    '''
    trackers_by_name = {kpi.name: kpi for kpi in tracking_kpis}

    for name, value in parse_log(log):
        print(name, value)
        tracker = trackers_by_name[name]
        tracker.add_record(value)
        tracker.persist()


def _main():
    '''Read the training log from stdin, echo it, and record its KPIs.'''
    captured_log = sys.stdin.read()
    # Echo the raw log between two marker lines before parsing it.
    for chunk in ("*****", captured_log, "****"):
        print(chunk)
    log_to_ce(captured_log)


if __name__ == '__main__':
    _main()
14 changes: 13 additions & 1 deletion dygraph/mnist/mnist_dygraph.py → dygraph/mnist/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def parse_args():
type=ast.literal_eval,
default=False,
help="The flag indicating whether to shuffle instances in each pass.")
parser.add_argument("-e", "--epoch", default=5, type=int, help="set epoch")
parser.add_argument("--ce", action="store_true", help="run ce")
args = parser.parse_args()
return args

Expand Down Expand Up @@ -170,13 +172,20 @@ def load_image(file):


def train_mnist(args):
epoch_num = 5
epoch_num = args.epoch
BATCH_SIZE = 64

trainer_count = fluid.dygraph.parallel.Env().nranks
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
if args.use_data_parallel else fluid.CUDAPlace(0)
with fluid.dygraph.guard(place):
if args.ce:
print("ce mode")
seed = 33
np.random.seed(seed)
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed

if args.use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
mnist = MNIST("mnist")
Expand Down Expand Up @@ -226,6 +235,9 @@ def train_mnist(args):
mnist.eval()
test_cost, test_acc = test_mnist(test_reader, mnist, BATCH_SIZE)
mnist.train()
if args.ce:
print("kpis\ttest_acc\t%s" % test_acc)
print("kpis\ttest_cost\t%s" % test_cost)
print("Loss at epoch {} , Test avg_loss is: {}, acc is: {}".format(
epoch, test_cost, test_acc))

Expand Down

0 comments on commit 83b367d

Please sign in to comment.