Fix grad_clip in DARTS, grad_clip has been upgraded in Paddle2.0 (Pad…
baiyfbupt authored Apr 22, 2020
1 parent 388211f commit 823ca6b
Showing 6 changed files with 36 additions and 37 deletions.
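For context on the change this commit makes throughout the demo: in Paddle 1.8+/2.0-style dygraph, gradient clipping moved from `fluid.dygraph_grad_clip.GradClipByGlobalNorm` passed to `optimizer.minimize()`, to `fluid.clip.GradientClipByGlobalNorm` passed to the optimizer constructor via `grad_clip=`. A minimal sketch of the two patterns, with an illustrative stand-in model and hyperparameters (not taken from this repository):

```python
import paddle.fluid as fluid

with fluid.dygraph.guard():
    model = fluid.dygraph.Linear(16, 4)  # stand-in for the DARTS network

    # New pattern (what this commit switches to): build the clip object once
    # and hand it to the optimizer at construction time via grad_clip=.
    clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
    optimizer = fluid.optimizer.MomentumOptimizer(
        learning_rate=0.025,                 # illustrative value
        momentum=0.9,
        regularization=fluid.regularizer.L2Decay(3e-4),
        parameter_list=model.parameters(),
        grad_clip=clip)

    # Old pattern (what this commit removes), kept here only for contrast:
    #   clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0)
    #   optimizer.minimize(loss, grad_clip=clip)
    # After the upgrade, clipping is applied inside optimizer.minimize(loss),
    # so the training loop stays: loss.backward(); optimizer.minimize(loss);
    # model.clear_gradients().
```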
10 changes: 5 additions & 5 deletions demo/darts/README.md
@@ -29,15 +29,15 @@ python search.py --method='PC-DARTS' --batch_size=256 --learning_rate=0.1 --arch
Figure 1: Evolution of the searched architecture on the CIFAR-10 dataset; the upper half is the reduction cell, the lower half is the normal cell
</p>

- The Genotypes found by the three search methods have been added to genotypes.py; `DARTS_V1`, `DARTS_V2` and `PC-DARTS` denote the architectures obtained with the first-order DARTS approximation, the second-order DARTS approximation, and the PC-DARTS search method, respectively.
+ The Genotypes found by the three search methods have been added to genotypes.py; `DARTS_V1`, `DARTS_V2` and `PC_DARTS` denote the architectures obtained with the first-order DARTS approximation, the second-order DARTS approximation, and the PC-DARTS search method, respectively.

## Evaluation Training of the Searched Architecture

After the searched Genotype has been obtained, it can be trained for evaluation to measure its true performance on a specific dataset.

```bash
- python train.py --arch='PC-DARTS' # evaluation training of the searched architecture on CIFAR-10
- python train_imagenet.py --arch='PC-DARTS' # evaluation training of the searched architecture on ImageNet
+ python train.py --arch='PC_DARTS' # evaluation training of the searched architecture on CIFAR-10
+ python train_imagenet.py --arch='PC_DARTS' # evaluation training of the searched architecture on ImageNet
```

The evaluation-training results for the searched `DARTS_V1`, `DARTS_V2` and `PC-DARTS` architectures are as follows:
@@ -83,7 +83,7 @@ def train_search(batch_size, train_portion, is_shuffle, args):
Use the following command to visualize a searched Genotype:

```python
- python visualize.py PC-DARTS
+ python visualize.py PC_DARTS
```

- `PC-DARTS` stands for a specific Genotype, which must be added to genotype.py beforehand
+ `PC_DARTS` stands for a specific Genotype, which must be added to genotype.py beforehand
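For context on the `PC-DARTS` → `PC_DARTS` rename above: the architecture name is looked up as a Python attribute of genotypes.py, so it has to be a valid identifier (hyphens are not allowed). A hedged sketch of what such an entry conventionally looks like in DARTS-style code; the ops and edges below are placeholders, not the architecture actually found by PC-DARTS:

```python
from collections import namedtuple

# Conventional DARTS genotype container (as in the original DARTS codebase).
Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat')

# Placeholder cells: each (op, input_index) pair is illustrative only.
PC_DARTS = Genotype(
    normal=[('sep_conv_3x3', 0), ('skip_connect', 1),
            ('sep_conv_3x3', 0), ('dil_conv_3x3', 1)],
    normal_concat=range(2, 4),
    reduce=[('max_pool_3x3', 0), ('sep_conv_5x5', 1),
            ('skip_connect', 2), ('max_pool_3x3', 0)],
    reduce_concat=range(2, 4),
)

# The demo scripts then resolve --arch with something along the lines of
#   genotype = eval("genotypes.%s" % args.arch)
# which is why a hyphenated name such as PC-DARTS cannot work.
```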
1 change: 1 addition & 0 deletions demo/darts/model.py
@@ -16,6 +16,7 @@
from __future__ import division
from __future__ import print_function

+ import numpy as np
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.initializer import ConstantInitializer, MSRAInitializer
2 changes: 1 addition & 1 deletion demo/darts/search.py
@@ -35,7 +35,7 @@

# yapf: disable
add_arg('log_freq', int, 50, "Log frequency.")
- add_arg('use_multiprocess', bool, True, "Whether use multiprocess reader.")
+ add_arg('use_multiprocess', bool, False, "Whether use multiprocess reader.")
add_arg('num_workers', int, 4, "The multiprocess reader number.")
add_arg('data', str, 'dataset/cifar10',"The dir of dataset.")
add_arg('batch_size', int, 64, "Minibatch size.")
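For context on the `use_multiprocess` default flipped above: in dygraph demos this flag is typically forwarded to the `DataLoader`. The sketch below shows that usual wiring under that assumption; the demo's reader.py may differ:

```python
import paddle.fluid as fluid

def build_loader(sample_list_reader, place, use_multiprocess=False):
    # With use_multiprocess=True the DataLoader prefetches batches in worker
    # processes; False (the new default) keeps reading in the main process.
    loader = fluid.io.DataLoader.from_generator(
        capacity=64,
        use_double_buffer=True,
        iterable=True,
        use_multiprocess=use_multiprocess)
    loader.set_sample_list_generator(sample_list_reader, places=place)
    return loader
```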
28 changes: 13 additions & 15 deletions demo/darts/train.py
@@ -21,26 +21,24 @@
import ast
import argparse
import functools

import logging
- FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
- logging.basicConfig(level=logging.INFO, format=FORMAT)
- logger = logging.getLogger(__name__)

import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
- from model import NetworkCIFAR as Network
- from paddleslim.common import AvgrageMeter
+ from paddleslim.common import AvgrageMeter, get_logger

import genotypes
import reader
+ from model import NetworkCIFAR as Network
sys.path[0] = os.path.join(os.path.dirname("__file__"), os.path.pardir)
from utility import add_arguments, print_arguments
+ logger = get_logger(__name__, level=logging.INFO)

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)

# yapf: disable
- add_arg('use_multiprocess', bool, True, "Whether use multiprocess reader.")
+ add_arg('use_multiprocess', bool, False, "Whether use multiprocess reader.")
add_arg('num_workers', int, 4, "The multiprocess reader number.")
add_arg('data', str, 'dataset/cifar10',"The dir of dataset.")
add_arg('batch_size', int, 96, "Minibatch size.")
@@ -60,8 +58,8 @@
add_arg('auxiliary_weight', float, 0.4, "Weight for auxiliary loss.")
add_arg('drop_path_prob', float, 0.2, "Drop path probability.")
add_arg('grad_clip', float, 5, "Gradient clipping.")
- add_arg('arch', str, 'DARTS_V2', "Which architecture to use")
- add_arg('report_freq', int, 50, 'Report frequency')
+ add_arg('arch', str, 'DARTS_V2', "Which architecture to use")
+ add_arg('log_freq', int, 50, 'Report frequency')
add_arg('use_data_parallel', ast.literal_eval, False, "The flag indicating whether to use data parallel mode to train the model.")
# yapf: enable

@@ -95,17 +93,15 @@ def train(model, train_reader, optimizer, epoch, drop_path_prob, args):
else:
loss.backward()

- grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
-     args.grad_clip)
- optimizer.minimize(loss, grad_clip=grad_clip)
+ optimizer.minimize(loss)
model.clear_gradients()

n = image.shape[0]
objs.update(loss.numpy(), n)
top1.update(prec1.numpy(), n)
top5.update(prec5.numpy(), n)

- if step_id % args.report_freq == 0:
+ if step_id % args.log_freq == 0:
logger.info(
"Train Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
format(epoch, step_id, objs.avg[0], top1.avg[0], top5.avg[0]))
@@ -132,7 +128,7 @@ def valid(model, valid_reader, epoch, args):
objs.update(loss.numpy(), n)
top1.update(prec1.numpy(), n)
top5.update(prec5.numpy(), n)
- if step_id % args.report_freq == 0:
+ if step_id % args.log_freq == 0:
logger.info(
"Valid Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
format(epoch, step_id, objs.avg[0], top1.avg[0], top5.avg[0]))
@@ -158,11 +154,13 @@ def main(args):
step_per_epoch = int(args.trainset_num / args.batch_size)
learning_rate = fluid.dygraph.CosineDecay(args.learning_rate,
step_per_epoch, args.epochs)
+ clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=args.grad_clip)
optimizer = fluid.optimizer.MomentumOptimizer(
learning_rate,
momentum=args.momentum,
regularization=fluid.regularizer.L2Decay(args.weight_decay),
- parameter_list=model.parameters())
+ parameter_list=model.parameters(),
+ grad_clip=clip)

if args.use_data_parallel:
model = fluid.dygraph.parallel.DataParallel(model, strategy)
24 changes: 11 additions & 13 deletions demo/darts/train_imagenet.py
@@ -21,20 +21,17 @@
import ast
import argparse
import functools

import logging
- FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
- logging.basicConfig(level=logging.INFO, format=FORMAT)
- logger = logging.getLogger(__name__)

import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
- from model import NetworkImageNet as Network
- from paddleslim.common import AvgrageMeter
+ from paddleslim.common import AvgrageMeter, get_logger
import genotypes
import reader
+ from model import NetworkImageNet as Network
sys.path[0] = os.path.join(os.path.dirname("__file__"), os.path.pardir)
from utility import add_arguments, print_arguments
+ logger = get_logger(__name__, level=logging.INFO)

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
@@ -62,7 +59,7 @@
add_arg('grad_clip', float, 5, "Gradient clipping.")
add_arg('label_smooth', float, 0.1, "Label smoothing.")
add_arg('arch', str, 'DARTS_V2', "Which architecture to use")
- add_arg('report_freq', int, 100, 'Report frequency')
+ add_arg('log_freq', int, 100, 'Report frequency')
add_arg('use_data_parallel', ast.literal_eval, False, "The flag indicating whether to use data parallel mode to train the model.")
# yapf: enable

@@ -108,17 +105,15 @@ def train(model, train_reader, optimizer, epoch, args):
else:
loss.backward()

- grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
-     args.grad_clip)
- optimizer.minimize(loss, grad_clip=grad_clip)
+ optimizer.minimize(loss)
model.clear_gradients()

n = image.shape[0]
objs.update(loss.numpy(), n)
top1.update(prec1.numpy(), n)
top5.update(prec5.numpy(), n)

- if step_id % args.report_freq == 0:
+ if step_id % args.log_freq == 0:
logger.info(
"Train Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
format(epoch, step_id, objs.avg[0], top1.avg[0], top5.avg[0]))
@@ -145,7 +140,7 @@ def valid(model, valid_reader, epoch, args):
objs.update(loss.numpy(), n)
top1.update(prec1.numpy(), n)
top5.update(prec5.numpy(), n)
- if step_id % args.report_freq == 0:
+ if step_id % args.log_freq == 0:
logger.info(
"Valid Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
format(epoch, step_id, objs.avg[0], top1.avg[0], top5.avg[0]))
@@ -174,11 +169,14 @@ def main(args):
step_per_epoch,
args.decay_rate,
staircase=True)

+ clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=args.grad_clip)
optimizer = fluid.optimizer.MomentumOptimizer(
learning_rate,
momentum=args.momentum,
regularization=fluid.regularizer.L2Decay(args.weight_decay),
- parameter_list=model.parameters())
+ parameter_list=model.parameters(),
+ grad_clip=clip)

if args.use_data_parallel:
model = fluid.dygraph.parallel.DataParallel(model, strategy)
8 changes: 5 additions & 3 deletions paddleslim/nas/darts/train_search.py
@@ -108,8 +108,7 @@ def train_one_epoch(self, train_loader, valid_loader, architect, optimizer,
else:
loss.backward()

- grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5)
- optimizer.minimize(loss, grad_clip)
+ optimizer.minimize(loss)
self.model.clear_gradients()

objs.update(loss.numpy(), n)
@@ -163,11 +162,14 @@ def train(self):
step_per_epoch *= 2
learning_rate = fluid.dygraph.CosineDecay(
self.learning_rate, step_per_epoch, self.num_epochs)

+ clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
optimizer = fluid.optimizer.MomentumOptimizer(
learning_rate,
0.9,
regularization=fluid.regularizer.L2DecayRegularizer(3e-4),
- parameter_list=model_parameters)
+ parameter_list=model_parameters,
+ grad_clip=clip)

if self.use_data_parallel:
self.model = fluid.dygraph.parallel.DataParallel(self.model,
