-
Notifications
You must be signed in to change notification settings - Fork 57
/
train.py
152 lines (124 loc) · 5.82 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""Train Real NVP on CIFAR-10.
Train script adapted from: https://github.com/kuangliu/pytorch-cifar/
"""
import argparse
import os
import torch
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torch.utils.data as data
import torchvision
import torchvision.transforms as transforms
import util
from models import RealNVP, RealNVPLoss
from tqdm import tqdm
def main(args):
    """Train and evaluate RealNVP on CIFAR-10.

    Builds the CIFAR-10 train/test loaders (downloading into ``data/`` when
    needed), constructs the model, optionally restores it from
    ``ckpts/best.pth.tar``, and then runs one ``train`` pass followed by one
    ``test`` pass per epoch.

    Args:
        args (argparse.Namespace): Parsed command-line options. This function
            reads ``batch_size``, ``benchmark``, ``gpu_ids``, ``lr``,
            ``max_grad_norm``, ``num_epochs``, ``num_samples``,
            ``num_workers``, ``resume`` and ``weight_decay``.

    Raises:
        FileNotFoundError: If ``args.resume`` is set but there is no
            ``ckpts`` directory.
    """
    # `best_loss` is module-level state shared with `test`; it is only
    # rebound here when resuming from a checkpoint.
    global best_loss

    device = 'cuda' if torch.cuda.is_available() and len(args.gpu_ids) > 0 else 'cpu'
    start_epoch = 0

    # Note: No normalization applied, since RealNVP expects inputs in (0, 1).
    transform_train = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor()
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor()
    ])

    trainset = torchvision.datasets.CIFAR10(root='data', train=True, download=True, transform=transform_train)
    trainloader = data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)

    testset = torchvision.datasets.CIFAR10(root='data', train=False, download=True, transform=transform_test)
    testloader = data.DataLoader(testset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    # Model
    print('Building model..')
    net = RealNVP(num_scales=2, in_channels=3, mid_channels=64, num_blocks=8)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net, args.gpu_ids)
        cudnn.benchmark = args.benchmark

    if args.resume:
        # Load checkpoint.
        print('Resuming from checkpoint at ckpts/best.pth.tar...')
        # Explicit raise instead of `assert`, which is stripped under `-O`.
        if not os.path.isdir('ckpts'):
            raise FileNotFoundError('Error: no checkpoint directory found!')
        # map_location lets a checkpoint written on one device be restored
        # on whichever device this run selected.
        # NOTE(review): `test` saves `net.state_dict()` as-is, so a checkpoint
        # written under DataParallel presumably carries `module.`-prefixed
        # keys; loading it into an unwrapped (CPU) model may still need key
        # remapping -- confirm.
        checkpoint = torch.load('ckpts/best.pth.tar', map_location=device)
        net.load_state_dict(checkpoint['net'])
        best_loss = checkpoint['test_loss']
        # The checkpoint records the epoch that had already finished when it
        # was written, so continue with the next one rather than repeating it
        # (and overwriting its sample image).
        start_epoch = checkpoint['epoch'] + 1

    loss_fn = RealNVPLoss()
    param_groups = util.get_param_groups(net, args.weight_decay, norm_suffix='weight_g')
    optimizer = optim.Adam(param_groups, lr=args.lr)

    for epoch in range(start_epoch, start_epoch + args.num_epochs):
        train(epoch, net, trainloader, device, optimizer, loss_fn, args.max_grad_norm)
        test(epoch, net, testloader, device, loss_fn, args.num_samples)
def train(epoch, net, trainloader, device, optimizer, loss_fn, max_grad_norm):
    """Run one optimization pass over ``trainloader``.

    Args:
        epoch (int): Epoch index; only used for the progress printout.
        net: Model, called as ``net(x, reverse=False)`` and expected to
            return a ``(z, sldj)`` pair.
        trainloader: Loader yielding ``(inputs, labels)`` batches; the labels
            are ignored. Its ``dataset`` length sizes the progress bar.
        device: Device each input batch is moved to.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        loss_fn: Callable mapping ``(z, sldj)`` to a scalar loss tensor.
        max_grad_norm (float): Limit forwarded to ``util.clip_grad_norm``.
    """
    print('\nEpoch: %d' % epoch)
    net.train()

    meter = util.AverageMeter()
    total_examples = len(trainloader.dataset)

    with tqdm(total=total_examples) as pbar:
        for batch, _labels in trainloader:
            batch = batch.to(device)
            n_in_batch = batch.size(0)

            # Forward pass and running-average bookkeeping.
            optimizer.zero_grad()
            z, sldj = net(batch, reverse=False)
            batch_loss = loss_fn(z, sldj)
            meter.update(batch_loss.item(), n_in_batch)

            # Backward pass with gradient clipping, then the parameter update.
            batch_loss.backward()
            util.clip_grad_norm(optimizer, max_grad_norm)
            optimizer.step()

            pbar.set_postfix(
                loss=meter.avg,
                bpd=util.bits_per_dim(batch, meter.avg),
            )
            pbar.update(n_in_batch)
def sample(net, batch_size, device, image_shape=(3, 32, 32)):
    """Sample from RealNVP model.

    Draws standard-normal latents, runs the model in reverse on them and
    applies a sigmoid to the result.

    Args:
        net: The RealNVP model (possibly wrapped in ``DataParallel``). It is
            called as ``net(z, reverse=True)`` and must return a pair whose
            first element is the generated tensor.
        batch_size (int): Number of samples to generate.
        device (str or torch.device): Device on which the latents are created.
        image_shape (tuple of int): Per-sample shape of the latent tensor.
            Defaults to ``(3, 32, 32)``, the value that used to be
            hard-coded, so existing callers are unaffected.

    Returns:
        torch.Tensor: Sigmoid of the model's reverse-pass output.
    """
    z = torch.randn((batch_size, *image_shape), dtype=torch.float32, device=device)
    x, _ = net(z, reverse=True)
    return torch.sigmoid(x)
def test(epoch, net, testloader, device, loss_fn, num_samples):
    """Evaluate on the test set, checkpoint on improvement, and save samples.

    Computes the average loss over ``testloader``. When it is lower than the
    module-level ``best_loss``, the model state, loss and epoch are written
    to ``ckpts/best.pth.tar`` and ``best_loss`` is updated. A grid of
    generated samples is then written to ``samples/epoch_<epoch>.png``.

    Args:
        epoch (int): Epoch index; stored in the checkpoint and used in the
            sample image file name.
        net: Model, called as ``net(x, reverse=False)`` and expected to
            return a ``(z, sldj)`` pair.
        testloader: Loader yielding ``(inputs, labels)`` batches; the labels
            are ignored. Its ``dataset`` length sizes the progress bar.
        device: Device inputs are moved to and samples are generated on.
        loss_fn: Callable mapping ``(z, sldj)`` to a scalar loss tensor.
        num_samples (int): Number of images to generate; sampling is skipped
            when this is not positive.
    """
    global best_loss
    net.eval()
    loss_meter = util.AverageMeter()
    with torch.no_grad():
        with tqdm(total=len(testloader.dataset)) as progress_bar:
            for x, _ in testloader:
                x = x.to(device)
                z, sldj = net(x, reverse=False)
                loss = loss_fn(z, sldj)
                loss_meter.update(loss.item(), x.size(0))
                progress_bar.set_postfix(loss=loss_meter.avg,
                                         bpd=util.bits_per_dim(x, loss_meter.avg))
                progress_bar.update(x.size(0))

    # Save checkpoint
    if loss_meter.avg < best_loss:
        print('Saving...')
        state = {
            'net': net.state_dict(),
            'test_loss': loss_meter.avg,
            'epoch': epoch,
        }
        os.makedirs('ckpts', exist_ok=True)
        torch.save(state, 'ckpts/best.pth.tar')
        best_loss = loss_meter.avg

    # Save samples and data
    if num_samples > 0:
        # Sampling is inference only: keep autograd disabled so the reverse
        # pass does not build a graph that is never used.
        with torch.no_grad():
            images = sample(net, num_samples, device)
        os.makedirs('samples', exist_ok=True)
        images_concat = torchvision.utils.make_grid(images, nrow=int(num_samples ** 0.5), padding=2, pad_value=255)
        torchvision.utils.save_image(images_concat, 'samples/epoch_{}.png'.format(epoch))
if __name__ == '__main__':
    # Command-line entry point: parse the options, initialise the shared
    # best-loss tracker, and start training.
    parser = argparse.ArgumentParser(description='RealNVP on CIFAR-10')

    parser.add_argument('--batch_size', default=64, type=int, help='Batch size')
    parser.add_argument('--benchmark', action='store_true', help='Turn on CUDNN benchmarking')
    # NOTE(review): `type=eval` executes whatever Python expression is passed
    # on the command line (e.g. "[0, 1]"). Acceptable for a locally run
    # script; switch to `ast.literal_eval` if this is ever fed untrusted input.
    parser.add_argument('--gpu_ids', default='[0]', type=eval, help='IDs of GPUs to use')
    parser.add_argument('--lr', default=1e-3, type=float, help='Learning rate')
    parser.add_argument('--max_grad_norm', type=float, default=100., help='Max gradient norm for clipping')
    parser.add_argument('--num_epochs', default=100, type=int, help='Number of epochs to train')
    parser.add_argument('--num_samples', default=64, type=int, help='Number of samples at test time')
    parser.add_argument('--num_workers', default=8, type=int, help='Number of data loader threads')
    parser.add_argument('--resume', '-r', action='store_true', help='Resume from checkpoint')
    parser.add_argument('--weight_decay', default=5e-5, type=float,
                        help='L2 regularization (only applied to the weight norm scale factors)')

    # `test` saves a checkpoint only when the new average loss is strictly
    # lower than `best_loss`. Starting from +inf guarantees the first
    # evaluation is saved; the previous initial value of 0 could never be
    # beaten by a positive loss, so no checkpoint was ever written.
    best_loss = float('inf')

    main(parser.parse_args())