ImageNet, distributed bug fixes (pytorch#462)
teng-li authored and soumith committed Dec 5, 2018
1 parent 91f230a commit 15e2771
Showing 1 changed file with 17 additions and 9 deletions.
26 changes: 17 additions & 9 deletions imagenet/main.py
@@ -57,9 +57,9 @@
                     help='evaluate model on validation set')
 parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                     help='use pre-trained model')
-parser.add_argument('--world-size', default=1, type=int,
+parser.add_argument('--world-size', default=-1, type=int,
                     help='number of nodes for distributed training')
-parser.add_argument('--rank', default=0, type=int,
+parser.add_argument('--rank', default=-1, type=int,
                     help='node rank for distributed training')
 parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                     help='url used to set up distributed training')
@@ -95,6 +95,9 @@ def main():
         warnings.warn('You have chosen a specific GPU. This will completely '
                       'disable data parallelism.')
 
+    if args.dist_url == "env://" and args.world_size == -1:
+        args.world_size = int(os.environ["WORLD_SIZE"])
+
     args.distributed = args.world_size > 1 or args.multiprocessing_distributed
 
     ngpus_per_node = torch.cuda.device_count()
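
With this change, --world-size keeps its new default of -1 and is filled in from the WORLD_SIZE environment variable whenever the env:// init method is selected. As a rough illustration (not part of this commit; the address, port, and sizes below are placeholder values), torch.distributed resolves an env:// rendezvous from environment variables like this:

# Illustration only: with init_method="env://", torch.distributed reads
# MASTER_ADDR, MASTER_PORT, WORLD_SIZE and RANK from the environment.
import os
import torch.distributed as dist

os.environ["MASTER_ADDR"] = "127.0.0.1"   # placeholder rendezvous address
os.environ["MASTER_PORT"] = "23456"       # placeholder rendezvous port
os.environ["WORLD_SIZE"] = "2"            # total number of processes in the job
os.environ["RANK"] = "0"                  # global rank of this process

dist.init_process_group(backend="gloo", init_method="env://")
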
@@ -118,6 +121,8 @@ def main_worker(gpu, ngpus_per_node, args):
         print("Use GPU: {} for training".format(args.gpu))
 
     if args.distributed:
+        if args.dist_url == "env://" and args.rank == -1:
+            args.rank = int(os.environ["RANK"])
         if args.multiprocessing_distributed:
             # For multiprocessing distributed training, rank needs to be the
             # global rank among all the processes
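
The added lines likewise let --rank default to -1 and come from the RANK environment variable under env://. The remainder of this block is folded in the view above; per the comment, multiprocessing mode still has to turn the per-node rank into a global rank before the process group is initialized. A small sketch of that arithmetic (the helper name is hypothetical and the folded lines are an assumption, not shown in this diff):

# Hypothetical helper mirroring the folded logic: each node launches
# ngpus_per_node worker processes, so the worker driving local GPU `gpu`
# on node `node_rank` gets this global rank.
def to_global_rank(node_rank, ngpus_per_node, gpu):
    return node_rank * ngpus_per_node + gpu

# Example: node 1 of a 2-node, 4-GPU-per-node job owns global ranks 4..7.
assert [to_global_rank(1, 4, g) for g in range(4)] == [4, 5, 6, 7]
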
@@ -235,13 +240,16 @@ def main_worker(gpu, ngpus_per_node, args):
         # remember best acc@1 and save checkpoint
         is_best = acc1 > best_acc1
         best_acc1 = max(acc1, best_acc1)
-        save_checkpoint({
-            'epoch': epoch + 1,
-            'arch': args.arch,
-            'state_dict': model.state_dict(),
-            'best_acc1': best_acc1,
-            'optimizer' : optimizer.state_dict(),
-        }, is_best)
+
+        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
+                and args.rank % ngpus_per_node == 0):
+            save_checkpoint({
+                'epoch': epoch + 1,
+                'arch': args.arch,
+                'state_dict': model.state_dict(),
+                'best_acc1': best_acc1,
+                'optimizer' : optimizer.state_dict(),
+            }, is_best)


def train(train_loader, model, criterion, optimizer, epoch, args):
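
The new condition around save_checkpoint means that, in multiprocessing distributed mode, only the process whose global rank is a multiple of ngpus_per_node (one process per node) writes checkpoint files, so workers no longer race to write the same files on each node. save_checkpoint itself is unchanged and not shown in this diff; a sketch of what such a helper conventionally looks like (assumed; the filenames are the usual defaults, not taken from this commit):

import shutil
import torch

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    # Persist the latest training state, and keep a copy of the best run so far.
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')
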
