diff --git a/main_lincls.py b/main_lincls.py
index 2fcfb4b..333bd4a 100755
--- a/main_lincls.py
+++ b/main_lincls.py
@@ -53,7 +53,7 @@
 parser.add_argument('-b', '--batch-size', default=1024, type=int,
                     metavar='N',
                     help='mini-batch size (default: 1024), this is the total '
-                         'batch size of all GPUs on the current node when '
+                         'batch size of all GPUs on all nodes when '
                          'using Data Parallel or Distributed Data Parallel')
 parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                     metavar='LR', help='initial (base) learning rate', dest='lr')
@@ -207,7 +207,7 @@ def print_pass(*args):
             # When using a single GPU per process and per
             # DistributedDataParallel, we need to divide the batch size
             # ourselves based on the total number of GPUs we have
-            args.batch_size = int(args.batch_size / ngpus_per_node)
+            args.batch_size = int(args.batch_size / args.world_size)
             args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
             model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
         else:
diff --git a/main_moco.py b/main_moco.py
index f510ff4..97707c1 100755
--- a/main_moco.py
+++ b/main_moco.py
@@ -60,7 +60,7 @@
 parser.add_argument('-b', '--batch-size', default=4096, type=int,
                     metavar='N',
                     help='mini-batch size (default: 4096), this is the total '
-                         'batch size of all GPUs on the current node when '
+                         'batch size of all GPUs on all nodes when '
                          'using Data Parallel or Distributed Data Parallel')
 parser.add_argument('--lr', '--learning-rate', default=0.6, type=float,
                     metavar='LR', help='initial (base) learning rate', dest='lr')