Merge pull request NVIDIA#676 from NVIDIA/gh/release
[DLRM/PyT] Triton updates
nv-kkudrynski authored Sep 8, 2020
2 parents 8588e98 + 21fcdd6 commit 323005c
Showing 8 changed files with 101 additions and 69 deletions.
27 changes: 17 additions & 10 deletions PyTorch/Recommendation/DLRM/dlrm/data/datasets.py
@@ -144,7 +144,8 @@ def __init__(
             numerical_features: bool = False,
             categorical_features: Optional[Sequence[int]] = None,
             categorical_feature_sizes: Optional[Sequence[int]] = None,
-            prefetch_depth: int = 10
+            prefetch_depth: int = 10,
+            drop_last_batch: bool = False,
     ):
         self._label_bytes_per_batch = np.dtype(np.bool).itemsize * batch_size
         self._numerical_bytes_per_batch = 13 * np.dtype(np.float16).itemsize * batch_size if numerical_features else 0
@@ -156,25 +157,31 @@ def __init__(
         ]
         self._categorical_features = categorical_features
         self._batch_size = batch_size
-        self._label_file = os.open(os.path.join(data_path, F"label.bin"), os.O_RDONLY)
-        self._num_entries = int(math.ceil(os.fstat(self._label_file).st_size / self._label_bytes_per_batch))
+        self._label_file = os.open(os.path.join(data_path, f"label.bin"), os.O_RDONLY)
+        self._num_entries = int(math.ceil(os.fstat(self._label_file).st_size
+                                          / self._label_bytes_per_batch)) if not drop_last_batch \
+            else int(math.floor(os.fstat(self._label_file).st_size / self._label_bytes_per_batch))

         if numerical_features:
             self._numerical_features_file = os.open(os.path.join(data_path, "numerical.bin"), os.O_RDONLY)
-            if math.ceil(os.fstat(self._numerical_features_file).st_size /
-                         self._numerical_bytes_per_batch) != self._num_entries:
-                raise ValueError("Size miss match in data files")
+            number_of_numerical_batches = math.ceil(os.fstat(self._numerical_features_file).st_size
+                                                    / self._numerical_bytes_per_batch) if not drop_last_batch \
+                else math.floor(os.fstat(self._numerical_features_file).st_size
+                                / self._numerical_bytes_per_batch)
+            if number_of_numerical_batches != self._num_entries:
+                raise ValueError("Size mismatch in data files")
         else:
             self._numerical_features_file = None

         if categorical_features:
             self._categorical_features_files = []
             for cat_id in categorical_features:
-                cat_file = os.open(os.path.join(data_path, F"cat_{cat_id}.bin"), os.O_RDONLY)
+                cat_file = os.open(os.path.join(data_path, f"cat_{cat_id}.bin"), os.O_RDONLY)
                 cat_bytes = self._categorical_bytes_per_batch[cat_id]
-                if math.ceil(
-                        os.fstat(cat_file).st_size / cat_bytes) != self._num_entries:
-                    raise ValueError("Size miss match in data files")
+                number_of_categorical_batches = math.ceil(os.fstat(cat_file).st_size / cat_bytes) if not drop_last_batch \
+                    else math.floor(os.fstat(cat_file).st_size / cat_bytes)
+                if number_of_categorical_batches != self._num_entries:
+                    raise ValueError("Size mismatch in data files")
                 self._categorical_features_files.append(cat_file)
         else:
             self._categorical_features_files = None
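Note on the datasets.py change above: the new drop_last_batch flag switches every per-file batch count from rounding up (a trailing partial batch is kept) to rounding down (it is dropped), and the same rounding is applied to the label, numerical, and categorical files so the size check stays consistent. A minimal sketch of the arithmetic (the num_batches helper is illustrative, not part of the commit):

import math

def num_batches(file_size_bytes: int, bytes_per_batch: int, drop_last_batch: bool) -> int:
    # ceil keeps the final partial batch; floor drops it
    ratio = file_size_bytes / bytes_per_batch
    return math.floor(ratio) if drop_last_batch else math.ceil(ratio)

# 10 full batches of 512 bytes plus a 100-byte remainder:
assert num_batches(10 * 512 + 100, 512, drop_last_batch=False) == 11  # partial batch kept
assert num_batches(10 * 512 + 100, 512, drop_last_batch=True) == 10   # partial batch dropped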
8 changes: 4 additions & 4 deletions PyTorch/Recommendation/DLRM/dlrm/nn/embeddings.py
@@ -131,12 +131,12 @@ def forward(self, categorical_inputs) -> List[torch.Tensor]:
         if self.hash_indices:
             for cat, size in enumerate(self._categorical_feature_sizes):
                 categorical_inputs[:, cat] %= size
-                logging.log_first_n(logging.WARNING, F"Hashed indices out of range.", 1)
+                logging.log_first_n(logging.WARNING, f"Hashed indices out of range.", 1)

         return [self.embedding(categorical_inputs + self.offsets[:-1])]

     def extra_repr(self):
-        s = F"offsets={self.offsets.cpu().numpy()}"
+        s = f"offsets={self.offsets.cpu().numpy()}"
         return s
     # pylint:enable=missing-docstring

@@ -189,7 +189,7 @@ def forward(self, categorical_inputs) -> List[torch.Tensor]:
         if self.hash_indices:
             for cat, size in enumerate(self._categorical_feature_sizes):
                 categorical_inputs[:, cat] %= size
-                logging.log_first_n(logging.WARNING, F"Hashed indices out of range.", 1)
+                logging.log_first_n(logging.WARNING, f"Hashed indices out of range.", 1)

         return [BuckleEmbeddingFusedGatherFunction.apply(self.weight, categorical_inputs, self.offsets, self.amp_train)]

@@ -228,7 +228,7 @@ def forward(self, categorical_inputs) -> List[torch.Tensor]:
         if self.hash_indices:
             for cat, size in enumerate(self._categorical_feature_sizes):
                 categorical_inputs[:, cat] %= size
-                logging.log_first_n(logging.WARNING, F"Hashed indices out of range.", 1)
+                logging.log_first_n(logging.WARNING, f"Hashed indices out of range.", 1)

         return [
             self.embedding(categorical_inputs)
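Note on the embeddings.py change above: every edit in this file (and most edits in the files below) only lowercases the string prefix. F"..." and f"..." denote the same f-string literal in Python, since string prefixes are case-insensitive; lowercase is simply the conventional spelling. A one-line illustration:

name = "world"
assert F"hello {name}" == f"hello {name}"  # identical literals; only the prefix spelling differs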
22 changes: 11 additions & 11 deletions PyTorch/Recommendation/DLRM/dlrm/scripts/dist_main.py
@@ -152,7 +152,7 @@ def main(argv):
         dllogger.log(data=results, step=tuple())

         if auc is not None:
-            print(F"Finished testing. Test auc {auc:.4f}")
+            print(f"Finished testing. Test auc {auc:.4f}")
         return

     if FLAGS.save_checkpoint_path and not FLAGS.bottom_features_ordered and is_main_process():
@@ -209,7 +209,7 @@ def main(argv):
             global_step = steps_per_epoch * epoch + step

             if FLAGS.max_steps and global_step > FLAGS.max_steps:
-                print(F"Reached max global steps of {FLAGS.max_steps}. Stopping.")
+                print(f"Reached max global steps of {FLAGS.max_steps}. Stopping.")
                 break

             lr_scheduler.step()
@@ -245,7 +245,7 @@ def main(argv):
                 continue

             if step == 0:
-                print(F"Started epoch {epoch}...")
+                print(f"Started epoch {epoch}...")
             elif step % print_freq == 0:
                 torch.cuda.current_stream().wait_stream(moving_loss_stream)
                 # Averaging cross a print_freq period to reduce the error.
@@ -264,7 +264,7 @@ def main(argv):

                 eta_str = datetime.timedelta(seconds=int(metric_logger.step_time.global_avg * (steps_per_epoch - step)))
                 metric_logger.print(
-                    header=F"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}] eta: {eta_str}")
+                    header=f"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}] eta: {eta_str}")

                 with torch.cuda.stream(moving_loss_stream):
                     moving_loss = 0.
@@ -275,7 +275,7 @@ def main(argv):
                 if auc is None:
                     continue

-                print(F"Epoch {epoch} step {step}. auc {auc:.6f}")
+                print(f"Epoch {epoch} step {step}. auc {auc:.6f}")
                 stop_time = time()

                 if auc > best_auc:
@@ -284,15 +284,15 @@ def main(argv):

                 if FLAGS.auc_threshold and auc >= FLAGS.auc_threshold:
                     run_time_s = int(stop_time - start_time)
-                    print(F"Hit target accuracy AUC {FLAGS.auc_threshold} at epoch "
-                          F"{global_step/steps_per_epoch:.2f} in {run_time_s}s. "
-                          F"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
+                    print(f"Hit target accuracy AUC {FLAGS.auc_threshold} at epoch "
+                          f"{global_step/steps_per_epoch:.2f} in {run_time_s}s. "
+                          f"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
                     sys.exit()

         epoch_stop_time = time()
         epoch_time_s = epoch_stop_time - epoch_start_time
-        print(F"Finished epoch {epoch} in {datetime.timedelta(seconds=int(epoch_time_s))}. "
-              F"Average speed {steps_per_epoch * FLAGS.batch_size / epoch_time_s:.1f} records/s.")
+        print(f"Finished epoch {epoch} in {datetime.timedelta(seconds=int(epoch_time_s))}. "
+              f"Average speed {steps_per_epoch * FLAGS.batch_size / epoch_time_s:.1f} records/s.")

     avg_throughput = FLAGS.batch_size / metric_logger.step_time.avg

@@ -383,7 +383,7 @@ def dist_evaluate(model, data_loader):
             if timer.measured is not None:
                 metric_logger.update(step_time=timer.measured)
                 if step % print_freq == 0 and step > 0:
-                    metric_logger.print(header=F"Test: [{step}/{steps_per_epoch}]")
+                    metric_logger.print(header=f"Test: [{step}/{steps_per_epoch}]")

     if is_main_process():
         auc = utils.roc_auc_score(torch.cat(y_true), torch.sigmoid(torch.cat(y_score).float()))
28 changes: 14 additions & 14 deletions PyTorch/Recommendation/DLRM/dlrm/scripts/main.py
@@ -204,7 +204,7 @@ def main(argv):
                    'average_test_throughput': avg_test_throughput}
         dllogger.log(data=results, step=tuple())

-        print(F"Finished testing. Test Loss {loss:.4f}, auc {auc:.4f}")
+        print(f"Finished testing. Test Loss {loss:.4f}, auc {auc:.4f}")
         return

     if FLAGS.mode == 'inference_benchmark':
@@ -227,12 +227,12 @@ def main(argv):

             mean_latency = np.mean(latencies)
             mean_inference_throughput = batch_size / mean_latency
-            subresult = {F'mean_inference_latency_batch_{batch_size}': mean_latency,
-                         F'mean_inference_throughput_batch_{batch_size}': mean_inference_throughput}
+            subresult = {f'mean_inference_latency_batch_{batch_size}': mean_latency,
+                         f'mean_inference_throughput_batch_{batch_size}': mean_inference_throughput}
             results.update(subresult)
         dllogger.log(data=results, step=tuple())

-        print(F"Finished inference benchmark.")
+        print(f"Finished inference benchmark.")
         return

     if FLAGS.mode == 'train':
@@ -305,7 +305,7 @@ def train(model, loss_fn, optimizer, data_loader_train, data_loader_test, scaled
                                        decay_steps=FLAGS.decay_steps, decay_start_step=FLAGS.decay_start_step)

         if FLAGS.max_steps and global_step > FLAGS.max_steps:
-            print(F"Reached max global steps of {FLAGS.max_steps}. Stopping.")
+            print(f"Reached max global steps of {FLAGS.max_steps}. Stopping.")
             break

         if prefetching_enabled:
@@ -346,17 +346,17 @@ def train(model, loss_fn, optimizer, data_loader_train, data_loader_test, scaled
             )

         if global_step < FLAGS.benchmark_warmup_steps:
-            print(F'Warming up, step [{global_step}/{FLAGS.benchmark_warmup_steps}]')
+            print(f'Warming up, step [{global_step}/{FLAGS.benchmark_warmup_steps}]')
             continue

         eta_str = datetime.timedelta(seconds=int(metric_logger.step_time.global_avg * (steps_per_epoch - step)))
         metric_logger.print(
-            header=F"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}] eta: {eta_str}")
+            header=f"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}] eta: {eta_str}")

         if (global_step % test_freq == 0 and global_step > 0 and
                 global_step / steps_per_epoch >= FLAGS.test_after):
             loss, auc, test_step_time = evaluate(model, loss_fn, data_loader_test)
-            print(F"Epoch {epoch} step {step}. Test loss {loss:.5f}, auc {auc:.6f}")
+            print(f"Epoch {epoch} step {step}. Test loss {loss:.5f}, auc {auc:.6f}")

             if auc > best_auc:
                 best_auc = auc
@@ -366,16 +366,16 @@ def train(model, loss_fn, optimizer, data_loader_train, data_loader_test, scaled
             if FLAGS.auc_threshold and auc >= FLAGS.auc_threshold:
                 stop_time = time()
                 run_time_s = int(stop_time - start_time)
-                print(F"Hit target accuracy AUC {FLAGS.auc_threshold} at epoch "
-                      F"{global_step/steps_per_epoch:.2f} in {run_time_s}s. "
-                      F"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
+                print(f"Hit target accuracy AUC {FLAGS.auc_threshold} at epoch "
+                      f"{global_step/steps_per_epoch:.2f} in {run_time_s}s. "
+                      f"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
                 return

     stop_time = time()
     run_time_s = int(stop_time - start_time)

-    print(F"Finished training in {run_time_s}s. "
-          F"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
+    print(f"Finished training in {run_time_s}s. "
+          f"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")

     avg_throughput = FLAGS.batch_size / metric_logger.step_time.avg

@@ -441,7 +441,7 @@ def evaluate(model, loss_fn, data_loader):
             if timer.measured is not None:
                 metric_logger.update(loss=loss_value, step_time=timer.measured)
                 if step % print_freq == 0 and step > 0:
-                    metric_logger.print(header=F"Test: [{step}/{steps_per_epoch}]")
+                    metric_logger.print(header=f"Test: [{step}/{steps_per_epoch}]")

     y_true = torch.cat(y_true)
     y_score = torch.cat(y_score)
6 changes: 3 additions & 3 deletions PyTorch/Recommendation/DLRM/dlrm/scripts/utils.py
@@ -127,7 +127,7 @@ def print(self, header=None):
             header = ''
         print_str = header
         for name, meter in self.meters.items():
-            print_str += F" {name}: {meter}"
+            print_str += f" {name}: {meter}"
         print(print_str)


@@ -282,13 +282,13 @@ def roc_auc_score(y_true, y_score):
     y_true.squeeze_()
     y_score.squeeze_()
     if y_true.shape != y_score.shape:
-        raise TypeError(F"Shape of y_true and y_score must match. Got {y_true.shape()} and {y_score.shape()}.")
+        raise TypeError(f"Shape of y_true and y_score must match. Got {y_true.shape()} and {y_score.shape()}.")

     desc_score_indices = torch.argsort(y_score, descending=True)
     y_score = y_score[desc_score_indices]
     y_true = y_true[desc_score_indices]

-    distinct_value_indices = torch.nonzero(y_score[1:] - y_score[:-1]).squeeze()
+    distinct_value_indices = torch.nonzero(y_score[1:] - y_score[:-1], as_tuple=False).squeeze()
     threshold_idxs = torch.cat([distinct_value_indices, torch.tensor([y_true.numel() - 1], device=device)])

     tps = torch.cumsum(y_true, dim=0)[threshold_idxs]
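Note on the utils.py change above: besides the prefix cleanup, roc_auc_score now passes an explicit as_tuple=False to torch.nonzero. On PyTorch releases current when this commit was made (around 1.5/1.6), calling torch.nonzero without as_tuple emitted a deprecation warning; as_tuple=False keeps the single index-tensor return and silences it. A small sketch of the call as used above (the example values are illustrative):

import torch

y_score = torch.tensor([0.9, 0.9, 0.7, 0.3])  # already sorted in descending order
diffs = y_score[1:] - y_score[:-1]            # zero wherever consecutive scores tie

# Same result as torch.nonzero(diffs).squeeze(), without the deprecation warning:
distinct_value_indices = torch.nonzero(diffs, as_tuple=False).squeeze()
print(distinct_value_indices)  # tensor([1, 2]) -- positions where the score changes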
2 changes: 1 addition & 1 deletion PyTorch/Recommendation/DLRM/preproc/split_dataset.py
@@ -63,7 +63,7 @@ def split_binary_file(

     categorical_fs = []
     for i in range(len(categorical_feature_sizes)):
-        fs = open(os.path.join(output_dir, F'cat_{i}.bin'), 'wb+')
+        fs = open(os.path.join(output_dir, f'cat_{i}.bin'), 'wb+')
         categorical_fs.append(fs)
         file_streams.append(fs)

(2 more changed files not shown.)