Merge pull request NVIDIA#676 from NVIDIA/gh/release
[DLRM/PyT] Triton updates
nv-kkudrynski authored Sep 8, 2020
2 parents 8588e98 + 21fcdd6 commit 323005c
Showing 8 changed files with 101 additions and 69 deletions.
27 changes: 17 additions & 10 deletions PyTorch/Recommendation/DLRM/dlrm/data/datasets.py
@@ -144,7 +144,8 @@ def __init__(
             numerical_features: bool = False,
             categorical_features: Optional[Sequence[int]] = None,
             categorical_feature_sizes: Optional[Sequence[int]] = None,
-            prefetch_depth: int = 10
+            prefetch_depth: int = 10,
+            drop_last_batch: bool = False,
     ):
         self._label_bytes_per_batch = np.dtype(np.bool).itemsize * batch_size
         self._numerical_bytes_per_batch = 13 * np.dtype(np.float16).itemsize * batch_size if numerical_features else 0
@@ -156,25 +157,31 @@ def __init__(
         ]
         self._categorical_features = categorical_features
         self._batch_size = batch_size
-        self._label_file = os.open(os.path.join(data_path, F"label.bin"), os.O_RDONLY)
-        self._num_entries = int(math.ceil(os.fstat(self._label_file).st_size / self._label_bytes_per_batch))
+        self._label_file = os.open(os.path.join(data_path, f"label.bin"), os.O_RDONLY)
+        self._num_entries = int(math.ceil(os.fstat(self._label_file).st_size
+                                          / self._label_bytes_per_batch)) if not drop_last_batch \
+            else int(math.floor(os.fstat(self._label_file).st_size / self._label_bytes_per_batch))

         if numerical_features:
             self._numerical_features_file = os.open(os.path.join(data_path, "numerical.bin"), os.O_RDONLY)
-            if math.ceil(os.fstat(self._numerical_features_file).st_size /
-                         self._numerical_bytes_per_batch) != self._num_entries:
-                raise ValueError("Size miss match in data files")
+            number_of_numerical_batches = math.ceil(os.fstat(self._numerical_features_file).st_size
+                                                    / self._numerical_bytes_per_batch) if not drop_last_batch \
+                else math.floor(os.fstat(self._numerical_features_file).st_size
+                                / self._numerical_bytes_per_batch)
+            if number_of_numerical_batches != self._num_entries:
+                raise ValueError("Size mismatch in data files")
         else:
             self._numerical_features_file = None

         if categorical_features:
             self._categorical_features_files = []
             for cat_id in categorical_features:
-                cat_file = os.open(os.path.join(data_path, F"cat_{cat_id}.bin"), os.O_RDONLY)
+                cat_file = os.open(os.path.join(data_path, f"cat_{cat_id}.bin"), os.O_RDONLY)
                 cat_bytes = self._categorical_bytes_per_batch[cat_id]
-                if math.ceil(
-                        os.fstat(cat_file).st_size / cat_bytes) != self._num_entries:
-                    raise ValueError("Size miss match in data files")
+                number_of_categorical_batches = math.ceil(os.fstat(cat_file).st_size / cat_bytes) if not drop_last_batch \
+                    else math.floor(os.fstat(cat_file).st_size / cat_bytes)
+                if number_of_categorical_batches != self._num_entries:
+                    raise ValueError("Size mismatch in data files")
                 self._categorical_features_files.append(cat_file)
         else:
             self._categorical_features_files = None
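Note on the datasets.py change above: the new drop_last_batch flag switches every per-file batch count from rounding up (a trailing partial batch is kept) to rounding down (it is dropped), and the same rounding is applied to the label, numerical, and categorical files so the size check stays consistent. A minimal sketch of the arithmetic (the num_batches helper is illustrative, not part of the commit):

import math

def num_batches(file_size_bytes: int, bytes_per_batch: int, drop_last_batch: bool) -> int:
    # ceil keeps the final partial batch; floor drops it
    ratio = file_size_bytes / bytes_per_batch
    return math.floor(ratio) if drop_last_batch else math.ceil(ratio)

# 10 full batches of 512 bytes plus a 100-byte remainder:
assert num_batches(10 * 512 + 100, 512, drop_last_batch=False) == 11  # partial batch kept
assert num_batches(10 * 512 + 100, 512, drop_last_batch=True) == 10   # partial batch dropped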
8 changes: 4 additions & 4 deletions PyTorch/Recommendation/DLRM/dlrm/nn/embeddings.py
@@ -131,12 +131,12 @@ def forward(self, categorical_inputs) -> List[torch.Tensor]:
         if self.hash_indices:
             for cat, size in enumerate(self._categorical_feature_sizes):
                 categorical_inputs[:, cat] %= size
-                logging.log_first_n(logging.WARNING, F"Hashed indices out of range.", 1)
+                logging.log_first_n(logging.WARNING, f"Hashed indices out of range.", 1)

         return [self.embedding(categorical_inputs + self.offsets[:-1])]

     def extra_repr(self):
-        s = F"offsets={self.offsets.cpu().numpy()}"
+        s = f"offsets={self.offsets.cpu().numpy()}"
         return s
     # pylint:enable=missing-docstring

@@ -189,7 +189,7 @@ def forward(self, categorical_inputs) -> List[torch.Tensor]:
         if self.hash_indices:
             for cat, size in enumerate(self._categorical_feature_sizes):
                 categorical_inputs[:, cat] %= size
-                logging.log_first_n(logging.WARNING, F"Hashed indices out of range.", 1)
+                logging.log_first_n(logging.WARNING, f"Hashed indices out of range.", 1)

         return [BuckleEmbeddingFusedGatherFunction.apply(self.weight, categorical_inputs, self.offsets, self.amp_train)]

@@ -228,7 +228,7 @@ def forward(self, categorical_inputs) -> List[torch.Tensor]:
         if self.hash_indices:
             for cat, size in enumerate(self._categorical_feature_sizes):
                 categorical_inputs[:, cat] %= size
-                logging.log_first_n(logging.WARNING, F"Hashed indices out of range.", 1)
+                logging.log_first_n(logging.WARNING, f"Hashed indices out of range.", 1)

         return [
             self.embedding(categorical_inputs)
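Note on the embeddings.py change above: every edit in this file (and most edits in the files below) only lowercases the string prefix. F"..." and f"..." denote the same f-string literal in Python, since string prefixes are case-insensitive; lowercase is simply the conventional spelling. A one-line illustration:

name = "world"
assert F"hello {name}" == f"hello {name}"  # identical literals; only the prefix spelling differs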
22 changes: 11 additions & 11 deletions PyTorch/Recommendation/DLRM/dlrm/scripts/dist_main.py
@@ -152,7 +152,7 @@ def main(argv):
         dllogger.log(data=results, step=tuple())

         if auc is not None:
-            print(F"Finished testing. Test auc {auc:.4f}")
+            print(f"Finished testing. Test auc {auc:.4f}")
         return

     if FLAGS.save_checkpoint_path and not FLAGS.bottom_features_ordered and is_main_process():
@@ -209,7 +209,7 @@ def main(argv):
             global_step = steps_per_epoch * epoch + step

             if FLAGS.max_steps and global_step > FLAGS.max_steps:
-                print(F"Reached max global steps of {FLAGS.max_steps}. Stopping.")
+                print(f"Reached max global steps of {FLAGS.max_steps}. Stopping.")
                 break

             lr_scheduler.step()
@@ -245,7 +245,7 @@ def main(argv):
                 continue

             if step == 0:
-                print(F"Started epoch {epoch}...")
+                print(f"Started epoch {epoch}...")
             elif step % print_freq == 0:
                 torch.cuda.current_stream().wait_stream(moving_loss_stream)
                 # Averaging cross a print_freq period to reduce the error.
@@ -264,7 +264,7 @@ def main(argv):

                 eta_str = datetime.timedelta(seconds=int(metric_logger.step_time.global_avg * (steps_per_epoch - step)))
                 metric_logger.print(
-                    header=F"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}] eta: {eta_str}")
+                    header=f"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}] eta: {eta_str}")

                 with torch.cuda.stream(moving_loss_stream):
                     moving_loss = 0.
@@ -275,7 +275,7 @@ def main(argv):
                 if auc is None:
                     continue

-                print(F"Epoch {epoch} step {step}. auc {auc:.6f}")
+                print(f"Epoch {epoch} step {step}. auc {auc:.6f}")
                 stop_time = time()

                 if auc > best_auc:
@@ -284,15 +284,15 @@ def main(argv):

                 if FLAGS.auc_threshold and auc >= FLAGS.auc_threshold:
                     run_time_s = int(stop_time - start_time)
-                    print(F"Hit target accuracy AUC {FLAGS.auc_threshold} at epoch "
-                          F"{global_step/steps_per_epoch:.2f} in {run_time_s}s. "
-                          F"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
+                    print(f"Hit target accuracy AUC {FLAGS.auc_threshold} at epoch "
+                          f"{global_step/steps_per_epoch:.2f} in {run_time_s}s. "
+                          f"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
                     sys.exit()

         epoch_stop_time = time()
         epoch_time_s = epoch_stop_time - epoch_start_time
-        print(F"Finished epoch {epoch} in {datetime.timedelta(seconds=int(epoch_time_s))}. "
-              F"Average speed {steps_per_epoch * FLAGS.batch_size / epoch_time_s:.1f} records/s.")
+        print(f"Finished epoch {epoch} in {datetime.timedelta(seconds=int(epoch_time_s))}. "
+              f"Average speed {steps_per_epoch * FLAGS.batch_size / epoch_time_s:.1f} records/s.")

     avg_throughput = FLAGS.batch_size / metric_logger.step_time.avg

@@ -383,7 +383,7 @@ def dist_evaluate(model, data_loader):
             if timer.measured is not None:
                 metric_logger.update(step_time=timer.measured)
                 if step % print_freq == 0 and step > 0:
-                    metric_logger.print(header=F"Test: [{step}/{steps_per_epoch}]")
+                    metric_logger.print(header=f"Test: [{step}/{steps_per_epoch}]")

     if is_main_process():
         auc = utils.roc_auc_score(torch.cat(y_true), torch.sigmoid(torch.cat(y_score).float()))
28 changes: 14 additions & 14 deletions PyTorch/Recommendation/DLRM/dlrm/scripts/main.py
@@ -204,7 +204,7 @@ def main(argv):
                    'average_test_throughput': avg_test_throughput}
         dllogger.log(data=results, step=tuple())

-        print(F"Finished testing. Test Loss {loss:.4f}, auc {auc:.4f}")
+        print(f"Finished testing. Test Loss {loss:.4f}, auc {auc:.4f}")
         return

     if FLAGS.mode == 'inference_benchmark':
@@ -227,12 +227,12 @@ def main(argv):

             mean_latency = np.mean(latencies)
             mean_inference_throughput = batch_size / mean_latency
-            subresult = {F'mean_inference_latency_batch_{batch_size}': mean_latency,
-                         F'mean_inference_throughput_batch_{batch_size}': mean_inference_throughput}
+            subresult = {f'mean_inference_latency_batch_{batch_size}': mean_latency,
+                         f'mean_inference_throughput_batch_{batch_size}': mean_inference_throughput}
             results.update(subresult)
         dllogger.log(data=results, step=tuple())

-        print(F"Finished inference benchmark.")
+        print(f"Finished inference benchmark.")
         return

     if FLAGS.mode == 'train':
@@ -305,7 +305,7 @@ def train(model, loss_fn, optimizer, data_loader_train, data_loader_test, scaled
                                        decay_steps=FLAGS.decay_steps, decay_start_step=FLAGS.decay_start_step)

         if FLAGS.max_steps and global_step > FLAGS.max_steps:
-            print(F"Reached max global steps of {FLAGS.max_steps}. Stopping.")
+            print(f"Reached max global steps of {FLAGS.max_steps}. Stopping.")
             break

         if prefetching_enabled:
@@ -346,17 +346,17 @@ def train(model, loss_fn, optimizer, data_loader_train, data_loader_test, scaled
             )

         if global_step < FLAGS.benchmark_warmup_steps:
-            print(F'Warming up, step [{global_step}/{FLAGS.benchmark_warmup_steps}]')
+            print(f'Warming up, step [{global_step}/{FLAGS.benchmark_warmup_steps}]')
             continue

         eta_str = datetime.timedelta(seconds=int(metric_logger.step_time.global_avg * (steps_per_epoch - step)))
         metric_logger.print(
-            header=F"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}] eta: {eta_str}")
+            header=f"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}] eta: {eta_str}")

         if (global_step % test_freq == 0 and global_step > 0 and
                 global_step / steps_per_epoch >= FLAGS.test_after):
             loss, auc, test_step_time = evaluate(model, loss_fn, data_loader_test)
-            print(F"Epoch {epoch} step {step}. Test loss {loss:.5f}, auc {auc:.6f}")
+            print(f"Epoch {epoch} step {step}. Test loss {loss:.5f}, auc {auc:.6f}")

             if auc > best_auc:
                 best_auc = auc
@@ -366,16 +366,16 @@ def train(model, loss_fn, optimizer, data_loader_train, data_loader_test, scaled
             if FLAGS.auc_threshold and auc >= FLAGS.auc_threshold:
                 stop_time = time()
                 run_time_s = int(stop_time - start_time)
-                print(F"Hit target accuracy AUC {FLAGS.auc_threshold} at epoch "
-                      F"{global_step/steps_per_epoch:.2f} in {run_time_s}s. "
-                      F"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
+                print(f"Hit target accuracy AUC {FLAGS.auc_threshold} at epoch "
+                      f"{global_step/steps_per_epoch:.2f} in {run_time_s}s. "
+                      f"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
                 return

     stop_time = time()
     run_time_s = int(stop_time - start_time)

-    print(F"Finished training in {run_time_s}s. "
-          F"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
+    print(f"Finished training in {run_time_s}s. "
+          f"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")

     avg_throughput = FLAGS.batch_size / metric_logger.step_time.avg

@@ -441,7 +441,7 @@ def evaluate(model, loss_fn, data_loader):
             if timer.measured is not None:
                 metric_logger.update(loss=loss_value, step_time=timer.measured)
                 if step % print_freq == 0 and step > 0:
-                    metric_logger.print(header=F"Test: [{step}/{steps_per_epoch}]")
+                    metric_logger.print(header=f"Test: [{step}/{steps_per_epoch}]")

     y_true = torch.cat(y_true)
     y_score = torch.cat(y_score)
6 changes: 3 additions & 3 deletions PyTorch/Recommendation/DLRM/dlrm/scripts/utils.py
@@ -127,7 +127,7 @@ def print(self, header=None):
             header = ''
         print_str = header
         for name, meter in self.meters.items():
-            print_str += F" {name}: {meter}"
+            print_str += f" {name}: {meter}"
         print(print_str)


@@ -282,13 +282,13 @@ def roc_auc_score(y_true, y_score):
     y_true.squeeze_()
     y_score.squeeze_()
     if y_true.shape != y_score.shape:
-        raise TypeError(F"Shape of y_true and y_score must match. Got {y_true.shape()} and {y_score.shape()}.")
+        raise TypeError(f"Shape of y_true and y_score must match. Got {y_true.shape()} and {y_score.shape()}.")

     desc_score_indices = torch.argsort(y_score, descending=True)
     y_score = y_score[desc_score_indices]
     y_true = y_true[desc_score_indices]

-    distinct_value_indices = torch.nonzero(y_score[1:] - y_score[:-1]).squeeze()
+    distinct_value_indices = torch.nonzero(y_score[1:] - y_score[:-1], as_tuple=False).squeeze()
     threshold_idxs = torch.cat([distinct_value_indices, torch.tensor([y_true.numel() - 1], device=device)])

     tps = torch.cumsum(y_true, dim=0)[threshold_idxs]
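Note on the utils.py change above: besides the prefix cleanup, roc_auc_score now passes an explicit as_tuple=False to torch.nonzero. On PyTorch releases current when this commit was made (around 1.5/1.6), calling torch.nonzero without as_tuple emitted a deprecation warning; as_tuple=False keeps the single index-tensor return and silences it. A small sketch of the call as used above (the example values are illustrative):

import torch

y_score = torch.tensor([0.9, 0.9, 0.7, 0.3])  # already sorted in descending order
diffs = y_score[1:] - y_score[:-1]            # zero wherever consecutive scores tie

# Same result as torch.nonzero(diffs).squeeze(), without the deprecation warning:
distinct_value_indices = torch.nonzero(diffs, as_tuple=False).squeeze()
print(distinct_value_indices)  # tensor([1, 2]) -- positions where the score changes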
2 changes: 1 addition & 1 deletion PyTorch/Recommendation/DLRM/preproc/split_dataset.py
@@ -63,7 +63,7 @@ def split_binary_file(

     categorical_fs = []
     for i in range(len(categorical_feature_sizes)):
-        fs = open(os.path.join(output_dir, F'cat_{i}.bin'), 'wb+')
+        fs = open(os.path.join(output_dir, f'cat_{i}.bin'), 'wb+')
         categorical_fs.append(fs)
         file_streams.append(fs)

(2 more changed files not shown.)