correct use of contiguous in cpu embedding ops (pytorch#588)
Summary:
Pull Request resolved: pytorch#588

contiguous() doesn't mutate and returns a new tensor. Minimize contiguous calls by using TensorAccessor in csr2csc
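
To illustrate the point in the summary, here is a minimal standalone sketch (not code from this commit; the function and tensor names are made up) of the misuse being fixed: at::Tensor::contiguous() is a const method that returns a contiguous tensor, so calling it without capturing the return value has no effect.

#include <ATen/ATen.h>

void example(at::Tensor indices, const at::Tensor& weights) {
  // No-op: contiguous() does not modify `indices`; its return value is discarded here.
  indices.contiguous();

  // Fix: keep the returned tensor before taking a raw pointer into it.
  indices = indices.contiguous();
  const int64_t* indices_data = indices.data_ptr<int64_t>();
  (void)indices_data;

  // Alternative used by the forward kernel for `weights`: assert the precondition instead.
  TORCH_CHECK(weights.is_contiguous());
}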

Reviewed By: jianyuh

Differential Revision: D27560846

fbshipit-source-id: 87f244662e230cfc19a7ea813d0291c9379a0886
jspark1105 authored and facebook-github-bot committed Apr 5, 2021
1 parent c109fc2 commit 4c43051
Showing 3 changed files with 27 additions and 30 deletions.
12 changes: 5 additions & 7 deletions fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp
@@ -42,9 +42,11 @@ void split_embedding_backward_exact_cpu_kernel(
       batched_csc,
       num_tables,
       B,
-      offsets.data_ptr<int64_t>(),
-      indices.data_ptr<int64_t>(),
-      indice_weights.defined() ? indice_weights.data_ptr<grad_t>() : nullptr,
+      offsets.accessor<int64_t, 1>(),
+      indices.accessor<int64_t, 1>(),
+      indice_weights.defined()
+          ? indice_weights.accessor<grad_t, 1>()
+          : TensorAccessor<grad_t, 1>(nullptr, nullptr, nullptr),
       pooling_mode,
       table_to_feature_offset);
   std::vector<int>& table_ptr = batched_csc.table_ptr;
@@ -198,10 +200,6 @@ void split_embedding_backward_exact_cpu_dense_kernel(
   const auto momentum2_offsets_data = momentum2_offsets.accessor<int64_t, 1>();
   {% endif %}
 
-  offsets.contiguous();
-  indices.contiguous();
-  indice_weights.contiguous();
-
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       host_weights.scalar_type(), "split_embedding_backward_exact_cpu", [&]() {
         split_embedding_backward_exact_cpu_kernel<scalar_t>(
39 changes: 19 additions & 20 deletions fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp
@@ -81,18 +81,18 @@ void split_embedding_forward_cpu_kernel(
   int64_t B = (offsets.size(0) - 1) / T;
   TORCH_CHECK(B > 0);
 
-  offsets.contiguous();
-  indices.contiguous();
-  weights.contiguous();
+  TORCH_CHECK(weights.is_contiguous());
+  indices = indices.contiguous();
+  offsets = offsets.contiguous();
   if (indice_weights.defined()) {
-    indice_weights.contiguous();
+    indice_weights = indice_weights.contiguous();
   }
 
   const auto D_offsets_data = D_offsets.accessor<int, 1>();
   const auto weights_offsets_data = weights_offsets.accessor<int64_t, 1>();
-  const auto hash_size_cumsum_data = hash_size_cumsum.accessor<int64_t, 1>();
-  const auto offsets_data = offsets.data_ptr<int64_t>();
   const auto indices_data = indices.data_ptr<int64_t>();
+  const auto offsets_data = offsets.data_ptr<int64_t>();
+  const auto hash_size_cumsum_data = hash_size_cumsum.accessor<int64_t, 1>();
 
   const auto weights_data = weights.data_ptr<weights_t>();
   // If indice_weights not defined, then this accessor won't be used.
@@ -344,16 +344,17 @@ void batched_csr2csc(
     int num_tables, // number of tables, not number of features
     int B,
     // TODO: use accessor for the following 3 parameters
-    const int64_t* batched_csr_offsets,
-    const int64_t* batched_csr_indices,
-    const scalar_t* batched_csr_weights,
+    const TensorAccessor<int64_t, 1>& batched_csr_offsets,
+    const TensorAccessor<int64_t, 1>& batched_csr_indices,
+    const TensorAccessor<scalar_t, 1>& batched_csr_weights,
     int64_t pooling_mode,
     const int* table_to_feature_offset) {
   batched_csc.num_tables = num_tables;
   batched_csc.table_ptr.resize(num_tables + 1);
   int64_t nnz = batched_csr_offsets[table_to_feature_offset[num_tables] * B];
   batched_csc.row_indices.resize(nnz);
-  if (batched_csr_weights || pooling_mode == MEAN) {
+  bool has_weights = batched_csr_weights.data() != nullptr;
+  if (has_weights || pooling_mode == MEAN) {
     batched_csc.weights.resize(nnz);
   }
 
@@ -372,13 +373,11 @@
         int64_t L = pool_end - pool_begin;
         // MEAN pooling will not work with indice_weights!
         double scale_factor =
-            (pooling_mode == MEAN && !batched_csr_weights && L > 0) ? 1.0 / L
-                                                                    : 1.0;
+            (pooling_mode == MEAN && !has_weights && L > 0) ? 1.0 / L : 1.0;
         for (int64_t p = pool_begin; p < pool_end; ++p) {
           non_empty_columns[batched_csr_indices[p]].emplace_back(
               feature * B + b,
-              scale_factor *
-                  (batched_csr_weights ? batched_csr_weights[p] : 1.0f));
+              scale_factor * (has_weights ? batched_csr_weights[p] : 1.0f));
         }
       }
     } // for each feature
@@ -408,19 +407,19 @@ template void batched_csr2csc<float>(
     BatchedHyperCompressedSparseColumn& batched_csc,
     int T,
     int B,
-    const int64_t* batched_csr_offsets,
-    const int64_t* batched_csr_indices,
-    const float* batched_csr_weights,
+    const TensorAccessor<int64_t, 1>& batched_csr_offsets,
+    const TensorAccessor<int64_t, 1>& batched_csr_indices,
+    const TensorAccessor<float, 1>& batched_csr_weights,
     int64_t pooling_mode,
     const int* table_to_feature_offset);
 
 template void batched_csr2csc<double>(
     BatchedHyperCompressedSparseColumn& batched_csc,
     int T,
     int B,
-    const int64_t* batched_csr_offsets,
-    const int64_t* batched_csr_indices,
-    const double* batched_csr_weights,
+    const TensorAccessor<int64_t, 1>& batched_csr_offsets,
+    const TensorAccessor<int64_t, 1>& batched_csr_indices,
+    const TensorAccessor<double, 1>& batched_csr_weights,
     int64_t pooling_mode,
     const int* table_to_feature_offset);
 
6 changes: 3 additions & 3 deletions fbgemm_gpu/codegen/embedding_forward_split_cpu.h
@@ -51,9 +51,9 @@ void batched_csr2csc(
     BatchedHyperCompressedSparseColumn& batched_csc,
     int num_tables, // number of tables, not number of features
     int B,
-    const int64_t* batched_csr_offsets,
-    const int64_t* batched_csr_indices,
-    const scalar_t* batched_csr_weights,
+    const at::TensorAccessor<int64_t, 1>& batched_csr_offsets,
+    const at::TensorAccessor<int64_t, 1>& batched_csr_indices,
+    const at::TensorAccessor<scalar_t, 1>& batched_csr_weights,
     int64_t pooling_mode,
     const int* table_to_feature_offset);
 } // namespace internal
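
For readers unfamiliar with the accessor-based signature above, here is a hedged sketch (simplified, hypothetical helper names; assumes a float tensor; not code from this commit) of the pattern the diff uses: an optional tensor is passed as a TensorAccessor, a missing tensor is represented by an accessor built from nullptrs, and the callee detects it via data().

#include <ATen/ATen.h>

// Hypothetical helper: sums per-row weights, treating a missing tensor as all-ones.
template <typename scalar_t>
double sum_weights(const at::TensorAccessor<scalar_t, 1>& weights, int64_t n) {
  if (weights.data() == nullptr) { // empty accessor => no weights were provided
    return static_cast<double>(n);
  }
  double acc = 0.0;
  for (int64_t i = 0; i < n; ++i) {
    acc += weights[i];
  }
  return acc;
}

double run(const at::Tensor& indice_weights, int64_t n) {
  // Mirror of the call-site pattern above: pass a real accessor when the tensor
  // is defined, otherwise an "empty" accessor constructed from nullptrs.
  return indice_weights.defined()
      ? sum_weights<float>(indice_weights.accessor<float, 1>(), n)
      : sum_weights<float>(
            at::TensorAccessor<float, 1>(nullptr, nullptr, nullptr), n);
}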
