correct use of contiguous in cpu embedding ops (pytorch#588)
Summary:
Pull Request resolved: pytorch#588

contiguous() doesn't mutate and returns a new tensor. Minimize contiguous calls by using TensorAccessor in csr2csc
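
To illustrate the point in the summary, here is a minimal standalone sketch (not code from this commit; the function and tensor names are made up) of the misuse being fixed: at::Tensor::contiguous() is a const method that returns a contiguous tensor, so calling it without capturing the return value has no effect.

#include <ATen/ATen.h>

void example(at::Tensor indices, const at::Tensor& weights) {
  // No-op: contiguous() does not modify `indices`; its return value is discarded here.
  indices.contiguous();

  // Fix: keep the returned tensor before taking a raw pointer into it.
  indices = indices.contiguous();
  const int64_t* indices_data = indices.data_ptr<int64_t>();
  (void)indices_data;

  // Alternative used by the forward kernel for `weights`: assert the precondition instead.
  TORCH_CHECK(weights.is_contiguous());
}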

Reviewed By: jianyuh

Differential Revision: D27560846

fbshipit-source-id: 87f244662e230cfc19a7ea813d0291c9379a0886
jspark1105 authored and facebook-github-bot committed Apr 5, 2021
1 parent c109fc2 commit 4c43051
Showing 3 changed files with 27 additions and 30 deletions.
12 changes: 5 additions & 7 deletions fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp
@@ -42,9 +42,11 @@ void split_embedding_backward_exact_cpu_kernel(
       batched_csc,
       num_tables,
       B,
-      offsets.data_ptr<int64_t>(),
-      indices.data_ptr<int64_t>(),
-      indice_weights.defined() ? indice_weights.data_ptr<grad_t>() : nullptr,
+      offsets.accessor<int64_t, 1>(),
+      indices.accessor<int64_t, 1>(),
+      indice_weights.defined()
+          ? indice_weights.accessor<grad_t, 1>()
+          : TensorAccessor<grad_t, 1>(nullptr, nullptr, nullptr),
       pooling_mode,
       table_to_feature_offset);
   std::vector<int>& table_ptr = batched_csc.table_ptr;
@@ -198,10 +200,6 @@ void split_embedding_backward_exact_cpu_dense_kernel(
   const auto momentum2_offsets_data = momentum2_offsets.accessor<int64_t, 1>();
   {% endif %}
 
-  offsets.contiguous();
-  indices.contiguous();
-  indice_weights.contiguous();
-
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       host_weights.scalar_type(), "split_embedding_backward_exact_cpu", [&]() {
         split_embedding_backward_exact_cpu_kernel<scalar_t>(
39 changes: 19 additions & 20 deletions fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp
@@ -81,18 +81,18 @@ void split_embedding_forward_cpu_kernel(
   int64_t B = (offsets.size(0) - 1) / T;
   TORCH_CHECK(B > 0);
 
-  offsets.contiguous();
-  indices.contiguous();
-  weights.contiguous();
+  TORCH_CHECK(weights.is_contiguous());
+  indices = indices.contiguous();
+  offsets = offsets.contiguous();
   if (indice_weights.defined()) {
-    indice_weights.contiguous();
+    indice_weights = indice_weights.contiguous();
   }
 
   const auto D_offsets_data = D_offsets.accessor<int, 1>();
   const auto weights_offsets_data = weights_offsets.accessor<int64_t, 1>();
-  const auto hash_size_cumsum_data = hash_size_cumsum.accessor<int64_t, 1>();
-  const auto offsets_data = offsets.data_ptr<int64_t>();
   const auto indices_data = indices.data_ptr<int64_t>();
+  const auto offsets_data = offsets.data_ptr<int64_t>();
+  const auto hash_size_cumsum_data = hash_size_cumsum.accessor<int64_t, 1>();
 
   const auto weights_data = weights.data_ptr<weights_t>();
   // If indice_weights not defined, then this accessor won't be used.
@@ -344,16 +344,17 @@ void batched_csr2csc(
     int num_tables, // number of tables, not number of features
     int B,
     // TODO: use accessor for the following 3 parameters
-    const int64_t* batched_csr_offsets,
-    const int64_t* batched_csr_indices,
-    const scalar_t* batched_csr_weights,
+    const TensorAccessor<int64_t, 1>& batched_csr_offsets,
+    const TensorAccessor<int64_t, 1>& batched_csr_indices,
+    const TensorAccessor<scalar_t, 1>& batched_csr_weights,
     int64_t pooling_mode,
     const int* table_to_feature_offset) {
   batched_csc.num_tables = num_tables;
   batched_csc.table_ptr.resize(num_tables + 1);
   int64_t nnz = batched_csr_offsets[table_to_feature_offset[num_tables] * B];
   batched_csc.row_indices.resize(nnz);
-  if (batched_csr_weights || pooling_mode == MEAN) {
+  bool has_weights = batched_csr_weights.data() != nullptr;
+  if (has_weights || pooling_mode == MEAN) {
     batched_csc.weights.resize(nnz);
   }
 
@@ -372,13 +373,11 @@
         int64_t L = pool_end - pool_begin;
         // MEAN pooling will not work with indice_weights!
         double scale_factor =
-            (pooling_mode == MEAN && !batched_csr_weights && L > 0) ? 1.0 / L
-                                                                    : 1.0;
+            (pooling_mode == MEAN && !has_weights && L > 0) ? 1.0 / L : 1.0;
         for (int64_t p = pool_begin; p < pool_end; ++p) {
           non_empty_columns[batched_csr_indices[p]].emplace_back(
               feature * B + b,
-              scale_factor *
-                  (batched_csr_weights ? batched_csr_weights[p] : 1.0f));
+              scale_factor * (has_weights ? batched_csr_weights[p] : 1.0f));
         }
       }
     } // for each feature
@@ -408,19 +407,19 @@ template void batched_csr2csc<float>(
     BatchedHyperCompressedSparseColumn& batched_csc,
     int T,
     int B,
-    const int64_t* batched_csr_offsets,
-    const int64_t* batched_csr_indices,
-    const float* batched_csr_weights,
+    const TensorAccessor<int64_t, 1>& batched_csr_offsets,
+    const TensorAccessor<int64_t, 1>& batched_csr_indices,
+    const TensorAccessor<float, 1>& batched_csr_weights,
     int64_t pooling_mode,
     const int* table_to_feature_offset);
 
 template void batched_csr2csc<double>(
     BatchedHyperCompressedSparseColumn& batched_csc,
     int T,
     int B,
-    const int64_t* batched_csr_offsets,
-    const int64_t* batched_csr_indices,
-    const double* batched_csr_weights,
+    const TensorAccessor<int64_t, 1>& batched_csr_offsets,
+    const TensorAccessor<int64_t, 1>& batched_csr_indices,
+    const TensorAccessor<double, 1>& batched_csr_weights,
     int64_t pooling_mode,
     const int* table_to_feature_offset);
 
6 changes: 3 additions & 3 deletions fbgemm_gpu/codegen/embedding_forward_split_cpu.h
@@ -51,9 +51,9 @@ void batched_csr2csc(
     BatchedHyperCompressedSparseColumn& batched_csc,
     int num_tables, // number of tables, not number of features
     int B,
-    const int64_t* batched_csr_offsets,
-    const int64_t* batched_csr_indices,
-    const scalar_t* batched_csr_weights,
+    const at::TensorAccessor<int64_t, 1>& batched_csr_offsets,
+    const at::TensorAccessor<int64_t, 1>& batched_csr_indices,
+    const at::TensorAccessor<scalar_t, 1>& batched_csr_weights,
     int64_t pooling_mode,
     const int* table_to_feature_offset);
 } // namespace internal
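
For readers unfamiliar with the accessor-based signature above, here is a hedged sketch (simplified, hypothetical helper names; assumes a float tensor; not code from this commit) of the pattern the diff uses: an optional tensor is passed as a TensorAccessor, a missing tensor is represented by an accessor built from nullptrs, and the callee detects it via data().

#include <ATen/ATen.h>

// Hypothetical helper: sums per-row weights, treating a missing tensor as all-ones.
template <typename scalar_t>
double sum_weights(const at::TensorAccessor<scalar_t, 1>& weights, int64_t n) {
  if (weights.data() == nullptr) { // empty accessor => no weights were provided
    return static_cast<double>(n);
  }
  double acc = 0.0;
  for (int64_t i = 0; i < n; ++i) {
    acc += weights[i];
  }
  return acc;
}

double run(const at::Tensor& indice_weights, int64_t n) {
  // Mirror of the call-site pattern above: pass a real accessor when the tensor
  // is defined, otherwise an "empty" accessor constructed from nullptrs.
  return indice_weights.defined()
      ? sum_weights<float>(indice_weights.accessor<float, 1>(), n)
      : sum_weights<float>(
            at::TensorAccessor<float, 1>(nullptr, nullptr, nullptr), n);
}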
