Skip to content

Commit

Permalink
fbgemm handles block size 1 sparse adagrad (pytorch#306)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: pytorch#306

Make fbgemm behave the same for block size 1 as the existing Caffe2 code, so we don't need to handle block size 1 separately in the Caffe2 code and can just call fbgemm.
Inside fbgemm, rely on the compiler to generate efficient code for SparseAdaGrad with block size 1 instead of JIT'ing.

Reviewed By: jianyuh

Differential Revision: D19246900

fbshipit-source-id: 5fb3e03d7d9a9a8f7ed884616f5ce20e4e903b0b
  • Loading branch information
jspark1105 authored and facebook-github-bot committed Feb 27, 2020
1 parent e1b1a55 commit 967d4bc
Showing 1 changed file with 64 additions and 0 deletions.
64 changes: 64 additions & 0 deletions src/SparseAdagrad.cc
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,57 @@ GenSparseAdagrad<indxType, instSet>::getOrCreate(
});
} // getOrCreate

// Specialization for block size 1, internally called by GenerateSparseAdaGrad.
// Plain scalar code: for a single element per row, letting the compiler
// optimize this loop is preferred over JIT'ing a kernel.
//
// Returns num_rows on success; if an index is out of range, stops and
// returns the number of rows successfully processed before it.
template <typename IndexType>
int SparseAdaGradBlockSize1_(
    int num_rows, // number of rows reading
    std::uint64_t param_size, // total number of parameters
    float* w, // input/output parameters
    const float* g, // input gradients
    float* h, // input/output momentums
    const IndexType* indices, // indices of each row
    float epsilon,
    float lr,
    bool rowwise) {
  for (int row = 0; row < num_rows; ++row) {
    const IndexType idx = indices[row];
    // Bail out at the first out-of-range index; the return value tells the
    // caller how many rows were applied.
    if (idx >= param_size) {
      return row;
    }

    const float grad = g[row];
    // Accumulate the squared gradient into the momentum slot in place.
    h[idx] += grad * grad;
    const float denom = std::sqrt(h[idx]) + epsilon;
    // The two branches are mathematically equal for block size 1 but keep
    // the exact floating-point evaluation order of the Caffe2 rowwise and
    // elementwise variants, so results match bit-for-bit.
    if (rowwise) {
      w[idx] += lr / denom * grad;
    } else {
      w[idx] += lr * grad / denom;
    }
  }
  return num_rows;
}

// Explicit instantiation for 64-bit indices, so the definition above can
// stay in this translation unit while GenerateSparseAdaGrad links against it.
template int SparseAdaGradBlockSize1_(
    int num_rows, // number of rows reading
    std::uint64_t param_size, // total number of parameters
    float* w, // input/output parameters
    const float* g, // input gradients
    float* h, // input/output momentums
    const std::int64_t* indices, // indices of each row
    float epsilon,
    float lr,
    bool rowwise);

// Explicit instantiation for 32-bit indices (see the 64-bit one above).
template int SparseAdaGradBlockSize1_(
    int num_rows, // number of rows reading
    std::uint64_t param_size, // total number of parameters
    float* w, // input/output parameters
    const float* g, // input gradients
    float* h, // input/output momentums
    const std::int32_t* indices, // indices of each row
    float epsilon,
    float lr,
    bool rowwise);

} // namespace

template <typename IndexType>
Expand All @@ -652,6 +703,19 @@ typename SparseAdaGradSignature<IndexType>::Type GenerateSparseAdaGrad(
}

if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
if (block_size == 1) {
return [=](int num_rows, // number of rows reading
std::uint64_t param_size, // total number of parameters
float* w, // input/output parameters
const float* g, // input gradients
float* h, // input/output momentums
const IndexType* indices, // indices of each row
float epsilon,
float lr) {
return SparseAdaGradBlockSize1_(
num_rows, param_size, w, g, h, indices, epsilon, lr, rowwise);
};
}
static GenSparseAdagrad<IndexType, inst_set_t::avx2> kernel_generator;
constexpr int VLEN = simd_info<inst_set_t::avx2>::WIDTH_32BIT_ELEMS;
const int* mask_avx2 = &internal::avx2_ps_or_epi32_combined_mask
Expand Down

0 comments on commit 967d4bc

Please sign in to comment.