fix gconv + acc16 (#59541)

Summary: Pull Request resolved: pytorch/pytorch#59541 Pull Request resolved: pytorch#621 Fixing 2 issues. These are actually 2 independent issues, one in Caffe2 and another in FBGEMM, so there is no need to wait until FBGEMM is synchronized with PyTorch. 1) conv 16-bit accumulation doesn't support the fast gconv path, so TakeGConvFastPath_ should honor it. 2) packed_index_ generates indices up to (G/GTogether_) * F * R * S * OC_per_G * GTogether_ * paddedICPerG (with G/GTogether_ rounded up), which can exceed the G * kernel_prod * OC_per_G * paddedICPerG allocated in PackWeightMatrixForGConv (kernel_prod = F * R * S): e.g., when G=3, GTogether_=2, we allocate 3 * F * R * S * OC_per_G * paddedICPerG but we access up to 2 * F * R * S * OC_per_G * 2 * paddedICPerG. BTW, not sure how we haven't known about this issue for so long. Any idea will be really appreciated. Reviewed By: dskhudia Differential Revision: D28927214 fbshipit-source-id: 3ec98ea2fc177545392a0148daca592d80f40ad3
classicvalues · Jun 8, 2021 · 70a4042 · 70a4042
1 parent 2ddcfb6
commit 70a4042
Showing 1 changed file with 7 additions and 6 deletions.
diff --git a/src/PackWeightMatrixForGConv.cc b/src/PackWeightMatrixForGConv.cc
@@ -24,6 +24,10 @@ PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::PackWeightMatrixForGConv(
   if (!cpuinfo_initialize()) {
     throw std::runtime_error("Failed to initialize cpuinfo!");
   }
+  GTogether_ = numOfGroupsTogether(conv_param_);
+  assert(
+      GTogether_ <= conv_param_.G &&
+      "Number of groups together smaller than total number of groups");
   if (!pdata) {
     bufAllocatedHere_ = true;
     int kernel_prod = std::accumulate(
@@ -32,17 +36,14 @@ PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::PackWeightMatrixForGConv(
     int paddedICPerG = ((conv_param_.IC / conv_param_.G) + 3) / 4 * 4;
     pdata_ = static_cast<T*>(fbgemmAlignedAlloc(
         64,
-        conv_param_.G * kernel_prod * (conv_param_.OC / conv_param_.G) *
-            paddedICPerG * sizeof(T)));
+        (conv_param_.G + GTogether_ - 1) / GTogether_ * GTogether_ *
+            kernel_prod * (conv_param_.OC / conv_param_.G) * paddedICPerG *
+            sizeof(T)));
   } else {
     bufAllocatedHere_ = false;
     pdata_ = pdata;
   }
 
-  GTogether_ = numOfGroupsTogether(conv_param_);
-  assert(
-      GTogether_ <= conv_param_.G &&
-      "Number of groups together smaller than total number of groups");
   pack();
 }