
Commit 3139722

crcrpar authored and pytorchmergebot committed
[foreach][mta] Inplace maximum and minimum (pytorch#82523)
### Description

Implement `torch._foreach_maximum_` and `torch._foreach_minimum_`, mainly so that `_multi_tensor_adam` and `_multi_tensor_adamw` with `amsgrad=True` correctly update their `max_exp_avg_sqs`.

### Issue

- pytorch#78807
- pytorch#81894
- pytorch#81348
- pytorch#81705
- pytorch#58833
- pytorch#68041

### Testing

Updated `test_foreach.py::TestForeach::_minmax_test` to compare the outputs of `_foreach_maximum_` (and `_foreach_minimum_`) against those of `[torch.maximum(a, b) for a, b in zip(tensors1, tensors2)]`.

cc @ngimel @albanD @mikaylagawarecki

Pull Request resolved: pytorch#82523
Approved by: https://github.com/albanD
1 parent 9647bec commit 3139722
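For orientation, here is a minimal sketch (not part of the diff) of the semantics the new in-place variants are tested against: each tensor in the first list is overwritten with the elementwise maximum, matching the reference loop from the Testing section above.

```python
import torch

xs = [torch.randn(3) for _ in range(4)]
ys = [torch.randn(3) for _ in range(4)]

# Reference, as in the updated test: out-of-place per-tensor maximum.
expected = [torch.maximum(a, b) for a, b in zip(xs, ys)]

torch._foreach_maximum_(xs, ys)  # mutates each tensor in xs in place

for x, e in zip(xs, expected):
    assert torch.equal(x, e)
```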

File tree: 7 files changed (+69 −12)

aten/src/ATen/native/ForeachOpsKernels.cpp (+10)

```diff
@@ -199,6 +199,9 @@ FOREACH_POINTWISE_OP_SCALAR(addcmul);
 FOREACH_POINTWISE_OP_SCALARLIST(addcdiv);
 FOREACH_POINTWISE_OP_SCALARLIST(addcmul);
 
+// NOTE(crcrpar): It didn't seem feasible to use `self[i]` as both the first and the last
+// argument of `maximum_out` and `minimum_out`, so, tentatively, the result is computed
+// into a temporary and then copied to `self[i]`.
 #define FOREACH_MAXIMUM_MINIMUM_OP(NAME) \
 std::vector<Tensor> foreach_tensor_##NAME##_slow(TensorList tensors1, TensorList tensors2) { \
   check_foreach_api_restrictions(tensors1, tensors2); \
@@ -211,6 +214,13 @@ std::vector<Tensor> foreach_tensor_##NAME##_slow(TensorList tensors1, TensorList
   \
   return result; \
 } \
+void foreach_tensor_##NAME##_slow_(TensorList self, TensorList other) { \
+  check_foreach_api_restrictions(self, other); \
+  for (const auto i : c10::irange(self.size())) { \
+    const auto tmp = at::NAME(self[i], other[i]); \
+    self[i].copy_(tmp, /* non_blocking */ true); \
+  } \
+}
 
 FOREACH_MAXIMUM_MINIMUM_OP(maximum)
 FOREACH_MAXIMUM_MINIMUM_OP(minimum)
```
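In Python terms, the macro-generated slow path above behaves like the following sketch (the function and variable names here are illustrative, not from the kernel): compute the out-of-place result into a temporary, then copy it back into `self[i]`, as the NOTE explains.

```python
import torch

def foreach_maximum_slow_(self_list, other_list):
    # Mirror of the C++ loop: out-of-place result first, then copy back,
    # because self[i] cannot serve as both input and output of maximum_out.
    for s, o in zip(self_list, other_list):
        tmp = torch.maximum(s, o)
        s.copy_(tmp)
```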

aten/src/ATen/native/cuda/ForeachPointwiseOp.cu (+28 −3)

```diff
@@ -188,20 +188,45 @@ std::vector<Tensor> foreach_tensor_##NAME##_cuda(TensorList tensors1, TensorList
   tensor_lists.emplace_back(std::move(vec_res)); \
   \
   AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, tensors1[0].scalar_type(), "foreach_maximum_minimum_op_cuda", [&]() { \
-    using opmath_t = at::opmath_type<scalar_t>; \
+      using opmath_t = at::opmath_type<scalar_t>; \
       auto op = [] GPU_LAMBDA (opmath_t a, opmath_t b) -> opmath_t { \
         opmath_t c = a OP b ? a : b; \
         if (_isnan(a)) { \
           c = a; \
         } \
         return c;}; \
       multi_tensor_apply<3>(tensor_lists, \
-                            PointwiseOpListFunctor<scalar_t, 3>(), \
-                            op); \
+                            BinaryOpListAlphaFunctor<scalar_t, 3, 2, 2>(), \
+                            op, \
+                            opmath_t(1)); \
   }); \
   \
   return tensor_lists[2]; \
 } \
+\
+void foreach_tensor_##NAME##_cuda_(TensorList self, TensorList other) { \
+  check_foreach_api_restrictions(self, other); \
+  if (!can_use_fast_route({self, other}) || has_bool_tensor(self)) { \
+    return at::native::foreach_tensor_##NAME##_slow_(self, other); \
+  } \
+  \
+  AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, self[0].scalar_type(), "foreach_maximum_minimum_op_cuda_", \
+                             [&]() { \
+    using opmath_t = at::opmath_type<scalar_t>; \
+    std::vector<std::vector<at::Tensor>> tensor_lists{self.vec(), other.vec()}; \
+    auto op = [] GPU_LAMBDA (opmath_t a, opmath_t b) -> opmath_t { \
+      opmath_t c = a OP b ? a : b; \
+      if (_isnan(a)) { \
+        c = a; \
+      } \
+      return c; \
+    }; \
+    multi_tensor_apply<2>(tensor_lists, \
+                          BinaryOpListAlphaFunctor<scalar_t, 2, 2, 0>(), \
+                          op, \
+                          opmath_t(1)); \
+  }); \
+} \
 
 FOREACH_MAXIMUM_MINIMUM_OP(maximum, >)
 FOREACH_MAXIMUM_MINIMUM_OP(minimum, <)
```
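The GPU lambda keeps `a` whenever it is NaN, so NaNs propagate exactly as they do through `torch.maximum` and `torch.minimum`. A quick check of that behavior, assuming a CUDA device is available (a sketch, not from the test suite):

```python
import torch

a = torch.tensor([1.0, float("nan")], device="cuda")
b = torch.tensor([2.0, 0.0], device="cuda")

torch._foreach_maximum_([a], [b])
print(a)  # tensor([2., nan], device='cuda:0'): the NaN in `a` is kept
```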

aten/src/ATen/native/native_functions.yaml (+16)

```diff
@@ -9412,13 +9412,29 @@
     CPU: foreach_tensor_maximum_slow
     CUDA: foreach_tensor_maximum_cuda
 
+- func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensors are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_maximum_slow_
+    CUDA: foreach_tensor_maximum_cuda_
+  autogen: _foreach_maximum.List_out
+
 - func: _foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[]
   device_check: NoCheck   # foreach kernels fall back to slow path when tensors are on different devices
   variants: function
   dispatch:
     CPU: foreach_tensor_minimum_slow
     CUDA: foreach_tensor_minimum_cuda
 
+- func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensors are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_minimum_slow_
+    CUDA: foreach_tensor_minimum_cuda_
+  autogen: _foreach_minimum.List_out
+
 - func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[]
   device_check: NoCheck   # foreach kernels fall back to slow path when tensors are on different devices
   variants: function
```
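Per the schema, `Tensor(a!)[] self` marks the first list as mutated and `-> ()` means the Python binding returns `None`, unlike the out-of-place `.List` variant, which returns a fresh `Tensor[]`. A small sketch of the difference:

```python
import torch

xs = [torch.zeros(2)]
ys = [torch.ones(2)]

out = torch._foreach_maximum(xs, ys)   # out-of-place: returns new tensors
ret = torch._foreach_maximum_(xs, ys)  # in-place: mutates xs, returns None

assert ret is None
assert torch.equal(xs[0], torch.ones(2))
assert torch.equal(out[0], torch.ones(2))
```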

test/test_foreach.py (+10 −4)

```diff
@@ -97,7 +97,7 @@ def is_cuda(self):
     # note(mkozuki): It might be the case that the expected number of `cudaLaunchKernel`s
     # is greater than 1 once foreach functions internally separate their input `TensorList`s by
     # devices & dtypes into vectors of tensors.
-    def _get_funcs(self, op, n_expected_cudaLaunchKernels):
+    def _get_funcs(self, op, n_expected_cudaLaunchKernels: int):
         return (
             ForeachFuncWrapper(op.method_variant, n_expected_cudaLaunchKernels),
             RegularFuncWrapper(op.ref),
@@ -370,11 +370,17 @@ def test_unary_slowpath(self, device, dtype, op):
         for N in N_values:
             self._test_unary(device, dtype, op, N, is_fastpath=False)
 
+    # note(crcrpar): `torch.maximum` and `torch.minimum` support an `out` argument but have
+    # no in-place variants, so compare `inplace_op`'s results with `ref`'s outputs.
     def _minmax_test(self, opinfo, inputs, is_fastpath, n_expected_cudaLaunchKernels):
-        op, ref, _, _ = self._get_funcs(opinfo, n_expected_cudaLaunchKernels)
-        self.assertEqual(ref(inputs), op(inputs, self.is_cuda, is_fastpath))
+        op, ref, inplace_op, _ = self._get_funcs(opinfo, n_expected_cudaLaunchKernels)
+        expected = ref(inputs)
+        self.assertEqual(expected, op(inputs, self.is_cuda, is_fastpath))
+
+        inplace_inputs = [[t.clone() for t in inputs[0]], inputs[1]]
+        inplace_op(inplace_inputs, self.is_cuda, is_fastpath)
+        self.assertEqual(expected, inplace_inputs[0])
 
-    # note(mkozuki): in-place of foreach_minimum and foreach_maximum aren't implemented.
     @ops(foreach_minmax_op_db)
     def test_minmax_fastpath(self, device, dtype, op):
         for N in N_values:
```

torch/optim/adam.py (+2 −2)

```diff
@@ -377,7 +377,7 @@ def _multi_tensor_adam(params: List[Tensor],
 
         if amsgrad:
             # Maintains the maximum of all 2nd moment running avg. till now
-            max_exp_avg_sqs = torch._foreach_maximum(max_exp_avg_sqs, exp_avg_sqs)  # type: ignore[assignment]
+            torch._foreach_maximum_(max_exp_avg_sqs, exp_avg_sqs)  # type: ignore[assignment]
 
             # Use the max. for normalizing running avg. of gradient
             max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
@@ -405,7 +405,7 @@ def _multi_tensor_adam(params: List[Tensor],
 
         if amsgrad:
             # Maintains the maximum of all 2nd moment running avg. till now
-            max_exp_avg_sqs = torch._foreach_maximum(max_exp_avg_sqs, exp_avg_sqs)  # type: ignore[assignment]
+            torch._foreach_maximum_(max_exp_avg_sqs, exp_avg_sqs)
 
             # Use the max. for normalizing running avg. of gradient
             max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
```
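This is the heart of the fix: the old out-of-place call rebound the local name `max_exp_avg_sqs` to freshly allocated tensors, so the tensors held in the optimizer state were never updated; the in-place call writes through to those state tensors. A minimal sketch of the aliasing difference (values are illustrative):

```python
import torch

max_exp_avg_sqs = [torch.tensor([0.5])]
exp_avg_sqs = [torch.tensor([0.9])]
state_tensor = max_exp_avg_sqs[0]  # alias, as held by the optimizer state

# Out-of-place op returns new tensors; the aliased state tensor stays stale.
new_list = torch._foreach_maximum(max_exp_avg_sqs, exp_avg_sqs)
assert torch.equal(state_tensor, torch.tensor([0.5]))  # unchanged!

# In-place op updates the state tensor itself.
torch._foreach_maximum_(max_exp_avg_sqs, exp_avg_sqs)
assert torch.equal(state_tensor, torch.tensor([0.9]))
```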

torch/optim/adamw.py (+2 −2)

```diff
@@ -369,7 +369,7 @@ def _multi_tensor_adamw(params: List[Tensor],
 
         if amsgrad:
             # Maintains the maximum of all 2nd moment running avg. till now
-            max_exp_avg_sqs = torch._foreach_maximum(max_exp_avg_sqs, exp_avg_sqs)  # type: ignore[assignment]
+            torch._foreach_maximum_(max_exp_avg_sqs, exp_avg_sqs)
 
             # Use the max. for normalizing running avg. of gradient
             max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
@@ -397,7 +397,7 @@ def _multi_tensor_adamw(params: List[Tensor],
 
         if amsgrad:
             # Maintains the maximum of all 2nd moment running avg. till now
-            max_exp_avg_sqs = torch._foreach_maximum(max_exp_avg_sqs, exp_avg_sqs)  # type: ignore[assignment]
+            torch._foreach_maximum_(max_exp_avg_sqs, exp_avg_sqs)
 
             # Use the max. for normalizing running avg. of gradient
             max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
```

torch/testing/_internal/common_methods_invocations.py (+1 −1)

```diff
@@ -6726,7 +6726,7 @@ def sample_inputs_foreach(self, device, dtype, N, *, noncontiguous=False, same_s
 def get_foreach_method_names(name):
     # get torch inplace reference function
     op_name = "_foreach_" + name
-    inplace_op_name = "_foreach_" + name + "_"
+    inplace_op_name = op_name + "_"
 
     op = getattr(torch, op_name, None)
     inplace_op = getattr(torch, inplace_op_name, None)
```
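A quick sketch of what the refactored helper resolves for `name = "maximum"` (the in-place attribute only exists once this PR lands):

```python
import torch

name = "maximum"
op_name = "_foreach_" + name
inplace_op_name = op_name + "_"  # reuses op_name instead of rebuilding the string

op = getattr(torch, op_name, None)                  # torch._foreach_maximum
inplace_op = getattr(torch, inplace_op_name, None)  # torch._foreach_maximum_
assert op is not None and inplace_op is not None
```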
