Revert D25607505: Add formulas and basic tests
Test Plan: revert-hammer

Differential Revision: D25607505 (pytorch@70f5905)

Original commit changeset: fe2315d58768

fbshipit-source-id: 519d7426a6f32f0db51c4f360e5d5a79dbaac99d
samestep authored and facebook-github-bot committed Apr 14, 2021
1 parent ed03a07 commit 817fd93
Showing 6 changed files with 17 additions and 680 deletions.
253 changes: 0 additions & 253 deletions test/test_autograd.py
@@ -21,9 +21,6 @@
# Autograd tests use double as the default dtype
torch.set_default_dtype(torch.double)

# TODO(alband) Remove this when this flag is not needed anymore
torch._C._set_forward_AD_enabled(True)

from torch import nn
from torch._six import inf, nan
from torch.autograd.function import once_differentiable
@@ -6974,13 +6971,6 @@ def foo(a):
self.assertEqual(vhp, torch.mm(v.unsqueeze(0), hes).squeeze(0))

class TestAutogradForwardMode(TestCase):
def tearDown(self):
# Ensure that a failing test won't make others fail
while fwAD._current_level >= 0:
fwAD.exit_dual_level()

super().tearDown()

def test_forward_level_cleanup(self):
def get_tensor_and_weak_ref():
# Create a new Tensor and weak reference
@@ -7026,249 +7016,6 @@ def test_size_check(self):

dual = fwAD.make_dual(foo, tangent[1:])

# The following test functions want to ensure all the following behaviors:
# - Ensure that default level system in the python binding works
# - Ensure that only level 0 exists and nesting is properly disabled
# - Ensure that printing works fine
# - Ensure that basic packing/unpacking works
# - Ensure that advanced packing/unpacking works
# - For memory / version counter share
# - For backward AD (regular ops)
# - Ensure that view + inplace for both modes work fine
# - Ensure we do proper cleanup on exit of a level

def test_default_level(self):
foo = torch.rand(2)
bar = torch.rand(2)

with fwAD.dual_level():
baz = fwAD.make_dual(foo, bar)
baz_primal, baz_tangent = fwAD.unpack_dual(baz)
self.assertEqual(baz_primal, foo)
# We don't actually need to enforce that these two are the exact same python
# object, feel free to relax in the future
self.assertIs(baz_tangent, bar)

baz_primal, baz_tangent = fwAD.unpack_dual(baz)
self.assertEqual(baz_primal, foo)
self.assertEqual(baz_tangent, None)

def test_nested_level(self):
with fwAD.dual_level() as level:
# For now only level 0 exists
self.assertEqual(level, 0)

with fwAD.dual_level():
with self.assertRaisesRegex(RuntimeError, "Nested forward mode AD is not supported at the moment"):
nest_level = fwAD.enter_dual_level()

def test_print(self):
with fwAD.dual_level() as level:
a = torch.rand(3)
self.assertFalse("tangent=" in str(a))

b = fwAD.make_dual(a, torch.rand(3))
self.assertFalse("tangent=" in str(a))
self.assertTrue("tangent=" in str(b))

b_primal, b_tangent = fwAD.unpack_dual(b)
self.assertFalse("tangent=" in str(b_primal))
self.assertFalse("tangent=" in str(b_tangent))

def test_basic_packing_unpacking(self):
foo = torch.rand(2)
bar = torch.rand(2)

with fwAD.dual_level():
baz = fwAD.make_dual(foo, bar)
baz_primal, baz_tangent = fwAD.unpack_dual(baz)
self.assertEqual(baz_primal, foo)
self.assertIs(baz_tangent, bar)

# Check that packing/unpacking did not change the input
foo_primal, foo_tangent = fwAD.unpack_dual(foo)
self.assertEqual(foo_primal, foo)
self.assertIsNone(foo_tangent)

def test_advanced_packing_unpacking(self):
foo = torch.rand(2)
bar = torch.ones(2)

# Memory and version counter check
with fwAD.dual_level():
dual = fwAD.make_dual(foo, bar)

# Ensure that they are sharing memory and version counter
self.assertEqual(dual.storage().data_ptr(), foo.storage().data_ptr())

# Ensure we properly share the version counter
self.assertEqual(foo._version, dual._version)
foo.add_(1)
self.assertEqual(foo._version, dual._version)

# Unpacking should only create aliases as well
dual_primal, dual_tangent = fwAD.unpack_dual(dual)
self.assertEqual(dual_primal.storage().data_ptr(), foo.storage().data_ptr())
self.assertEqual(dual_tangent.storage().data_ptr(), bar.storage().data_ptr())
# And the tangent is actually re-used as-is so it is still the same Tensor
self.assertIs(dual_tangent, bar)

# Ensure we properly share the version counter
self.assertEqual(foo._version, dual_primal._version)
foo.add_(1)
self.assertEqual(foo._version, dual_primal._version)
self.assertEqual(bar._version, dual_tangent._version)
bar.add_(1)
self.assertEqual(bar._version, dual_tangent._version)

# backward mode check
with fwAD.dual_level():
foo.requires_grad_()
bar.requires_grad_()

# Check that backward gradients properly propagate through packing/unpacking
dual = fwAD.make_dual(foo, bar)
p, t = fwAD.unpack_dual(dual)

gfoo, gbar = torch.autograd.grad(p.sum(), (foo, bar), retain_graph=True, allow_unused=True)
self.assertEqual(gfoo, torch.ones_like(foo))
self.assertIsNone(gbar)

gfoo, gbar = torch.autograd.grad(t.sum(), (foo, bar), retain_graph=True, allow_unused=True)
self.assertIsNone(gfoo)
self.assertEqual(gbar, torch.ones_like(bar))

# Check that forward gradients are not impacted by detach
detached_dual = dual.detach()
out = detached_dual * 2
p, t = fwAD.unpack_dual(out)
self.assertFalse(p.requires_grad)
self.assertFalse(t.requires_grad)
self.assertEqual(p, foo * 2)
self.assertEqual(t, bar * 2)

# Check that forward gradients are not impacted by no_grad
with torch.no_grad():
out = dual * 3
p, t = fwAD.unpack_dual(out)
self.assertFalse(p.requires_grad)
self.assertFalse(t.requires_grad)
self.assertEqual(p, foo * 3)
self.assertEqual(t, bar * 3)

# Check that forward gradients are not impacted by inplace detach
dual = dual.clone()
dual.detach_()
out = dual * 2
p, t = fwAD.unpack_dual(out)
self.assertFalse(p.requires_grad)
self.assertFalse(t.requires_grad)
self.assertEqual(p, foo * 2)
self.assertEqual(t, bar * 2)

def test_view_inplace_non_differentiable_views(self):
original_foo = torch.rand(2)
original_bar = torch.ones(2)

# Do clones to be able to compare the values updated inplace
# with the original content of these Tensors
foo = original_foo.clone()
bar = original_bar.clone()

with fwAD.dual_level():
# Note that in this test, we use "update" to mean computing the right tangent for the dual
# All the inplace operations here are expected to update the primal value of the Tensors but
# not always their tangents.
# Also, all mentions of "non differentiable view" here mean non forward differentiable view
# unless specified otherwise.
# See note [Forward Grad View/inplace] for more details on how these views work.

# Check that inplace ops do not update non-differentiable views
# Non differentiable view
dual = fwAD.make_dual(foo, bar)
dual *= 2
# Check that non differentiable view's tangent was not updated
self.assertIsNone(fwAD.unpack_dual(foo)[1])
# Check that the computed result is correct
self.assertEqual(bar, original_bar * 2)
self.assertEqual(fwAD.unpack_dual(dual)[1], original_bar * 2)
self.assertEqual(foo, original_foo * 2)
self.assertEqual(fwAD.unpack_dual(dual)[0], original_foo * 2)
# Other non differentiable view
dual_primal, dual_tangent = fwAD.unpack_dual(dual)
self.assertIsNone(fwAD.unpack_dual(dual_primal)[1])
self.assertIsNone(fwAD.unpack_dual(dual_tangent)[1])
dual_primal *= 2
# Ensure dual's tangent did not change
self.assertEqual(fwAD.unpack_dual(dual)[0], original_foo * 4)
self.assertEqual(fwAD.unpack_dual(dual)[1], original_bar * 2)
dual_tangent *= 2
# Ensure dual's primal did not change
self.assertEqual(fwAD.unpack_dual(dual)[0], original_foo * 4)
self.assertEqual(fwAD.unpack_dual(dual)[1], original_bar * 4)


def test_view_inplace_differentiable_views(self):
original_foo = torch.rand(2)
original_bar = torch.ones(2)

# Do clones to be able to compare the values updated inplace
# with the original content of these Tensors
foo = original_foo.clone()
bar = original_bar.clone()

with fwAD.dual_level():
# Check that inplace ops do update differentiable view but stop at non differentiable ones
# A non differentiable view
dual = fwAD.make_dual(foo, bar)
# A differentiable view
view = dual.narrow(0, 0, 1)
view *= 2
# Check that non differentiable view was not updated
self.assertIsNone(fwAD.unpack_dual(foo)[1])
# Check that differentiable view was updated
self.assertEqual(fwAD.unpack_dual(dual)[1], torch.tensor([2., 1.]))
self.assertEqual(fwAD.unpack_dual(view)[1], torch.tensor([2.]))

# Check that we track differentiable view even for Tensors that are not dual
baz = torch.rand(2)
baz += dual
self.assertEqual(fwAD.unpack_dual(baz)[1], fwAD.unpack_dual(dual)[1])
# Updates through a view should be tracked as well
baz = torch.rand(2)
baz[0] = dual[0]
self.assertEqual(fwAD.unpack_dual(baz)[1][0], fwAD.unpack_dual(dual)[1][0])
# Unused values get a gradient of 0
self.assertEqual(fwAD.unpack_dual(baz)[1][1], 0.)

# Check that backward non-differentiable views don't prevent gradient update
baz = torch.rand(2)
view = baz.detach()
view += dual
self.assertEqual(fwAD.unpack_dual(baz)[1], fwAD.unpack_dual(dual)[1])

def test_grad_cleanup(self):
foo = torch.rand(2)
bar = torch.rand(2)
baz = torch.rand(2)

with fwAD.dual_level():
dual = fwAD.make_dual(foo, bar)
self.assertIsNone(fwAD.unpack_dual(foo)[1])
self.assertIs(fwAD.unpack_dual(dual)[1], bar)

self.assertIsNone(fwAD.unpack_dual(dual)[1])

with fwAD.dual_level():
self.assertIsNone(fwAD.unpack_dual(foo)[1])
new_dual = fwAD.make_dual(foo, baz)

dual_primal, dual_tangent = fwAD.unpack_dual(dual)
new_dual_primal, new_dual_tangent = fwAD.unpack_dual(new_dual)
self.assertEqual(dual_primal, new_dual_primal)
self.assertIsNone(dual_tangent)
self.assertEqual(new_dual_tangent, baz)


# Generic device type autograd tests.
class TestAutogradDeviceType(TestCase):
34 changes: 0 additions & 34 deletions tools/autograd/derivatives.yaml
@@ -97,24 +97,6 @@
# like 'grad_output', and (2) the gradient to multiply with is always
# called 'grad' (even though it really is a grad-grad).
#
# You can also add a forward derivative definition by defining a formula for
# a returned value (in general "result" if the name is not specified). This
# formula works the same way as the backward one and advanced implementations
# should also be placed in the FunctionsManual file.
# This formula should compute a single Jacobian vector product using the (primal)
# value of the argument "foo_p", its forward grad "foo_t" and the result of the
# function as "result".
# Note that the forward derivative can be automatically generated in two cases:
# - if your function is linear (NOT affine or multi-linear), then you can
# specify so by just using the string "auto_linear" for the formula.
# - if your function is applied element wise (and has a single input), you
# can specify so by just using the string "auto_element_wise" for the formula.
#
# Note that to avoid unpacking overhead, functions taking TensorList as inputs
# will always have their forward grad formula called. This function is responsible
# for checking whether any computation is needed and should return an undefined Tensor when
# there is nothing to do. You can check "cat_forward" for a full example.
#
# NB: There are a number of gradient definitions in here which are bogus
# (implemented using zeros_like). These gradients are (hopefully) not
# used by our frontend. You MUST check the frontend code; search for
@@ -178,20 +160,13 @@
# in Declarations.yaml
- name: abs(Tensor self) -> Tensor
self: grad * self.sgn()
result: auto_element_wise

- name: acos(Tensor self) -> Tensor
self: grad * -((-self * self + 1).rsqrt()).conj()

- name: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
self: handle_r_to_c(self.scalar_type(), grad)
other: handle_r_to_c(other.scalar_type(), maybe_multiply(grad, alpha.conj()))
result: self_t + maybe_multiply(other_t, alpha)

- name: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
self: handle_r_to_c(self.scalar_type(), grad)
other: handle_r_to_c(other.scalar_type(), maybe_multiply(grad, alpha.conj()))
result: self_t.add_(maybe_multiply(other_t, alpha))

- name: add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
self: handle_r_to_c(self.scalar_type(), grad)
@@ -341,7 +316,6 @@

- name: clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor
self: grad
result: auto_linear

- name: _coalesce(Tensor self) -> Tensor
self: grad
@@ -816,12 +790,6 @@
- name: mul.Tensor(Tensor self, Tensor other) -> Tensor
self: mul_tensor_backward(grad, other, self.scalar_type())
other: mul_tensor_backward(grad, self, other.scalar_type())
result: other_t * self_p.conj() + self_t * other_p.conj()

- name: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
self: mul_tensor_backward(grad, other, self.scalar_type())
other: mul_tensor_backward(grad, self, other.scalar_type())
result: self_t.mul_(other_p.conj()).add_(other_t * (self_p / other_p).conj())

- name: mul.Scalar(Tensor self, Scalar other) -> Tensor
self: mul_tensor_backward(grad, at::scalar_to_tensor(other), self.scalar_type())
@@ -1010,7 +978,6 @@

- name: select.int(Tensor(a) self, int dim, int index) -> Tensor(a)
self: select_backward(grad, self.sizes(), dim, index)
result: auto_linear

- name: sigmoid(Tensor self) -> Tensor
self: sigmoid_backward(grad, result)
@@ -1035,7 +1002,6 @@

- name: slice.Tensor(Tensor(a) self, int dim=0, int? start=0, int? end=9223372036854775807, int step=1) -> Tensor(a)
self: slice_backward_wrapper(grad, self.sizes(), dim, start, end, step)
result: auto_linear

- name: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet)
self: slogdet_backward(grad, self, sign, logabsdet)
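
For context, the two entries below restate (verbatim from the removed lines above in this diff) how the forward-derivative syntax described in the comment block at the top of derivatives.yaml works: the result: field holds the Jacobian-vector product written in terms of primals (self_p, other_p) and tangents (self_t, other_t), or one of the auto_* shorthands. This is an illustrative sketch only; the inline comments are added here and are not part of the original file.

- name: abs(Tensor self) -> Tensor
  self: grad * self.sgn()                          # backward formula, multiplies the incoming grad
  result: auto_element_wise                        # forward formula generated automatically for element-wise ops

- name: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
  self: handle_r_to_c(self.scalar_type(), grad)
  other: handle_r_to_c(other.scalar_type(), maybe_multiply(grad, alpha.conj()))
  result: self_t + maybe_multiply(other_t, alpha)  # explicit JVP: tangent of (self + alpha * other)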