Improve c2-aten tensor interop and add proper testing (#15860)
Summary:
Pull Request resolved: pytorch/pytorch#15860

A few changes (which are hard to split into separate diffs, so they come together):
- make the conversions explicit (they can throw, so explicitness avoids surprises); see the sketch below
- fix the legacy type dispatch not being initialized when a tensor is created on the C2 side
- add a set of invariants and enforce them

Reviewed By: ezyang

Differential Revision: D13596031

fbshipit-source-id: d20b601e06ba47aeff2f6e8e15769840e2d46108
Dmytro Dzhulgakov authored and facebook-github-bot committed Jan 29, 2019
1 parent 9d6be6a commit 5e21e0f
Showing 12 changed files with 450 additions and 70 deletions.
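
Before the per-file diffs, a rough sketch of what the now-explicit conversions look like from the caller's side. This is a minimal illustration based on the tests added below, with the CPU device substituted for brevity; the exact CPU setup is an assumption, not part of this commit.

#include <ATen/ATen.h>
#include <caffe2/core/tensor.h>

int main() {
  // Caffe2 -> ATen: the conversion is explicit and may throw if the tensor
  // violates at::Tensor's invariants (see enforce_invariants below).
  caffe2::Tensor c2_tensor =
      caffe2::empty({4, 4}, at::dtype<int64_t>().device(caffe2::CPU));
  c2_tensor.mutable_data<int64_t>();  // materialize dtype and storage
  at::Tensor at_tensor(c2_tensor);

  // ATen -> Caffe2: also explicit; both wrappers share the same TensorImpl.
  caffe2::Tensor back(at_tensor);
  return back.numel() == 16 ? 0 : 1;
}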
aten/src/ATen/core/LegacyTypeDispatch.h (14 additions, 3 deletions)
@@ -176,9 +176,20 @@ struct CAFFE2_API AutoNonVariableTypeMode {
 inline Type& legacyTensorType(const TensorImpl& tensor) {
   // NB: It's valid to use getTypeRaw here, because the TensorImpl
   // could not have been created without initializing the Type first.
-  // TODO: This is not actually true via the Caffe2 codepath! Make
-  // it so.
-  return *globalLegacyTypeDispatch().getTypeRaw(tensorTypeIdToBackend(tensor.type_id()), typeMetaToScalarType(tensor.dtype()), tensor.is_variable() && !at::NonVariableTypeMode::is_enabled());
+  // NB: This is not actually true via the Caffe2 codepath! But we call
+  // initializeLegacyTypeDispatchFor in the right place.
+  return *globalLegacyTypeDispatch().getTypeRaw(
+      tensorTypeIdToBackend(tensor.type_id()),
+      typeMetaToScalarType(tensor.dtype()),
+      tensor.is_variable() && !at::NonVariableTypeMode::is_enabled());
 }
+
+inline void initializeLegacyTypeDispatchFor(const TensorImpl& tensor) {
+  // getType calls the right initialization
+  globalLegacyTypeDispatch().getType(
+      tensorTypeIdToBackend(tensor.type_id()),
+      typeMetaToScalarType(tensor.dtype()),
+      tensor.is_variable() && !at::NonVariableTypeMode::is_enabled());
+}
 
 } // namespace at
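
To make the initialization contract concrete, a hedged sketch of the intended call order (illustrative only; the helper name use_after_init is not part of the diff):

#include <ATen/core/LegacyTypeDispatch.h>

// Illustrative: for a TensorImpl that arrived from Caffe2, run the
// initialization before any legacyTensorType lookup; enforce_invariants()
// (added in Tensor.cpp below) does exactly this.
void use_after_init(const at::TensorImpl& impl) {
  at::initializeLegacyTypeDispatchFor(impl);   // getType registers the Type
  at::Type& type = at::legacyTensorType(impl); // getTypeRaw is now safe
  (void)type;
}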
aten/src/ATen/core/Tensor.cpp (21 additions)
@@ -6,6 +6,27 @@

namespace at {

void Tensor::enforce_invariants() {
  if (impl_.get() == nullptr) {
    throw std::runtime_error("TensorImpl with nullptr is not supported");
  }
  // The following line throws if the dtype is not a POD data type or is not
  // supported by ATen
  scalar_type();
  if (defined()) {
    AT_ASSERTM(
        impl_->dtype_initialized(),
        "Partially-initialized tensor not supported by at::Tensor");
    AT_ASSERTM(
        impl_->storage_initialized(),
        "Partially-initialized tensor not supported by at::Tensor");
    // Ensure LegacyTypeDispatch is initialized. In ATen it's done in tensor
    // factory functions, but when we get a tensor from Caffe2 we might bypass
    // those factory functions.
    initializeLegacyTypeDispatchFor(*impl_);
  }
}

void Tensor::print() const {
  if (defined()) {
    std::cerr << "[" << type().toString() << " " << sizes() << "]" << std::endl;
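
For contrast, a minimal sketch of the kind of tensor enforce_invariants is meant to reject (the test name and CPU setup are assumptions, not part of this commit):

#include <gtest/gtest.h>
#include <ATen/ATen.h>
#include <caffe2/core/tensor.h>

TEST(Caffe2ToPytorch, PartiallyInitialized) {
  // Shape is set, but mutable_data<T>() was never called, so neither
  // dtype nor storage is initialized; the explicit conversion must throw.
  caffe2::Tensor c2_tensor(caffe2::CPU);
  c2_tensor.Resize(4, 4);
  ASSERT_ANY_THROW(at::Tensor at_tensor(c2_tensor));
}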
aten/src/ATen/core/Tensor.h (21 additions, 6 deletions)
@@ -46,20 +46,34 @@ using TensorList = ArrayRef<Tensor>;
 // Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and
 // special care must be taken to handle this.
 class CAFFE2_API Tensor {
-public:
+ public:
   Tensor(){};
-  Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl)
+  // This constructor should not be used by end users and is an implementation
+  // detail invoked by autogenerated code.
+  explicit Tensor(
+      c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl)
       : impl_(std::move(tensor_impl)) {
     if (impl_.get() == nullptr) {
-      throw std::runtime_error("TensorBaseImpl with nullptr not supported");
+      throw std::runtime_error("TensorImpl with nullptr is not supported");
     }
   }

   Tensor(const Tensor&) = default;
   Tensor(Tensor&&) = default;

-  explicit Tensor(C10Tensor tensor)
-      : impl_(std::move(tensor).impl()) {}
+ public:
+  // Creates a new wrapper from TensorImpl. Intentionally a free method because
+  // it should be used with care. Checks necessary invariants
+  static Tensor wrap_tensor_impl(
+      c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl) {
+    Tensor r(std::move(tensor_impl));
+    r.enforce_invariants();
+    return r;
+  }
+
+  explicit Tensor(C10Tensor tensor) : impl_(std::move(tensor).impl()) {
+    enforce_invariants();
+  }

   explicit operator C10Tensor() const & {
     return C10Tensor(impl_);
@@ -708,6 +722,7 @@ class CAFFE2_API Tensor {
   friend struct WeakTensor;

 protected:
+  void enforce_invariants();
   c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> impl_;
 };
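
A brief usage sketch for the new checked entry point, re-wrapping an existing tensor's impl purely for illustration (getIntrusivePtr() is the accessor assumed here):

#include <ATen/ATen.h>

int main() {
  at::Tensor src = at::ones({2, 2});
  // wrap_tensor_impl runs enforce_invariants(), unlike the raw
  // intrusive_ptr constructor, which is reserved for autogenerated code.
  at::Tensor checked = at::Tensor::wrap_tensor_impl(src.getIntrusivePtr());
  return checked.defined() ? 0 : 1;
}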
aten/src/ATen/templates/Tensor.h (21 additions, 6 deletions)
@@ -46,20 +46,34 @@ using TensorList = ArrayRef<Tensor>;
 // Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and
 // special care must be taken to handle this.
 class CAFFE2_API Tensor {
-public:
+ public:
   Tensor(){};
-  Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl)
+  // This constructor should not be used by end users and is an implementation
+  // detail invoked by autogenerated code.
+  explicit Tensor(
+      c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl)
       : impl_(std::move(tensor_impl)) {
     if (impl_.get() == nullptr) {
-      throw std::runtime_error("TensorBaseImpl with nullptr not supported");
+      throw std::runtime_error("TensorImpl with nullptr is not supported");
     }
   }

   Tensor(const Tensor&) = default;
   Tensor(Tensor&&) = default;

-  explicit Tensor(C10Tensor tensor)
-      : impl_(std::move(tensor).impl()) {}
+ public:
+  // Creates a new wrapper from TensorImpl. Intentionally a free method because
+  // it should be used with care. Checks necessary invariants
+  static Tensor wrap_tensor_impl(
+      c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl) {
+    Tensor r(std::move(tensor_impl));
+    r.enforce_invariants();
+    return r;
+  }
+
+  explicit Tensor(C10Tensor tensor) : impl_(std::move(tensor).impl()) {
+    enforce_invariants();
+  }

   explicit operator C10Tensor() const & {
     return C10Tensor(impl_);
@@ -313,6 +327,7 @@ class CAFFE2_API Tensor {
   friend struct WeakTensor;

 protected:
+  void enforce_invariants();
   c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> impl_;
 };
aten/src/ATen/test/CMakeLists.txt (2 additions, 1 deletion)
@@ -29,7 +29,8 @@ list(APPEND ATen_CUDA_TEST_SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/cuda_stream_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/cuda_half_test.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/cuda_optional_test.cu
-  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_packedtensoraccessor_test.cu)
+  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_packedtensoraccessor_test.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_tensor_interop_test.cpp)
 if (CUDNN_FOUND)
   list(APPEND ATen_CUDA_TEST_SRCS
     ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cudnn_test.cpp)
aten/src/ATen/test/cuda_tensor_interop_test.cpp (164 additions, new file)
@@ -0,0 +1,164 @@
#include "gtest/gtest.h"

#include "ATen/ATen.h"
#include <ATen/cuda/CUDAContext.h>
#include <caffe2/core/init.h>
#include <caffe2/core/operator.h>
#include <caffe2/core/context_gpu.h>
#include <caffe2/utils/math.h>

// dumbest possible copies
template<typename T>
T cuda_get(T* addr) {
T result;
CUDA_ENFORCE(cudaMemcpy(&result, addr, sizeof(T), cudaMemcpyDefault));
return result;
}

template<typename T>
void cuda_set(T* addr, T value) {
CUDA_ENFORCE(cudaMemcpy(addr, &value, sizeof(T), cudaMemcpyDefault));
}

TEST(CUDACaffe2ToPytorch, SimpleLegacy) {
if (!at::cuda::is_available()) return;
caffe2::Tensor c2_tensor(caffe2::CUDA);
c2_tensor.Resize(4, 4);
auto data = c2_tensor.mutable_data<int64_t>();
{
caffe2::CUDAContext context;
caffe2::math::Set<int64_t>(16, 777, data, &context);
}
at::Tensor at_tensor(c2_tensor);
ASSERT_TRUE(&at_tensor.type() != nullptr);
ASSERT_TRUE(at_tensor.is_cuda());

auto at_cpu = at_tensor.cpu();
auto it = at_cpu.data<int64_t>();
for (int64_t i = 0; i < 16; i++) {
ASSERT_EQ(it[i], 777);
}
}

TEST(CUDACaffe2ToPytorch, Simple) {
if (!at::cuda::is_available()) return;
caffe2::Tensor c2_tensor =
caffe2::empty({4, 4}, at::dtype<int64_t>().device(caffe2::CUDA));
auto data = c2_tensor.mutable_data<int64_t>();
{
caffe2::CUDAContext context;
caffe2::math::Set<int64_t>(16, 777, data, &context);
}
at::Tensor at_tensor(c2_tensor);
ASSERT_TRUE(&at_tensor.type() != nullptr);
ASSERT_TRUE(at_tensor.is_cuda());

auto at_cpu = at_tensor.cpu();
auto it = at_cpu.data<int64_t>();
for (int64_t i = 0; i < 16; i++) {
ASSERT_EQ(it[i], 777);
}
}

TEST(CUDACaffe2ToPytorch, Op) {
if (!at::cuda::is_available()) return;
caffe2::Tensor c2_tensor =
caffe2::empty({3, 3}, at::dtype<int64_t>().device(caffe2::CUDA));
auto data = c2_tensor.mutable_data<int64_t>();
{
caffe2::CUDAContext context;
caffe2::math::Set<int64_t>(9, 111, data, &context);
}
at::Tensor at_tensor(c2_tensor);
ASSERT_TRUE(at_tensor.is_cuda());

ASSERT_EQ(at::sum(at_tensor).item<int64_t>(), 999);
}

TEST(CUDAPytorchToCaffe2, Op) {
if (!at::cuda::is_available()) return;
caffe2::Workspace workspace;
caffe2::NetDef net;

auto at_tensor_a = at::ones({5, 5}, at::dtype(at::kFloat).device(at::kCUDA));
auto at_tensor_b = at::ones({5, 5}, at::dtype(at::kFloat).device(at::kCUDA));
auto at_tensor_c = at::ones({5, 5}, at::dtype(at::kFloat).device(at::kCUDA));

auto* c2_tensor_a = BlobSetTensor(workspace.CreateBlob("a"), caffe2::Tensor(at_tensor_a));
auto* c2_tensor_b = BlobSetTensor(workspace.CreateBlob("b"), caffe2::Tensor(at_tensor_b));

// Test Alias
{
caffe2::Tensor c2_tensor_from_aten(at_tensor_c);
BlobSetTensor(workspace.CreateBlob("c"), c2_tensor_from_aten.Alias());
}

{
auto op = net.add_op();
op->set_type("Sum");
op->add_input("a");
op->add_input("b");
op->add_input("c");
op->add_output("d");
op->mutable_device_option()->set_device_type(caffe2::PROTO_CUDA);
}

workspace.RunNetOnce(net);

const auto& result = workspace.GetBlob("d")->Get<caffe2::Tensor>();
ASSERT_EQ(result.GetDeviceType(), caffe2::CUDA);

auto data = result.data<float>();
for (int64_t i = 0; i < 25; i++) {
ASSERT_EQ(cuda_get(data + i), 3.0);
}
at::Tensor at_result(result);
ASSERT_TRUE(at_result.is_cuda());
ASSERT_EQ(at::sum(at_result).item<float>(), 75);
}

TEST(CUDAPytorchToCaffe2, SharedStorageWrite) {
if (!at::cuda::is_available()) return;
auto at_tensor_a = at::ones({5, 5}, at::dtype(at::kFloat).device(at::kCUDA));
auto at_tensor_b = at_tensor_a.view({25});

caffe2::Tensor c2_tensor_a(at_tensor_a);
caffe2::Tensor c2_tensor_b(at_tensor_b);

// change is visible everywhere
cuda_set<float>(c2_tensor_a.mutable_data<float>() + 1, 123);
ASSERT_EQ(cuda_get(c2_tensor_b.mutable_data<float>() + 1), 123);
ASSERT_EQ(at_tensor_a[0][1].item().to<float>(), 123);
ASSERT_EQ(at_tensor_b[1].item().to<float>(), 123);
}

TEST(CUDAPytorchToCaffe2, MutualResizes) {
if (!at::cuda::is_available()) return;
auto at_tensor = at::ones({5, 5}, at::dtype(at::kFloat).device(at::kCUDA));

caffe2::Tensor c2_tensor(at_tensor);

// change is visible
cuda_set<float>(c2_tensor.mutable_data<float>(), 123);
ASSERT_EQ(at_tensor[0][0].item().to<float>(), 123);

// resize PT tensor in smaller direction - storage is preserved
at_tensor.resize_({4, 4});
cuda_set<float>(c2_tensor.mutable_data<float>() + 1, 234);
ASSERT_EQ(at_tensor[0][1].item().to<float>(), 234);

// resize PT tensor in larger direction - storage is preserved
at_tensor.resize_({6, 6});
cuda_set<float>(c2_tensor.mutable_data<float>() + 2, 345);
ASSERT_EQ(at_tensor[0][2].item().to<float>(), 345);
ASSERT_EQ(c2_tensor.sizes()[0], 6);
ASSERT_EQ(c2_tensor.sizes()[1], 6);

// resize Caffe2 tensor - semantics are to NOT preserve the data, but the
// TensorImpl is still shared
c2_tensor.Resize(7, 7);
cuda_set<float>(c2_tensor.mutable_data<float>() + 3, 456);
ASSERT_EQ(at_tensor[0][3].item().to<float>(), 456);
ASSERT_EQ(at_tensor.sizes()[0], 7);
ASSERT_EQ(at_tensor.sizes()[1], 7);
}