diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp
index a52ea72d915d16..7a6eccc82b8725 100644
--- a/aten/src/ATen/EmptyTensor.cpp
+++ b/aten/src/ATen/EmptyTensor.cpp
@@ -19,6 +19,22 @@ void check_size_nonnegative(IntArrayRef size) {
   }
 }
 
+size_t computeStorageNbytes(
+    IntArrayRef sizes,
+    IntArrayRef strides,
+    size_t itemsize_bytes) {
+  // size of the underlying storage is 1 bigger than the offset
+  // of the last element according to stride
+  size_t size = 1;
+  for (const auto i : c10::irange(sizes.size())) {
+    if(sizes[i] == 0) {
+      return 0;
+    }
+    size += strides[i]*(sizes[i]-1);
+  }
+  return size * itemsize_bytes;
+}
+
 TensorBase empty_generic(
     IntArrayRef size,
     c10::Allocator* allocator,
@@ -54,6 +70,29 @@ TensorBase empty_generic(
   return tensor;
 }
 
+TensorBase empty_strided_generic(
+    IntArrayRef size,
+    IntArrayRef stride,
+    c10::Allocator* allocator,
+    c10::DispatchKeySet ks,
+    ScalarType scalar_type) {
+  at::detail::check_size_nonnegative(size);
+
+  caffe2::TypeMeta dtype = scalarTypeToTypeMeta(scalar_type);
+  int64_t size_bytes = computeStorageNbytes(size, stride, dtype.itemsize());
+  auto storage_impl = c10::make_intrusive<StorageImpl>(
+      c10::StorageImpl::use_byte_size_t(),
+      size_bytes,
+      allocator->allocate(size_bytes),
+      allocator,
+      /*resizeable=*/true);
+
+  auto tensor = detail::make_tensor_base<TensorImpl>(
+      std::move(storage_impl), ks, dtype);
+  tensor.unsafeGetTensorImpl()->set_sizes_and_strides(size, stride);
+  return tensor;
+}
+
 TensorBase empty_cpu(IntArrayRef size, ScalarType dtype, bool pin_memory,
                      c10::optional<c10::MemoryFormat> memory_format_opt) {
   auto allocator = GetCPUAllocatorMaybePinned(pin_memory);
@@ -88,4 +127,41 @@ TensorBase empty_cpu(
       options.memory_format_opt());
 }
 
+TensorBase empty_strided_cpu(IntArrayRef size, IntArrayRef stride,
+                             ScalarType dtype, bool pin_memory) {
+  auto allocator = at::detail::GetCPUAllocatorMaybePinned(pin_memory);
+  constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU);
+  return at::detail::empty_strided_generic(
+      size, stride, allocator, cpu_ks, dtype);
+}
+
+TensorBase empty_strided_cpu(
+    IntArrayRef size,
+    IntArrayRef stride,
+    c10::optional<ScalarType> dtype_opt,
+    c10::optional<Layout> layout_opt,
+    c10::optional<Device> device_opt,
+    c10::optional<bool> pin_memory_opt) {
+  auto device = device_or_default(device_opt);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device.type() == DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided);
+
+  auto pin_memory = pinned_memory_or_default(pin_memory_opt);
+  auto dtype = dtype_or_default(dtype_opt);
+  return at::detail::empty_strided_cpu(size, stride, dtype, pin_memory);
+}
+
+TensorBase empty_strided_cpu(
+    IntArrayRef size,
+    IntArrayRef stride,
+    const TensorOptions &options) {
+  return at::detail::empty_strided_cpu(
+      size,
+      stride,
+      optTypeMetaToScalarType(options.dtype_opt()),
+      options.layout_opt(),
+      options.device_opt(),
+      options.pinned_memory_opt());
+}
+
 }} // namespace at::detail
diff --git a/aten/src/ATen/EmptyTensor.h b/aten/src/ATen/EmptyTensor.h
index 0f2bc0c63ea073..f08374f5f4dbf0 100644
--- a/aten/src/ATen/EmptyTensor.h
+++ b/aten/src/ATen/EmptyTensor.h
@@ -5,6 +5,8 @@ namespace at {
 namespace detail {
 
 TORCH_API void check_size_nonnegative(IntArrayRef size);
+TORCH_API size_t computeStorageNbytes(
+    IntArrayRef sizes, IntArrayRef strides, size_t itemsize);
 
 TORCH_API TensorBase empty_generic(
     IntArrayRef size,
@@ -13,6 +15,13 @@ TORCH_API TensorBase empty_generic(
     ScalarType scalar_type,
     c10::optional<c10::MemoryFormat> memory_format_opt);
 
+TORCH_API TensorBase empty_strided_generic(
+    IntArrayRef size,
+    IntArrayRef stride,
+    c10::Allocator* allocator,
+    c10::DispatchKeySet ks,
+    ScalarType scalar_type);
+
 TORCH_API TensorBase empty_cpu(
     IntArrayRef size,
     ScalarType dtype,
@@ -31,4 +40,23 @@ TORCH_API TensorBase empty_cpu(
     IntArrayRef size,
     const TensorOptions &options);
 
+TORCH_API TensorBase empty_strided_cpu(
+    IntArrayRef size,
+    IntArrayRef stride,
+    ScalarType dtype,
+    bool pin_memory=false);
+
+TORCH_API TensorBase empty_strided_cpu(
+    IntArrayRef size,
+    IntArrayRef stride,
+    c10::optional<ScalarType> dtype_opt,
+    c10::optional<Layout> layout_opt,
+    c10::optional<Device> device_opt,
+    c10::optional<bool> pin_memory_opt);
+
+TORCH_API TensorBase empty_strided_cpu(
+    IntArrayRef size,
+    IntArrayRef stride,
+    const TensorOptions &options);
+
 }} // namespace at::detail
diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp
index 3426bff7b4b8da..754c73bb6154ce 100644
--- a/aten/src/ATen/TensorUtils.cpp
+++ b/aten/src/ATen/TensorUtils.cpp
@@ -317,22 +317,6 @@ std::vector<int64_t> defaultStrides(IntArrayRef sizes) {
   return strides;
 }
 
-size_t computeStorageNbytes(
-    IntArrayRef sizes,
-    IntArrayRef strides,
-    size_t itemsize_bytes) {
-  // size of the underlying storage is 1 bigger than the offset
-  // of the last element according to stride
-  size_t size = 1;
-  for (const auto i : c10::irange(sizes.size())) {
-    if(sizes[i] == 0) {
-      return 0;
-    }
-    size += strides[i]*(sizes[i]-1);
-  }
-  return size * itemsize_bytes;
-}
-
 // On a high level,
 // 1. separate `oldshape` into chunks of dimensions, where the dimensions are
 //    ``contiguous'' in each chunk, i.e., oldstride[i] = oldshape[i+1] *
diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h
index 1417174a1f6d3b..f018c33f1aeae4 100644
--- a/aten/src/ATen/TensorUtils.h
+++ b/aten/src/ATen/TensorUtils.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include
+#include <ATen/EmptyTensor.h>
 #include
 #include
 #include
@@ -152,8 +153,6 @@ TORCH_API void check_dim_size(
 
 namespace detail {
 TORCH_API std::vector<int64_t> defaultStrides(IntArrayRef sizes);
-TORCH_API size_t
-computeStorageNbytes(IntArrayRef sizes, IntArrayRef strides, size_t itemsize);
 
 TORCH_API c10::optional<std::vector<int64_t>> computeStride(
     IntArrayRef oldshape,
diff --git a/aten/src/ATen/native/MetaTensor.cpp b/aten/src/ATen/native/MetaTensor.cpp
index 58e58044fe7baf..2dbf292dc755d1 100644
--- a/aten/src/ATen/native/MetaTensor.cpp
+++ b/aten/src/ATen/native/MetaTensor.cpp
@@ -60,12 +60,20 @@ Tensor empty_strided_meta(
   c10::optional<Device> device_opt,
   c10::optional<bool> pin_memory_opt
 ) {
-  auto t = at::native::empty_meta({0}, dtype_opt, layout_opt, device_opt, pin_memory_opt);
-  // Amazingly the CPU implementation will work for us, because most of resize
-  // is generic except the memcpy, but the memcpy will be skipped if the source
-  // storage is nullptr (which it always is, for meta tensors)
-  at::native::resize_impl_cpu_(t.unsafeGetTensorImpl(), size, stride);
-  return t;
+  auto device = device_or_default(device_opt);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device.type() == DeviceType::Meta);
+  // NB: because there is no SparseMeta (yet), non-strided layout is
+  // exerciseable
+  TORCH_CHECK_NOT_IMPLEMENTED(
+    layout_or_default(layout_opt) == Layout::Strided,
+    "strided meta tensors not supported yet"
+  );
+
+  auto* allocator = GetMetaAllocator();
+  auto dtype = dtype_or_default(dtype_opt);
+  constexpr c10::DispatchKeySet meta_ks(c10::DispatchKey::Meta);
+  return at::detail::empty_strided_generic(
+    size, stride, allocator, meta_ks, dtype);
 }
 
 } // namespace native
diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp
index 9a360b2179ef84..c1593d3693328e 100644
--- a/aten/src/ATen/native/TensorFactories.cpp
+++ b/aten/src/ATen/native/TensorFactories.cpp
@@ -201,10 +201,7 @@ Tensor empty(
 
 Tensor empty_strided_cpu(IntArrayRef size, IntArrayRef stride, c10::optional<ScalarType> dtype_opt,
     c10::optional<Layout> layout_opt, c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt) {
-  check_size_nonnegative(size);
-  auto t = at::native::empty_cpu({0}, dtype_opt, layout_opt, device_opt, pin_memory_opt);
-  at::native::resize_impl_cpu_(t.unsafeGetTensorImpl(), size, stride);
-  return t;
+  return at::detail::empty_strided_cpu(size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt);
 }
 
 Tensor& empty_out(IntArrayRef size,
diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu
index 84c5f410981b40..790f812ab60280 100644
--- a/aten/src/ATen/native/cuda/TensorFactories.cu
+++ b/aten/src/ATen/native/cuda/TensorFactories.cu
@@ -1,4 +1,5 @@
 #include
+#include <ATen/EmptyTensor.h>
 #include
 #include
 #include
@@ -65,9 +66,13 @@ Tensor empty_cuda(IntArrayRef size, c10::optional<ScalarType> dtype_opt, c10::op
 }
 
 Tensor empty_strided_cuda(IntArrayRef size, IntArrayRef stride, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt, c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt) {
-  auto t = at::native::empty_cuda({0}, dtype_opt, layout_opt, device_opt, pin_memory_opt);
-  at::native::resize_impl_cuda_(t.unsafeGetTensorImpl(), size, stride);
-  return t;
+  TORCH_CHECK(device_or_default(device_opt).is_cuda());
+  TORCH_CHECK(!pin_memory_opt.has_value() || !*pin_memory_opt, "Only dense CPU tensors can be pinned");
+  auto* allocator = at::cuda::getCUDADeviceAllocator();
+  auto dtype = dtype_or_default(dtype_opt);
+  constexpr c10::DispatchKeySet cuda_ks(c10::DispatchKey::CUDA);
+  return at::detail::empty_strided_generic(
+      size, stride, allocator, cuda_ks, dtype);
 }
 
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triangle ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
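
A minimal usage sketch (not part of the patch) of the at::detail helpers introduced above; the shapes, strides, dtype, and the standalone main() below are illustrative assumptions, not taken from the PR:

#include <ATen/ATen.h>
#include <ATen/EmptyTensor.h>

int main() {
  // computeStorageNbytes: one element past the largest stride-weighted
  // offset, times the item size. For sizes {2, 3}, strides {3, 1} and
  // 4-byte floats: (1 + 3*(2-1) + 1*(3-1)) * 4 = 24 bytes.
  size_t nbytes = at::detail::computeStorageNbytes(
      {2, 3}, {3, 1}, sizeof(float));

  // empty_strided_cpu: allocate an uninitialized CPU tensor with an
  // explicit (here column-major) stride layout; its storage is sized by
  // the same computeStorageNbytes computation, so no resize is needed.
  at::TensorBase t = at::detail::empty_strided_cpu(
      {2, 3}, {1, 2}, at::kFloat, /*pin_memory=*/false);

  return (nbytes == 24 && t.strides()[0] == 1) ? 0 : 1;
}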