From 6e7c891680b71dc25c26fada66a4e2dbcff38aa7 Mon Sep 17 00:00:00 2001
From: Sai Praveen Bangaru
Date: Fri, 26 Apr 2024 16:41:53 -0400
Subject: [PATCH] Add logic for plain vector/matrix inputs to CUDA kernels

- Adds basic tests for functionality
- Adds wrapper logic to parse user input into storage types.
- Validates the row count (not only the row length) of matrix inputs.
---
NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch. The indentation of context lines,
the `<float>` argument of `TensorView` in the .slang file, and the missing
author e-mail on the From: line are assumptions - confirm against the
target tree before applying. The post-image hashes on the `index` lines
are stale because the added code was revised.

 slangtorch/util/builtin_wrappers.py | 87 ++++++++++++++++++++++++++++-
 tests/builtin-type-input.slang      | 37 ++++++++++++
 tests/test.py                       | 25 ++++++++
 3 files changed, 148 insertions(+), 1 deletion(-)
 create mode 100644 tests/builtin-type-input.slang

diff --git a/slangtorch/util/builtin_wrappers.py b/slangtorch/util/builtin_wrappers.py
index 9869e8c..511a556 100644
--- a/slangtorch/util/builtin_wrappers.py
+++ b/slangtorch/util/builtin_wrappers.py
@@ -1,4 +1,5 @@
 from collections import namedtuple
+import re
 import torch
 
 DiffTensorView = namedtuple('DiffTensorView', ['value', 'grad'], defaults=[None])
@@ -72,7 +73,91 @@ def accept_array(inp):
 
         return tuple, accept_array
 
+
+def make_vector_wrapper(module, typename, wrappedTypeMap, makeTypeWrapper):
+    """Build the input converter for a `_VectorStorage_*` kernel parameter.
+
+    The storage struct has a single field named "data"; the converter for
+    that field's type parses the user input, and the result is packed into
+    a singleton tuple.
+
+    Returns `(tuple, accept_vector)` and records the same pair in
+    `wrappedTypeMap[typename]`.
+    """
+    typeInfoFnName = f"__typeinfo__{typename}"
+    if hasattr(module, typeInfoFnName):
+        typeInfoFn = getattr(module, typeInfoFnName)
+        (fieldnames, fieldtypenames) = typeInfoFn()
+
+        # Vector types get converted into 'VectorStorage' types with an embedded array called "data".
+        # Our strategy here is to use the array wrapper to parse the data and pack it into a singleton tuple.
+        assert len(fieldnames) == 1
+        assert "data" in fieldnames
+
+        elementType = fieldtypenames[0]
+
+        # Get the wrapper for the element type
+        (_, innerArrayConvertFn) = makeTypeWrapper(module, elementType, wrappedTypeMap)
+
+        def accept_vector(inp):
+            return (innerArrayConvertFn(inp),)
+
+        wrappedTypeMap[typename] = (tuple, accept_vector)
+
+        return tuple, accept_vector
+
+
+def make_matrix_wrapper(module, typename, wrappedTypeMap, makeTypeWrapper):
+    """Build the input converter for a `_MatrixStorage_*` kernel parameter.
+
+    The matrix shape is read from the first 'NxM' token in `typename`. The
+    converter accepts a tuple of N row tuples with M entries each and
+    flattens them, row by row, into the struct's single "data" field.
+
+    Returns `(tuple, accept_matrix)` and records the same pair in
+    `wrappedTypeMap[typename]`.
+
+    Raises ValueError if `typename` contains no 'NxM' token.
+    """
+    typeInfoFnName = f"__typeinfo__{typename}"
+    if hasattr(module, typeInfoFnName):
+        typeInfoFn = getattr(module, typeInfoFnName)
+        (fieldnames, fieldtypenames) = typeInfoFn()
+
+        # Matrix types get converted into 'MatrixStorage' types with an embedded array called "data".
+        # Our strategy here is to flatten the rows and pack them into a singleton tuple.
+        assert len(fieldnames) == 1
+        assert "data" in fieldnames
+
+        # Parse the matrix dimensions from the first 'NxM' token in the typename.
+        m = re.search(r'(\d+)x(\d+)', typename)
+        if m is None:
+            raise ValueError(f"Could not parse matrix typename {typename}")
+        rows, cols = int(m.group(1)), int(m.group(2))
+
+        def accept_matrix(inp):
+            # Check that the input is a tuple of tuples
+            if not isinstance(inp, tuple):
+                raise ValueError(f"Expected tuple, got {type(inp)}")
+            if not all(isinstance(x, tuple) for x in inp):
+                raise ValueError(f"Expected tuple of tuples, got {inp}")
+            # The row count must match too; otherwise the flattened data
+            # would not have rows * cols entries.
+            if len(inp) != rows:
+                raise ValueError(f"Expected tuple of {rows} tuples, got {inp}")
+            if not all(len(x) == cols for x in inp):
+                raise ValueError(f"Expected tuple of tuples of length {cols}, got {inp}")
+
+            # Flatten the input into a single tuple and nest it in another tuple
+            return (sum(inp, ()),)
+
+        wrappedTypeMap[typename] = (tuple, accept_matrix)
+
+        return tuple, accept_matrix
+
 wrappers = {
     'DiffTensorView': make_diff_tensor_view_wrapper,
-    'Array_*': make_array_wrapper
+    'Array_*': make_array_wrapper,
+    '_VectorStorage_*': make_vector_wrapper,
+    '_MatrixStorage_*': make_matrix_wrapper,
 }
\ No newline at end of file
diff --git a/tests/builtin-type-input.slang b/tests/builtin-type-input.slang
new file mode 100644
index 0000000..f4ded76
--- /dev/null
+++ b/tests/builtin-type-input.slang
@@ -0,0 +1,37 @@
+[AutoPyBindCUDA]
+[CUDAKernel]
+void plain_copy_float3(float3 input, TensorView<float> output)
+{
+    // Get the 'global' index of this thread.
+    uint3 dispatchIdx = cudaThreadIdx() + cudaBlockIdx() * cudaBlockDim();
+
+    // Only the thread with global x index 0 writes; all others exit early.
+    if (dispatchIdx.x >= 1)
+        return;
+
+    output[0] = input.x;
+    output[1] = input.y;
+    output[2] = input.z;
+}
+
+[AutoPyBindCUDA]
+[CUDAKernel]
+void plain_copy_float3x3(float3x3 input, TensorView<float> output)
+{
+    // Get the 'global' index of this thread.
+    uint3 dispatchIdx = cudaThreadIdx() + cudaBlockIdx() * cudaBlockDim();
+
+    // Only the thread with global x index 0 writes; all others exit early.
+    if (dispatchIdx.x >= 1)
+        return;
+
+    output[0] = input[0][0];
+    output[1] = input[0][1];
+    output[2] = input[0][2];
+    output[3] = input[1][0];
+    output[4] = input[1][1];
+    output[5] = input[1][2];
+    output[6] = input[2][0];
+    output[7] = input[2][1];
+    output[8] = input[2][2];
+}
\ No newline at end of file
diff --git a/tests/test.py b/tests/test.py
index be17931..0639a32 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -469,3 +469,28 @@ def test_struct_failed_input(self):
         with self.assertRaises(TypeError):
             self.module.multiply(foo={'A': A, 'Ba': B}, result=Y).launchRaw(blockSize=(32, 32, 1), gridSize=(1, 1, 1))
 
+
+class TestBuiltinTypeInputs(unittest.TestCase):
+    def setUp(self) -> None:
+        super().setUp()
+        test_dir = os.path.dirname(os.path.abspath(__file__))
+        slangModuleSourceFile = os.path.join(test_dir, 'builtin-type-input.slang')
+
+        module = slangtorch.loadModule(slangModuleSourceFile)
+        self.module = module
+
+    def test_plain_vector_input(self):
+        Y = torch.tensor([0., 0., 0.]).cuda()
+
+        self.module.plain_copy_float3(input=(1.0, 2.0, 3.0), output=Y).launchRaw(blockSize=(32, 1, 1), gridSize=(1, 1, 1))
+        expected1 = torch.tensor([1., 2., 3.]).cpu()
+
+        self.assertTrue(torch.all(torch.eq(Y.cpu(), expected1)))
+
+    def test_plain_matrix_input(self):
+        Y = torch.tensor([0., 0., 0., 0., 0., 0., 0., 0., 0.]).cuda()
+
+        self.module.plain_copy_float3x3(input=((1.0, 2.0, 3.0), (4.0, 5.0, 6.0), (7.0, 8.0, 9.0)), output=Y).launchRaw(blockSize=(32, 1, 1), gridSize=(1, 1, 1))
+        expected1 = torch.tensor([1., 2., 3., 4., 5., 6., 7., 8., 9.]).cpu()
+
+        self.assertTrue(torch.all(torch.eq(Y.cpu(), expected1)))
\ No newline at end of file