[lang] Merge triplets in the same position when building GPU sparse matrix (taichi-dev#6605)

Issue: taichi-dev#2906 

### Brief Summary
When building a GPU sparse matrix, the cuSparse API requires three separate arrays: a row index pointer, a column index pointer, and a values pointer. However, the sparse matrix builder stores all triplets in a single ndarray with the memory layout [row, col, value, row, col, value, ...]. This PR retrieves the data from that ndarray, merges all triplets that fall on the same position of the sparse matrix, and stores the merged triplets in three separate arrays. These three arrays are then used to build the sparse matrix through the cuSparse API.
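
A minimal NumPy sketch of the merging step (an illustration with assumed names, not the PR's implementation, which is the C++ `build_cuda()` below):

```python
import numpy as np

def merge_triplets(flat: np.ndarray, num_triplets: int, num_cols: int):
    """Merge duplicate (row, col) entries from a flat [row, col, value, ...] buffer."""
    merged = {}  # row-major position -> (row, col, accumulated value)
    for i in range(num_triplets):
        row = int(flat[3 * i])
        col = int(flat[3 * i + 1])
        val = float(flat[3 * i + 2])
        key = row * num_cols + col
        r, c, v = merged.get(key, (row, col, 0.0))
        merged[key] = (r, c, v + val)
    # Iterating the keys in order yields COO arrays sorted by row, then column.
    rows = np.array([merged[k][0] for k in sorted(merged)], dtype=np.int32)
    cols = np.array([merged[k][1] for k in sorted(merged)], dtype=np.int32)
    vals = np.array([merged[k][2] for k in sorted(merged)], dtype=np.float32)
    return rows, cols, vals
```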
FantasyVR authored Nov 21, 2022
1 parent d05a0a3 commit fc8b6ec
Showing 9 changed files with 181 additions and 240 deletions.
40 changes: 0 additions & 40 deletions misc/test_build_cusm_from_coo.py

This file was deleted.

75 changes: 0 additions & 75 deletions misc/test_coo_cusolver.py

This file was deleted.

38 changes: 11 additions & 27 deletions python/taichi/linalg/sparse_matrix.py
@@ -1,12 +1,13 @@
from functools import reduce

import numpy as np
from taichi._lib import core as _ti_core
from taichi.lang._ndarray import Ndarray, ScalarNdarray
from taichi.lang.exception import TaichiRuntimeError
from taichi.lang.field import Field
from taichi.lang.impl import get_runtime
from taichi.lang.util import warning
from taichi.types import annotations, f32, i32
from taichi.types import annotations, f32


class SparseMatrix:
@@ -206,30 +207,6 @@ def build_from_ndarray(self, ndarray):
'Sparse matrix only supports building from [ti.ndarray, ti.Vector.ndarray, ti.Matrix.ndarray]'
)

def build_coo(self, row_coo, col_coo, value_coo):
"""Build a CSR format sparse matrix from COO format inputs.
Args:
row_indices (ti.ndarray): the row indices of the matrix entries.
col_indices (ti.ndarray): the column indices of the matrix entries.
data (ti.ndarray): the entries of the matrix.
Raises:
TaichiRuntimeError: If the inputs are not ``ti.ndarray`` or the datatypes of the ndarray are not correct.
"""
if not isinstance(row_coo, Ndarray) or not isinstance(
col_coo, Ndarray) or not isinstance(value_coo, Ndarray):
raise TaichiRuntimeError(
'Sparse matrix only supports COO format building from [ti.ndarray, ti.Vector.ndarray, ti.Matrix.ndarray].'
)
elif value_coo.dtype != f32 or row_coo.dtype != i32 or col_coo.dtype != i32:
raise TaichiRuntimeError(
'Sparse matrix only supports COO format building from float32 data and int32 row/col indices.')
)
else:
get_runtime().prog.make_sparse_matrix_from_ndarray_cusparse(
self.matrix, row_coo.arr, col_coo.arr, value_coo.arr)


class SparseMatrixBuilder:
"""A python wrap around sparse matrix builder.
@@ -270,8 +247,15 @@ def print_triplets(self):

def build(self, dtype=f32, _format='CSR'):
"""Create a sparse matrix using the triplets"""
sm = self.ptr.build()
return SparseMatrix(sm=sm)
taichi_arch = get_runtime().prog.config().arch
if taichi_arch == _ti_core.Arch.x64 or taichi_arch == _ti_core.Arch.arm64:
sm = self.ptr.build()
return SparseMatrix(sm=sm)
if taichi_arch == _ti_core.Arch.cuda:
sm = self.ptr.build_cuda()
return SparseMatrix(sm=sm)
raise TaichiRuntimeError(
'Sparse matrix only supports CPU and CUDA backends.')


# TODO: remove this in 1.0 release
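For context, the user-facing flow that `build()` now serves on both backends might look like this (a sketch against the public Taichi API; the `fill` kernel and sizes are illustrative):

```python
import taichi as ti

ti.init(arch=ti.cuda)  # or ti.cpu; build() dispatches per backend

n = 4
K = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)

@ti.kernel
def fill(A: ti.types.sparse_matrix_builder()):
    for i in range(n):
        A[i, i] += 2.0
        A[i, i] += 1.0  # duplicate position: merged into 3.0 at build time

fill(K)
A = K.build()  # x64/arm64 -> build(); CUDA -> build_cuda()
print(A)
```
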
2 changes: 1 addition & 1 deletion python/taichi/linalg/sparse_solver.py
@@ -51,7 +51,7 @@ def compute(self, sparse_matrix):
if isinstance(sparse_matrix, SparseMatrix):
self.matrix = sparse_matrix
taichi_arch = taichi.lang.impl.get_runtime().prog.config().arch
if taichi_arch == _ti_core.Arch.x64:
if taichi_arch == _ti_core.Arch.x64 or taichi_arch == _ti_core.Arch.arm64:
self.solver.compute(sparse_matrix.matrix)
elif taichi_arch == _ti_core.Arch.cuda:
self.analyze_pattern(self.matrix)
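`compute()` now treats arm64 like x64 on the CPU path, while CUDA still goes through analyze_pattern/factorize. A usage sketch continuing the builder example above (the right-hand-side container varies by backend and Taichi version, so treat `b` as illustrative):

```python
b = ti.ndarray(ti.f32, shape=n)
b.fill(1.0)

solver = ti.linalg.SparseSolver(solver_type="LLT")
solver.compute(A)   # CPU: factorizes directly; CUDA: analyze_pattern + factorize
x = solver.solve(b)
print(x)
```
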
98 changes: 75 additions & 23 deletions taichi/program/sparse_matrix.cpp
@@ -1,5 +1,6 @@
#include "taichi/program/sparse_matrix.h"

#include <map>
#include <sstream>
#include <string>
#include <unordered_map>
@@ -145,6 +146,58 @@ std::unique_ptr<SparseMatrix> SparseMatrixBuilder::build() {
return sm;
}

std::unique_ptr<SparseMatrix> SparseMatrixBuilder::build_cuda() {
TI_ASSERT(built_ == false);
built_ = true;
auto sm = make_cu_sparse_matrix(rows_, cols_, dtype_);
#ifdef TI_WITH_CUDA
num_triplets_ = ndarray_data_base_ptr_->read_int(std::vector<int>{0});
std::map<int, std::tuple<int, int, float32>> entries;
for (auto i = 0; i < num_triplets_; i++) {
auto idx = 3 * i + 1;
auto row = ndarray_data_base_ptr_->read_int(std::vector<int>{idx});
auto col = ndarray_data_base_ptr_->read_int(std::vector<int>{idx + 1});
auto val = ndarray_data_base_ptr_->read_float(std::vector<int>{idx + 2});
auto e_idx = row * cols_ + col;
if (entries.find(e_idx) == entries.end()) {
entries[e_idx] = std::make_tuple(row, col, val);
} else {
auto [r, c, v] = entries[e_idx];
entries[e_idx] = std::make_tuple(r, c, v + val);
}
}
auto entry_size = entries.size();
int *row_host = (int *)malloc(sizeof(int) * entry_size);
int *col_host = (int *)malloc(sizeof(int) * entry_size);
float32 *value_host = (float32 *)malloc(sizeof(float32) * entry_size);
int count = 0;
for (auto entry : entries) {
auto [row, col, value] = entry.second;
row_host[count] = row;
col_host[count] = col;
value_host[count] = value;
count++;
}
void *row_device = nullptr, *col_device = nullptr, *value_device = nullptr;
CUDADriver::get_instance().malloc(&row_device, entry_size * sizeof(int));
CUDADriver::get_instance().malloc(&col_device, entry_size * sizeof(int));
CUDADriver::get_instance().malloc(&value_device,
entry_size * sizeof(float32));
CUDADriver::get_instance().memcpy_host_to_device(row_device, (void *)row_host,
entry_size * sizeof(int));
CUDADriver::get_instance().memcpy_host_to_device(col_device, (void *)col_host,
entry_size * sizeof(int));
CUDADriver::get_instance().memcpy_host_to_device(
value_device, (void *)value_host, entry_size * sizeof(float32));
sm->build_csr_from_coo(row_device, col_device, value_device, entry_size);
clear();
free(row_host);
free(col_host);
free(value_host);
#endif
return sm;
}
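
A note on the container choice above: because std::map is ordered and the key is `row * cols_ + col`, iterating `entries` emits the merged triplets in row-major order, so the COO arrays passed to `build_csr_from_coo()` are already sorted by row. An `std::unordered_map` would merge just as well but would lose that ordering.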

void SparseMatrixBuilder::clear() {
built_ = false;
ndarray_data_base_ptr_->write_int(std::vector<int>{0}, 0);
@@ -286,14 +339,20 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr,
&matrix_, rows_, cols_, nnz, csr_row_offset_ptr, coo_col_ptr,
coo_values_ptr, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
CUSPARSEDriver::get_instance().cpDestroySpVec(vec_permutation);
CUSPARSEDriver::get_instance().cpDestroyDnVec(vec_values);
CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle);
// TODO: free csr_row_offset_ptr
// CUDADriver::get_instance().mem_free(csr_row_offset_ptr);
CUDADriver::get_instance().mem_free(d_values_sorted);
CUDADriver::get_instance().mem_free(d_permutation);
CUDADriver::get_instance().mem_free(dbuffer);
if (vec_permutation)
CUSPARSEDriver::get_instance().cpDestroySpVec(vec_permutation);
if (vec_values)
CUSPARSEDriver::get_instance().cpDestroyDnVec(vec_values);
if (cusparse_handle)
CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle);
if (coo_row_ptr)
CUDADriver::get_instance().mem_free(coo_row_ptr);
if (d_values_sorted)
CUDADriver::get_instance().mem_free(d_values_sorted);
if (d_permutation)
CUDADriver::get_instance().mem_free(d_permutation);
if (dbuffer)
CUDADriver::get_instance().mem_free(dbuffer);
csr_row_ptr_ = csr_row_offset_ptr;
csr_col_ind_ = coo_col_ptr;
csr_val_ = coo_values_ptr;
@@ -303,21 +362,14 @@

CuSparseMatrix::~CuSparseMatrix() {
#if defined(TI_WITH_CUDA)
CUSPARSEDriver::get_instance().cpDestroySpMat(matrix_);
#endif
}
void make_sparse_matrix_from_ndarray_cusparse(Program *prog,
SparseMatrix &sm,
const Ndarray &row_coo,
const Ndarray &col_coo,
const Ndarray &val_coo) {
#if defined(TI_WITH_CUDA)
size_t coo_row_ptr = prog->get_ndarray_data_ptr_as_int(&row_coo);
size_t coo_col_ptr = prog->get_ndarray_data_ptr_as_int(&col_coo);
size_t coo_val_ptr = prog->get_ndarray_data_ptr_as_int(&val_coo);
int nnz = val_coo.get_nelement();
sm.build_csr_from_coo((void *)coo_row_ptr, (void *)coo_col_ptr,
(void *)coo_val_ptr, nnz);
if (matrix_)
CUSPARSEDriver::get_instance().cpDestroySpMat(matrix_);
if (csr_row_ptr_)
CUDADriver::get_instance().mem_free(csr_row_ptr_);
if (csr_col_ind_)
CUDADriver::get_instance().mem_free(csr_col_ind_);
if (csr_val_)
CUDADriver::get_instance().mem_free(csr_val_);
#endif
}

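A note on the destructor change: `build_csr_from_coo()` keeps ownership of the CSR buffers it produces (`csr_row_ptr_`, `csr_col_ind_`, `csr_val_`), and `~CuSparseMatrix()` now frees them along with the cusparse descriptor, resolving the earlier `// TODO: free csr_row_offset_ptr` leak.
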
9 changes: 3 additions & 6 deletions taichi/program/sparse_matrix.h
@@ -28,6 +28,8 @@ class SparseMatrixBuilder {

std::unique_ptr<SparseMatrix> build();

std::unique_ptr<SparseMatrix> build_cuda();

void clear();

private:
@@ -287,7 +289,7 @@ class CuSparseMatrix : public SparseMatrix {
}

private:
cusparseSpMatDescr_t matrix_;
cusparseSpMatDescr_t matrix_{nullptr};
void *csr_row_ptr_{nullptr};
void *csr_col_ind_{nullptr};
void *csr_val_{nullptr};
@@ -310,9 +312,4 @@ std::unique_ptr<SparseMatrix> make_cu_sparse_matrix(cusparseSpMatDescr_t mat,
void make_sparse_matrix_from_ndarray(Program *prog,
SparseMatrix &sm,
const Ndarray &ndarray);
void make_sparse_matrix_from_ndarray_cusparse(Program *prog,
SparseMatrix &sm,
const Ndarray &row_indices,
const Ndarray &col_indices,
const Ndarray &values);
} // namespace taichi::lang
10 changes: 1 addition & 9 deletions taichi/python/export_lang.cpp
@@ -421,15 +421,6 @@ void export_lang(py::module &m) {
"SparseMatrix only supports CPU and CUDA for now.");
return make_sparse_matrix_from_ndarray(program, sm, ndarray);
})
.def("make_sparse_matrix_from_ndarray_cusparse",
[](Program *program, CuSparseMatrix &sm, const Ndarray &row_coo,
const Ndarray &col_coo, const Ndarray &val_coo) {
TI_ERROR_IF(
!arch_is_cuda(program->this_thread_config().arch),
"SparseMatrix based on GPU only supports CUDA for now.");
return make_sparse_matrix_from_ndarray_cusparse(
program, sm, row_coo, col_coo, val_coo);
})
.def("no_activate",
[](Program *program, SNode *snode) {
// TODO(#2193): Also apply to @ti.func?
@@ -1199,6 +1190,7 @@
.def("print_triplets", &SparseMatrixBuilder::print_triplets)
.def("get_ndarray_data_ptr", &SparseMatrixBuilder::get_ndarray_data_ptr)
.def("build", &SparseMatrixBuilder::build)
.def("build_cuda", &SparseMatrixBuilder::build_cuda)
.def("get_addr", [](SparseMatrixBuilder *mat) { return uint64(mat); });

py::class_<SparseMatrix>(m, "SparseMatrix")