Skip to content

Commit

Permalink
[lang] Reorder sparse matrix before solving (taichi-dev#6886)
Browse files Browse the repository at this point in the history
Issue: taichi-dev#2906 

### Brief Summary
When solving a sparse linear system without reordering the system
matrix, `analyze_pattern` and `factorize` would consume a lot of GPU
memory. For example, the `res` in the `stable fluid` example cannot be
larger than 500. After reordering the sparse system matrix, we can set
`res=512` now.

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
FantasyVR and pre-commit-ci[bot] authored Dec 20, 2022
1 parent 9cbc59a commit e9e8edd
Show file tree
Hide file tree
Showing 5 changed files with 150 additions and 20 deletions.
2 changes: 1 addition & 1 deletion python/taichi/examples/simulation/stable_fluid.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
help='The arch (backend) to run this example on')
args, unknowns = parser.parse_known_args()

res = 256
res = 512
dt = 0.03
p_jacobi_iters = 500 # 40 for a quicker but less accurate result
f_strength = 10000.0
Expand Down
153 changes: 135 additions & 18 deletions taichi/program/sparse_solver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,9 +199,11 @@ void CuSparseSolver::analyze_pattern(const SparseMatrix &sm) {
SparseMatrix *sm_no_cv = const_cast<SparseMatrix *>(&sm);
CuSparseMatrix *A = dynamic_cast<CuSparseMatrix *>(sm_no_cv);
size_t rowsA = A->num_rows();
size_t colsA = A->num_cols();
size_t nnzA = A->get_nnz();
void *d_csrRowPtrA = A->get_row_ptr();
void *d_csrColIndA = A->get_col_ind();
void *d_csrValA = A->get_val_ptr();
CUSOLVERDriver::get_instance().csSpCreate(&cusolver_handle_);
CUSPARSEDriver::get_instance().cpCreate(&cusparse_handel_);
CUSPARSEDriver::get_instance().cpCreateMatDescr(&descr_);
Expand All @@ -210,12 +212,68 @@ void CuSparseSolver::analyze_pattern(const SparseMatrix &sm) {
CUSPARSEDriver::get_instance().cpSetMatIndexBase(descr_,
CUSPARSE_INDEX_BASE_ZERO);

// step 1: create opaque info structure
// step 1: reorder the sparse matrix
float *h_csrValA = nullptr;
h_Q = (int *)malloc(sizeof(int) * colsA);
h_csrRowPtrB = (int *)malloc(sizeof(int) * (rowsA + 1));
h_csrColIndB = (int *)malloc(sizeof(int) * nnzA);
h_csrValB = (float *)malloc(sizeof(float) * nnzA);
h_csrValA = (float *)malloc(sizeof(float) * nnzA);
h_mapBfromA = (int *)malloc(sizeof(int) * nnzA);
assert(nullptr != h_Q);
assert(nullptr != h_csrRowPtrB);
assert(nullptr != h_csrColIndB);
assert(nullptr != h_csrValB);
assert(nullptr != h_mapBfromA);

CUDADriver::get_instance().memcpy_device_to_host(h_csrRowPtrB, d_csrRowPtrA,
sizeof(int) * (rowsA + 1));
CUDADriver::get_instance().memcpy_device_to_host(h_csrColIndB, d_csrColIndA,
sizeof(int) * nnzA);
CUDADriver::get_instance().memcpy_device_to_host(h_csrValA, d_csrValA,
sizeof(float) * nnzA);

// compute h_Q
CUSOLVERDriver::get_instance().csSpXcsrsymamdHost(
cusolver_handle_, rowsA, nnzA, descr_, h_csrRowPtrB, h_csrColIndB, h_Q);
CUDADriver::get_instance().malloc((void **)&d_Q, sizeof(int) * colsA);
CUDADriver::get_instance().memcpy_host_to_device((void *)d_Q, (void *)h_Q,
sizeof(int) * (colsA));
size_t size_perm = 0;
CUSOLVERDriver::get_instance().csSpXcsrperm_bufferSizeHost(
cusolver_handle_, rowsA, colsA, nnzA, descr_, h_csrRowPtrB, h_csrColIndB,
h_Q, h_Q, &size_perm);
void *buffer_cpu = (void *)malloc(sizeof(char) * size_perm);
assert(nullptr != buffer_cpu);
for (int j = 0; j < nnzA; j++) {
h_mapBfromA[j] = j;
}
CUSOLVERDriver::get_instance().csSpXcsrpermHost(
cusolver_handle_, rowsA, colsA, nnzA, descr_, h_csrRowPtrB, h_csrColIndB,
h_Q, h_Q, h_mapBfromA, buffer_cpu);
// B = A( mapBfromA )
for (int j = 0; j < nnzA; j++) {
h_csrValB[j] = h_csrValA[h_mapBfromA[j]];
}
CUDADriver::get_instance().malloc((void **)&d_csrRowPtrB,
sizeof(int) * (rowsA + 1));
CUDADriver::get_instance().malloc((void **)&d_csrColIndB, sizeof(int) * nnzA);
CUDADriver::get_instance().malloc((void **)&d_csrValB, sizeof(float) * nnzA);
CUDADriver::get_instance().memcpy_host_to_device(
(void *)d_csrRowPtrB, (void *)h_csrRowPtrB, sizeof(int) * (rowsA + 1));
CUDADriver::get_instance().memcpy_host_to_device(
(void *)d_csrColIndB, (void *)h_csrColIndB, sizeof(int) * nnzA);
CUDADriver::get_instance().memcpy_host_to_device(
(void *)d_csrValB, (void *)h_csrValB, sizeof(float) * nnzA);
free(h_csrValA);
free(buffer_cpu);

// step 2: create opaque info structure
CUSOLVERDriver::get_instance().csSpCreateCsrcholInfo(&info_);

// step 2: analyze chol(A) to know structure of L
// step 3: analyze chol(A) to know structure of L
CUSOLVERDriver::get_instance().csSpXcsrcholAnalysis(
cusolver_handle_, rowsA, nnzA, descr_, d_csrRowPtrA, d_csrColIndA, info_);
cusolver_handle_, rowsA, nnzA, descr_, d_csrRowPtrB, d_csrColIndB, info_);
is_analyzed_ = true;
#else
TI_NOT_IMPLEMENTED
Expand All @@ -229,26 +287,23 @@ void CuSparseSolver::factorize(const SparseMatrix &sm) {
CuSparseMatrix *A = dynamic_cast<CuSparseMatrix *>(sm_no_cv);
size_t rowsA = A->num_rows();
size_t nnzA = A->get_nnz();
void *d_csrRowPtrA = A->get_row_ptr();
void *d_csrColIndA = A->get_col_ind();
void *d_csrValA = A->get_val_ptr();

size_t size_internal = 0;
size_t size_chol = 0; // size of working space for csrlu
// step 1: workspace for chol(A)
CUSOLVERDriver::get_instance().csSpScsrcholBufferInfo(
cusolver_handle_, rowsA, nnzA, descr_, d_csrValA, d_csrRowPtrA,
d_csrColIndA, info_, &size_internal, &size_chol);
cusolver_handle_, rowsA, nnzA, descr_, d_csrValB, d_csrRowPtrB,
d_csrColIndB, info_, &size_internal, &size_chol);

if (size_chol > 0)
CUDADriver::get_instance().malloc(&gpu_buffer_, sizeof(char) * size_chol);

// step 2: compute A = L*L^T
CUSOLVERDriver::get_instance().csSpScsrcholFactor(
cusolver_handle_, rowsA, nnzA, descr_, d_csrValA, d_csrRowPtrA,
d_csrColIndA, info_, gpu_buffer_);
cusolver_handle_, rowsA, nnzA, descr_, d_csrValB, d_csrRowPtrB,
d_csrColIndB, info_, gpu_buffer_);
// step 3: check if the matrix is singular
const double tol = 1.e-14;
const float tol = 1.e-14;
int singularity = 0;
CUSOLVERDriver::get_instance().csSpScsrcholZeroPivot(cusolver_handle_, info_,
tol, &singularity);
Expand Down Expand Up @@ -439,16 +494,46 @@ void CuSparseSolver::solve_rf(Program *prog,
SparseMatrix *sm_no_cv = const_cast<SparseMatrix *>(&sm);
CuSparseMatrix *A = dynamic_cast<CuSparseMatrix *>(sm_no_cv);
size_t rowsA = A->num_rows();
size_t colsA = A->num_cols();
size_t d_b = prog->get_ndarray_data_ptr_as_int(&b);
size_t d_x = prog->get_ndarray_data_ptr_as_int(&x);
CUSOLVERDriver::get_instance().csSpScsrcholSolve(
cusolver_handle_, rowsA, (void *)d_b, (void *)d_x, info_, gpu_buffer_);

// TODO: free allocated memory and handles
// CUDADriver::get_instance().mem_free(gpu_buffer_);
// CUSOLVERDriver::get_instance().csSpDestory(cusolver_handle_);
// CUSPARSEDriver::get_instance().cpDestroy(cusparse_handel_);
// CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrA);
// step 1: d_Qb = Q * b
void *d_Qb = nullptr;
CUDADriver::get_instance().malloc(&d_Qb, sizeof(float) * rowsA);
cusparseDnVecDescr_t vec_b;
cusparseSpVecDescr_t vec_Qb;
CUSPARSEDriver::get_instance().cpCreateDnVec(&vec_b, (int)rowsA, (void *)d_b,
CUDA_R_32F);
CUSPARSEDriver::get_instance().cpCreateSpVec(
&vec_Qb, (int)rowsA, (int)rowsA, d_Q, d_Qb, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
CUSPARSEDriver::get_instance().cpGather(cusparse_handel_, vec_b, vec_Qb);

// step 2: solve B*z = Q*b using cholesky solver
void *d_z = nullptr;
CUDADriver::get_instance().malloc(&d_z, sizeof(float) * colsA);
CUSOLVERDriver::get_instance().csSpScsrcholSolve(
cusolver_handle_, rowsA, (void *)d_Qb, (void *)d_z, info_, gpu_buffer_);

// step 3: Q*x = z
cusparseSpVecDescr_t vecX;
cusparseDnVecDescr_t vecY;
CUSPARSEDriver::get_instance().cpCreateSpVec(
&vecX, (int)colsA, (int)colsA, d_Q, d_z, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
CUSPARSEDriver::get_instance().cpCreateDnVec(&vecY, (int)colsA, (void *)d_x,
CUDA_R_32F);
CUSPARSEDriver::get_instance().cpScatter(cusparse_handel_, vecX, vecY);

if (d_Qb != nullptr)
CUDADriver::get_instance().mem_free(d_Qb);
if (d_z != nullptr)
CUDADriver::get_instance().mem_free(d_z);
CUSPARSEDriver::get_instance().cpDestroySpVec(vec_Qb);
CUSPARSEDriver::get_instance().cpDestroyDnVec(vec_b);
CUSPARSEDriver::get_instance().cpDestroySpVec(vecX);
CUSPARSEDriver::get_instance().cpDestroyDnVec(vecY);
#else
TI_NOT_IMPLEMENTED
#endif
Expand Down Expand Up @@ -492,6 +577,38 @@ std::unique_ptr<SparseSolver> make_sparse_solver(DataType dt,
TI_ERROR("Not supported sparse solver type: {}", solver_type);
}

// Releases every host- and device-side resource acquired by
// analyze_pattern()/factorize(). Host arrays were obtained with malloc()
// and device arrays with CUDADriver::malloc(), so they are paired with
// free() and mem_free() respectively. All members are nullptr-initialized,
// making the destructor safe to run even if analysis never happened.
CuSparseSolver::~CuSparseSolver() {
#if defined(TI_WITH_CUDA)
  // Host-side buffers holding the AMD permutation and the reordered CSR
  // matrix B (B = Q*A*Q' per the header comment on h_Q).
  if (h_Q != nullptr)
    free(h_Q);
  if (h_csrRowPtrB != nullptr)
    free(h_csrRowPtrB);
  if (h_csrColIndB != nullptr)
    free(h_csrColIndB);
  if (h_csrValB != nullptr)
    free(h_csrValB);
  if (h_mapBfromA != nullptr)
    free(h_mapBfromA);
  // Destroy the Cholesky info object before the cuSOLVER handle it was
  // created from, then the cuSPARSE handle and matrix descriptor.
  // NOTE: csSpDestory / cusparse_handel_ are pre-existing (misspelled)
  // identifiers from the driver wrappers; keep them as-is.
  if (info_ != nullptr)
    CUSOLVERDriver::get_instance().csSpDestroyCsrcholInfo(info_);
  if (cusolver_handle_ != nullptr)
    CUSOLVERDriver::get_instance().csSpDestory(cusolver_handle_);
  if (cusparse_handel_ != nullptr)
    CUSPARSEDriver::get_instance().cpDestroy(cusparse_handel_);
  if (descr_ != nullptr)
    CUSPARSEDriver::get_instance().cpDestroyMatDescr(descr_);
  // Device-side buffers: the Cholesky working space, the device copy of the
  // permutation Q, and the reordered CSR matrix used for factorization.
  if (gpu_buffer_ != nullptr)
    CUDADriver::get_instance().mem_free(gpu_buffer_);
  if (d_Q != nullptr)
    CUDADriver::get_instance().mem_free(d_Q);
  if (d_csrRowPtrB != nullptr)
    CUDADriver::get_instance().mem_free(d_csrRowPtrB);
  if (d_csrColIndB != nullptr)
    CUDADriver::get_instance().mem_free(d_csrColIndB);
  if (d_csrValB != nullptr)
    CUDADriver::get_instance().mem_free(d_csrValB);
#endif
}
std::unique_ptr<SparseSolver> make_cusparse_solver(
DataType dt,
const std::string &solver_type,
Expand Down
11 changes: 10 additions & 1 deletion taichi/program/sparse_solver.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,18 @@ class CuSparseSolver : public SparseSolver {
bool is_analyzed_{false};
bool is_factorized_{false};

int *h_Q{nullptr}; /* <int> n, B = Q*A*Q' or B = A(Q,Q) by MATLAB notation */
int *d_Q{nullptr};
int *h_csrRowPtrB{nullptr}; /* <int> n+1 */
int *h_csrColIndB{nullptr}; /* <int> nnzA */
float *h_csrValB{nullptr}; /* <float> nnzA */
int *h_mapBfromA{nullptr}; /* <int> nnzA */
int *d_csrRowPtrB{nullptr}; /* <int> n+1 */
int *d_csrColIndB{nullptr}; /* <int> nnzA */
float *d_csrValB{nullptr}; /* <float> nnzA */
public:
CuSparseSolver();
~CuSparseSolver() override = default;
~CuSparseSolver() override;
bool compute(const SparseMatrix &sm) override {
TI_NOT_IMPLEMENTED;
};
Expand Down
2 changes: 2 additions & 0 deletions taichi/rhi/cuda/cusolver_functions.inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@ PER_CUSOLVER_FUNCTION(csSpCreate, cusolverSpCreate, cusolverSpHandle_t * );
PER_CUSOLVER_FUNCTION(csSpDestory, cusolverSpDestroy, cusolverSpHandle_t );
PER_CUSOLVER_FUNCTION(csSpXcsrissymHost, cusolverSpXcsrissymHost, cusolverSpHandle_t,int ,int ,const cusparseMatDescr_t ,const void *,const void *,const void *,void *);
PER_CUSOLVER_FUNCTION(csSpXcsrsymrcmHost, cusolverSpXcsrsymrcmHost, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,const void *,const void *,void *);
PER_CUSOLVER_FUNCTION(csSpXcsrsymamdHost, cusolverSpXcsrsymamdHost, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,const void *,const void *,void *);
PER_CUSOLVER_FUNCTION(csSpXcsrperm_bufferSizeHost, cusolverSpXcsrperm_bufferSizeHost, cusolverSpHandle_t ,int ,int ,int ,const cusparseMatDescr_t ,void *,void *,const void *,const void *,size_t *);
PER_CUSOLVER_FUNCTION(csSpXcsrpermHost, cusolverSpXcsrpermHost, cusolverSpHandle_t ,int ,int ,int ,const cusparseMatDescr_t ,void *,void *,const void *,const void *,void *,void *);
PER_CUSOLVER_FUNCTION(csSpScsrlsvcholHost, cusolverSpScsrlsvcholHost, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,const void *,const void *,const void *,const void *,float ,int ,void *,void *);

// cusolver preview API
PER_CUSOLVER_FUNCTION(csSpCreateCsrcholInfo, cusolverSpCreateCsrcholInfo, csrcholInfo_t*);
PER_CUSOLVER_FUNCTION(csSpDestroyCsrcholInfo, cusolverSpDestroyCsrcholInfo, csrcholInfo_t);
PER_CUSOLVER_FUNCTION(csSpXcsrcholAnalysis, cusolverSpXcsrcholAnalysis, cusolverSpHandle_t,int ,int ,const cusparseMatDescr_t , void *, void *,csrcholInfo_t );
PER_CUSOLVER_FUNCTION(csSpScsrcholBufferInfo, cusolverSpScsrcholBufferInfo, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,void *,void *,void *,csrcholInfo_t ,size_t *,size_t *);
PER_CUSOLVER_FUNCTION(csSpScsrcholFactor, cusolverSpScsrcholFactor, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,void *,void *,void *,csrcholInfo_t ,void *);
Expand Down
2 changes: 2 additions & 0 deletions taichi/rhi/cuda/cusparse_functions.inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ PER_CUSPARSE_FUNCTION(cpCreateIdentityPermutation, cusparseCreateIdentityPermuta
PER_CUSPARSE_FUNCTION(cpXcoosort_bufferSizeExt, cusparseXcoosort_bufferSizeExt, cusparseHandle_t,int ,int,int, void* ,void* ,void*);
PER_CUSPARSE_FUNCTION(cpXcoosortByRow, cusparseXcoosortByRow, cusparseHandle_t,int,int,int,void* ,void* ,void* ,void*);
PER_CUSPARSE_FUNCTION(cpGather, cusparseGather, cusparseHandle_t, cusparseDnVecDescr_t, cusparseSpVecDescr_t);
PER_CUSPARSE_FUNCTION(cpScatter, cusparseScatter, cusparseHandle_t, cusparseSpVecDescr_t, cusparseDnVecDescr_t);
PER_CUSPARSE_FUNCTION(cpSsctr, cusparseSsctr, cusparseHandle_t, int, void*, void*, void*, cusparseIndexBase_t);
PER_CUSPARSE_FUNCTION(cpSetPointerMode, cusparseSetPointerMode, cusparseHandle_t, cusparsePointerMode_t);
PER_CUSPARSE_FUNCTION(cpCsrSetPointers, cusparseCsrSetPointers, cusparseSpMatDescr_t,void*, void*, void*);

Expand Down

0 comments on commit e9e8edd

Please sign in to comment.