Skip to content

Commit

Permalink
Merge pull request kokkos#50 from kokkos/cuda-spgemm-fix
Browse files Browse the repository at this point in the history
Cuda spgemm fix
  • Loading branch information
crtrott authored Aug 22, 2017
2 parents 8cdbd31 + 6c21bd8 commit e323bcb
Show file tree
Hide file tree
Showing 11 changed files with 77 additions and 37 deletions.
11 changes: 6 additions & 5 deletions src/common/KokkosKernels_IOUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
#ifndef _KOKKOSKERNELSIOUTILS_HPP
#define _KOKKOSKERNELSIOUTILS_HPP


#include "Kokkos_ArithTraits.hpp"
#include <Kokkos_Core.hpp>
#include "KokkosKernels_SimpleUtils.hpp"
#include <sys/stat.h>
Expand Down Expand Up @@ -188,19 +188,19 @@ void kk_diagonally_dominant_sparseMatrix_generate(
ScalarType total_values = 0;
for(SizeType k=rowPtr[row] ;k<rowPtr[row+1] - 1;k++)
{
OrdinalType pos = -1;
OrdinalType pos = row;
while (pos == row){
pos = (1.0*rand()/INT_MAX-0.5)*bandwidth+row;
pos = ((1.0*rand())/INT_MAX-0.5)*bandwidth+row;
}
if(pos<0) pos+=ncols;

if(pos>=ncols) pos-=ncols;
colInd[k]= pos;
values[k] = 100.0*rand()/INT_MAX-50.0;
total_values += values[k];
total_values += Kokkos::Details::ArithTraits<ScalarType>::abs(values[k]);
}

colInd[rowPtr[row+1] - 1]= row;

values[rowPtr[row+1] - 1] = total_values * temp;
}
}
Expand Down Expand Up @@ -359,6 +359,7 @@ crsMat_t kk_generate_sparse_matrix(
Kokkos::deep_copy (rowmap_view , hr);
Kokkos::deep_copy (columns_view , hc);
Kokkos::deep_copy (values_view , hv);
Kokkos::fence();
}

graph_t static_graph (columns_view, rowmap_view);
Expand Down
2 changes: 2 additions & 0 deletions src/common/KokkosKernels_SparseUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,7 @@ void kk_sort_graph(
Kokkos::deep_copy (he, in_adj);
typename scalar_view_t::HostMirror hv = Kokkos::create_mirror_view (in_vals);
Kokkos::deep_copy (hv, in_vals);
MyExecSpace::fence();

typename lno_nnz_view_t::HostMirror heo = Kokkos::create_mirror_view (out_adj);
typename scalar_view_t::HostMirror hvo = Kokkos::create_mirror_view (out_vals);
Expand Down Expand Up @@ -631,6 +632,7 @@ void kk_sort_graph(

Kokkos::deep_copy (out_adj, heo);
Kokkos::deep_copy (out_vals, hvo);
MyExecSpace::fence();
}
else {

Expand Down
13 changes: 6 additions & 7 deletions src/common/KokkosKernels_Uniform_Initialized_MemoryPool.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,7 @@ template <typename MyExecSpace, typename data_type>
class UniformMemoryPool{

private:
typedef bool lock_type;
//typedef typename Kokkos::View <size_t *, MyExecSpace> index_view_t;
typedef int lock_type;
typedef typename Kokkos::View <lock_type *, MyExecSpace> lock_view_t;
typedef typename Kokkos::View <data_type *, MyExecSpace> data_view_t;

Expand Down Expand Up @@ -203,9 +202,9 @@ class UniformMemoryPool{
data_view = data_view_t(Kokkos::ViewAllocateWithoutInitializing("pool data"), overall_size),
data = (data_view.ptr_on_device()),


this->set_pool_type(pool_type_);
Kokkos::deep_copy(data_view, initialized_value);

}

/**
Expand Down Expand Up @@ -256,6 +255,7 @@ class UniformMemoryPool{
std::cout << "num_chunks:" << num_chunks << std::endl;
std::cout << "chunk_size:" << chunk_size << std::endl;
std::cout << "overall_size:" << overall_size << std::endl;
std::cout << "modular_num_chunks:" << modular_num_chunks << std::endl;

//std::cout << "Printing free_chunks view" << std::endl;
//print_1Dview(free_chunks, print_all);
Expand Down Expand Up @@ -293,11 +293,9 @@ class UniformMemoryPool{

KOKKOS_INLINE_FUNCTION
data_type *get_arbitrary_free_chunk(const size_t &thread_index, const size_t max_tries) const{
//size_t chunk_index = thread_index % num_chunks;
size_t chunk_index = thread_index & modular_num_chunks;
size_t num_try = 0;
while(!Kokkos::atomic_compare_exchange_strong(pchunk_locks + chunk_index, false, true)){
//chunk_index = (chunk_index + 1) % num_chunks;
while(!Kokkos::atomic_compare_exchange_strong(pchunk_locks + chunk_index, 0, 1)){
chunk_index = (chunk_index + 1) & modular_num_chunks;
++num_try;
if (num_try > max_tries){
Expand Down Expand Up @@ -335,7 +333,8 @@ class UniformMemoryPool{
void release_arbitrary_chunk(const data_type *chunk_ptr) const{
size_t alloc_index = (chunk_ptr - data) / chunk_size;
//printf("release:%ld #chunks:%ld\n", alloc_index, num_chunks);
chunk_locks(alloc_index) = false;
//chunk_locks(alloc_index) = false;
chunk_locks(alloc_index) = 0;
}

/**
Expand Down
5 changes: 5 additions & 0 deletions src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -361,10 +361,15 @@ class GaussSeidel{
for (nnz_lno_t i = 0; i < numColors; ++i){
nnz_lno_t color_index_begin = h_color_xadj(i);
nnz_lno_t color_index_end = h_color_xadj(i + 1);

if (color_index_begin + 1 >= color_index_end ) continue;
auto colorsubset =
subview(color_adj, Kokkos::pair<row_lno_t, row_lno_t> (color_index_begin, color_index_end));
MyExecSpace::fence();
Kokkos::sort (colorsubset);
//TODO: MD 08/2017: If I remove the fence below, the code fails on CUDA.
//I do not yet see any reason for it to fail.
MyExecSpace::fence();
}
}
#endif
Expand Down
2 changes: 1 addition & 1 deletion src/sparse/impl/KokkosSparse_spgemm_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ class KokkosSPGEMM{
bool transposeB;

const size_t shmem_size;
const size_t concurrency;
size_t concurrency;
const bool use_dynamic_schedule;
const bool KOKKOSKERNELS_VERBOSE;
//const int KOKKOSKERNELS_VERBOSE = 1;
Expand Down
7 changes: 5 additions & 2 deletions src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
// ************************************************************************
//@HEADER
*/

namespace KokkosSparse{

namespace Impl{
Expand Down Expand Up @@ -634,6 +633,7 @@ void
chunksize += min_hash_size ; //this is for the hash begins
chunksize += max_nnz; //this is for hash nexts
int num_chunks = concurrency / suggested_vector_size;

#ifdef KOKKOSKERNELSCHANGEPARAMS

if (env_hash > 2) {
Expand All @@ -649,7 +649,9 @@ num_chunks = env_num_chunks;
if (KOKKOSKERNELS_VERBOSE){
std::cout << "\t\t max_nnz: " << max_nnz
<< " chunk_size:" << chunksize
<< " min_hash_size:" << min_hash_size
<< " min_hash_size:" << min_hash_size
<< " concurrency:" << concurrency
<< " MyExecSpace::concurrency():" << MyExecSpace::concurrency()
<< " numchunks:" << num_chunks << std::endl;
}

Expand All @@ -667,6 +669,7 @@ num_chunks = env_num_chunks;
std::cout << "\t\tPool Alloc Time:" << timer1.seconds() << std::endl;
std::cout << "\t\tPool Size(MB):" <<
sizeof (nnz_lno_t) * (num_chunks * chunksize) / 1024. / 1024. << std::endl;
m_space.print_memory_pool();
}

PortableNumericCHASH<
Expand Down
4 changes: 2 additions & 2 deletions unit_test/sparse/Test_Sparse_findRelOffset.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,12 @@ namespace Test{ // (anonymous)
nIVT indsToSearch_view("indsToSearch", numEnt);
typename nIVT::HostMirror h_indsToSearch_view = Kokkos::create_mirror_view (indsToSearch_view);
for (int i = 0; i < numEnt; ++i) {
std::cout << "indsToSearch[i]:" << indsToSearch[i] << std::endl;
//std::cout << "indsToSearch[i]:" << indsToSearch[i] << std::endl;
h_indsToSearch_view(i) = indsToSearch[i];
}
Kokkos::deep_copy(indsToSearch_view, h_indsToSearch_view);
Kokkos::fence();
KokkosKernels::Impl::kk_print_1Dview(indsToSearch_view);
//KokkosKernels::Impl::kk_print_1Dview(indsToSearch_view);
const bool isSorted = true;

for (lno_t hint = 0; hint < 10; ++hint) {
Expand Down
15 changes: 12 additions & 3 deletions unit_test/sparse/Test_Sparse_gauss_seidel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,17 +179,25 @@ void test_gauss_seidel(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_

lno_t nv = input_mat.numRows();

//KokkosKernels::Impl::print_1Dview(input_mat.graph.row_map);
//KokkosKernels::Impl::print_1Dview(input_mat.graph.entries);
//KokkosKernels::Impl::print_1Dview(input_mat.values);

//scalar_view_t solution_x ("sol", nv);
//Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
//Kokkos::fill_random(solution_x,g,Kokkos::Random_XorShift64_Pool<ExecutionSpace>::generator_type::MAX_URAND);

const scalar_view_t solution_x = create_x_vector<scalar_view_t>(nv);
scalar_view_t y_vector = create_y_vector(input_mat, solution_x);

#ifdef gauss_seidel_testmore
GSAlgorithm gs_algorithms[] ={GS_DEFAULT, GS_TEAM, GS_PERMUTED};

int apply_count = 3;
for (int ii = 0; ii < 3; ++ii){
#else
int apply_count = 1;
GSAlgorithm gs_algorithms[] ={GS_DEFAULT};
for (int ii = 0; ii < 1; ++ii){
#endif
GSAlgorithm gs_algorithm = gs_algorithms[ii];
scalar_view_t x_vector ("x vector", nv);
const scalar_t alpha = 1.0;
Expand All @@ -208,7 +216,8 @@ void test_gauss_seidel(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_


for (int is_symmetric_graph = 0; is_symmetric_graph < 2; ++is_symmetric_graph){
for (int apply_type = 0; apply_type < 3; ++apply_type){

for (int apply_type = 0; apply_type < apply_count; ++apply_type){
for (int skip_symbolic = 0; skip_symbolic < 2; ++skip_symbolic){
for (int skip_numeric = 0; skip_numeric < 2; ++skip_numeric){

Expand Down
2 changes: 1 addition & 1 deletion unit_test/sparse/Test_Sparse_replaceSumInto.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ namespace { // (anonymous)
typename CrsMatrixType::values_type val = A.values;
typename CrsMatrixType::values_type::HostMirror val_h = Kokkos::create_mirror_view (val);
Kokkos::deep_copy (val_h, val);

Kokkos::fence();
const LO numRows = A.numRows ();
bool success = true;
for (LO lclRow = 0; lclRow < numRows; ++lclRow) {
Expand Down
16 changes: 6 additions & 10 deletions unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,12 @@ namespace Test {

ordinal_type cols[numEntToModify];
scalar_type vals[numEntToModify];
//MD 08/2017 Note: Adding the initialization below,
//as otherwise this creates a warning that cols might be uninitialized.
for (ordinal_type k = 0; k < numEntToModify; ++k) {
cols[k] = 0;
vals[k] = 0;
}

// Indices A.numCols() - 1, A.numCols() - 2, ..., 0 always exist
// in the row, given how we construct the matrix. We put them
Expand Down Expand Up @@ -115,16 +121,6 @@ namespace Test {
// << std::endl;
}

//MD 08/2017 Note: Adding the initialization below,
//as otherwise this creates a warning that cols might be uninitialized.
for (ordinal_type k = ncol; k < numEntToModify; ++k) {
if (k - 1 >= 0){
cols[k] = cols[0];
}
else {
cols[k] = cols[0];
}
}

ordinal_type lclNumModified = 0;
if (replace_) {
Expand Down
37 changes: 31 additions & 6 deletions unit_test/sparse/Test_Sparse_spgemm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,9 +191,18 @@ bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2){
lno_nnz_view_t h_ent2 (Kokkos::ViewAllocateWithoutInitializing("e1"), nentries2);
scalar_view_t h_vals2 (Kokkos::ViewAllocateWithoutInitializing("v1"), nvals2);

if (nrows1 != nrows2) return false;
if (nentries1 != nentries2) return false;
if (nvals1 != nvals2) return false;
if (nrows1 != nrows2) {
std::cout << "nrows1:" << nrows1 << " nrows2:" << nrows2 << std::endl;
return false;
}
if (nentries1 != nentries2) {
std::cout << "nentries1:" << nentries1 << " nentries2:" << nentries2 << std::endl;
return false;
}
if (nvals1 != nvals2) {
std::cout << "nvals1:" << nvals1 << " nvals2:" << nvals2 << std::endl;
return false;
}

KokkosKernels::Impl::kk_sort_graph
<typename graph_t::row_map_type,
Expand All @@ -213,13 +222,23 @@ bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2){
typename device::execution_space>(output_mat1.graph.row_map, output_mat2.graph.row_map, 0);
//KokkosKernels::Impl::kk_print_1Dview(output_mat2.graph.row_map);

if (!is_identical) return false;
if (!is_identical) {
std::cout << "rowmaps are different." << std::endl;
KokkosKernels::Impl::kk_print_1Dview(output_mat1.graph.row_map);
KokkosKernels::Impl::kk_print_1Dview(output_mat2.graph.row_map);
return false;
}

is_identical = KokkosKernels::Impl::kk_is_identical_view
<lno_nnz_view_t, lno_nnz_view_t, typename lno_nnz_view_t::value_type,
typename device::execution_space>(h_ent1, h_ent2, 0 );

if (!is_identical) return false;
if (!is_identical) {
std::cout << "entries are different." << std::endl;
KokkosKernels::Impl::kk_print_1Dview(h_ent1);
KokkosKernels::Impl::kk_print_1Dview(h_ent2);
return false;
}


typedef typename Kokkos::Details::ArithTraits<typename scalar_view_t::non_const_value_type>::mag_type eps_type;
Expand All @@ -230,7 +249,13 @@ bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2){
<scalar_view_t, scalar_view_t, eps_type,
typename device::execution_space>(h_vals1, h_vals2, eps);

if (!is_identical) return false;
if (!is_identical) {
std::cout << "values are different." << std::endl;
KokkosKernels::Impl::kk_print_1Dview(output_mat1.values);
KokkosKernels::Impl::kk_print_1Dview(output_mat2.values);

return false;
}
return true;
}
}
Expand Down

0 comments on commit e323bcb

Please sign in to comment.