Skip to content

Commit

Permalink
[CUDA][Kernel] More CUDA kernels; Standardize the behavior for sorted…
Browse files Browse the repository at this point in the history
… COO/CSR (dmlc#1704)

* add cub; array cumsum

* CSRSliceRows

* fix warning

* operator << for ndarray; CSRSliceRows

* add CSRIsSorted

* add csr_sort

* inplace coosort and outplace csrsort

* WIP: coo is sorted

* mv cuda_utils

* add AllTrue utility

* csr sort

* coo sort

* coo2csr for sorted coo arrays

* CSRToCOO from sorted

* pass tests for the new kernel changes

* cannot use inplace sort

* lint

* try fix msvc error

* Fix g.copy_to and g.asnumbits; ToBlock no longer uses CSC

* stash

* revert some hack

* revert some changes

* address comments

* fix

* fix to_block unittest

* add todo note
  • Loading branch information
jermainewang authored Jun 28, 2020
1 parent da8632c commit 870da74
Show file tree
Hide file tree
Showing 59 changed files with 1,364 additions and 426 deletions.
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
[submodule "third_party/METIS"]
path = third_party/METIS
url = https://github.com/KarypisLab/METIS.git
[submodule "third_party/cub"]
path = third_party/cub
url = https://github.com/NVlabs/cub.git
branch = 1.8.0
[submodule "third_party/phmap"]
path = third_party/phmap
url = https://github.com/greg7mdp/parallel-hashmap.git
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ include_directories("third_party/METIS/include/")
include_directories("third_party/dmlc-core/include")
include_directories("third_party/minigun/minigun")
include_directories("third_party/minigun/third_party/moderngpu/src")
include_directories("third_party/cub/")
include_directories("third_party/phmap/")

# initial variables
set(DGL_LINKER_LIBS "")
Expand Down
33 changes: 32 additions & 1 deletion include/dgl/aten/array_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <utility>
#include <vector>
#include <tuple>
#include <string>
#include "./types.h"

namespace dgl {
Expand Down Expand Up @@ -131,9 +132,18 @@ IdArray HStack(IdArray arr1, IdArray arr2);
* \tparam ValueType The type of return value.
*/
template<typename ValueType>
ValueType IndexSelect(NDArray array, uint64_t index);
ValueType IndexSelect(NDArray array, int64_t index);

/*!
* \brief Return the data under the index. In numpy notation, A[I]
*/
NDArray IndexSelect(NDArray array, IdArray index);

/*!
* \brief Return the data from `start` (inclusive) to `end` (exclusive).
*/
NDArray IndexSelect(NDArray array, int64_t start, int64_t end);

/*!
* \brief Permute the elements of an array according to given indices.
*
Expand Down Expand Up @@ -238,6 +248,27 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, ValueType pad_value);
*/
std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths);

/*!
* \brief Return the cumulative summation (or inclusive sum) of the input array.
*
* The first element out[0] is equal to the first element of the input array
* array[0]. The remaining elements are defined recursively, out[i] = out[i-1] + array[i].
* Hence, the result array length is the same as the input array length.
*
* If prepend_zero is true, then the first element is zero and the result array
* length is the input array length plus one. This is useful for creating
* an indptr array over a count array.
*
* \param array The 1D input array.
* \param prepend_zero If true, the result starts with a zero and is one element longer.
* \return Array after cumsum.
*/
IdArray CumSum(IdArray array, bool prepend_zero = false);

/*!
* \brief Return a string that prints out some debug information.
*/
std::string ToDebugString(NDArray array);

// inline implementations
template <typename T>
IdArray VecToIdArray(const std::vector<T>& vec,
Expand Down
64 changes: 63 additions & 1 deletion include/dgl/aten/coo.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,16 @@ struct COOMatrix {
CHECK_NO_OVERFLOW(row->dtype, num_rows);
CHECK_NO_OVERFLOW(row->dtype, num_cols);
}

/*! \brief Return a copy of this matrix on the given device context. */
inline COOMatrix CopyTo(const DLContext& ctx) const {
  // The matrix already lives on the requested context: hand back this matrix as-is.
  if (ctx == row->ctx) {
    return *this;
  }
  // A null data array is passed through unchanged; otherwise it is copied
  // to the target context together with the row and col arrays.
  const auto data_on_ctx = aten::IsNullArray(data) ? data : data.CopyTo(ctx);
  return COOMatrix(num_rows, num_cols,
                   row.CopyTo(ctx), col.CopyTo(ctx),
                   data_on_ctx,
                   row_sorted, col_sorted);
}
};

///////////////////////// COO routines //////////////////////////
Expand All @@ -141,6 +151,17 @@ inline bool COOHasData(COOMatrix csr) {
return !IsNullArray(csr.data);
}

/*!
* \brief Check whether the COO is sorted.
*
* It returns two flags: one for whether the row is sorted;
* the other for whether the columns of each row are sorted
* if the first flag is true.
*
* Complexity: O(NNZ)
*/
std::pair<bool, bool> COOIsSorted(COOMatrix coo);

/*! \brief Get data. The return type is an ndarray due to possible duplicate entries. */
runtime::NDArray COOGetData(COOMatrix , int64_t row, int64_t col);

Expand All @@ -161,6 +182,20 @@ COOMatrix COOTranspose(COOMatrix coo);
* the result CSR matrix stores a shuffle index for how the entries
* will be reordered in CSR. The i^th entry in the result CSR corresponds
* to the CSR.data[i] th entry in the input COO.
*
* Conversion complexity: O(nnz)
*
* - The function first checks whether the input COO matrix is sorted
* using a linear scan.
* - If the COO matrix is row sorted, the conversion can be done very
* efficiently in a sequential scan. The result indices and data arrays
* are directly equal to the column and data arrays from the input.
* - If the COO matrix is further column sorted, the result CSR is
* also column sorted.
* - Otherwise, the conversion is more costly but still is O(nnz).
*
* \param coo Input COO matrix.
* \return CSR matrix.
*/
CSRMatrix COOToCSR(COOMatrix coo);

Expand Down Expand Up @@ -195,18 +230,45 @@ bool COOHasDuplicate(COOMatrix coo);
*/
std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);

/*!
* \brief Sort the indices of a COO matrix in-place.
*
* The function sorts row indices in ascending order. If sort_column is true,
* col indices are sorted in ascending order too. After sorting, the data array of the matrix
* stores the shuffled index which could be used to fetch edge data.
*
* Complexity: O(N*log(N)) time and O(1) space, where N is the number of nonzeros.
* TODO(minjie): The time complexity could be improved to O(N) by using a O(N) space.
*
* \param mat The coo matrix to sort.
* \param sort_column True if column index should be sorted too.
*/
void COOSort_(COOMatrix* mat, bool sort_column = false);

/*!
* \brief Sort the indices of a COO matrix.
*
* The function sorts row indices in ascending order. If sort_column is true,
* col indices are sorted in ascending order too. The data array of the returned COOMatrix
* stores the shuffled index which could be used to fetch edge data.
*
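* Complexity: O(N*log(N)) time, where N is the number of nonzeros. Unlike COOSort_,
* the row, col and data arrays are cloned before sorting, so O(N) extra space is used
* unless the matrix already satisfies the requested ordering.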
* TODO(minjie): The time complexity could be improved to O(N) by using a O(N) space.
*
* \param mat The input coo matrix
* \param sort_column True if column index should be sorted too.
* \return COO matrix with index sorted.
*/
COOMatrix COOSort(COOMatrix mat, bool sort_column = false);
inline COOMatrix COOSort(COOMatrix mat, bool sort_column = false) {
  // Return the input untouched when its sorted flags already satisfy the request:
  // either only row order is wanted and rows are sorted, or columns are sorted.
  const bool row_order_suffices = mat.row_sorted && !sort_column;
  if (row_order_suffices || mat.col_sorted) {
    return mat;
  }
  // Clone the index arrays (and the data array, when present) so that the
  // in-place sort below operates on a copy.
  auto data_copy = COOHasData(mat) ? mat.data.Clone() : mat.data;
  COOMatrix sorted_mat(mat.num_rows, mat.num_cols,
                       mat.row.Clone(), mat.col.Clone(),
                       data_copy,
                       mat.row_sorted, mat.col_sorted);
  COOSort_(&sorted_mat, sort_column);
  return sorted_mat;
}

/*!
* \brief Remove entries from COO matrix by entry indices (data indices)
Expand Down
56 changes: 52 additions & 4 deletions include/dgl/aten/csr.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,17 @@ struct CSRMatrix {
}
CHECK_NO_OVERFLOW(indptr->dtype, num_rows);
CHECK_NO_OVERFLOW(indptr->dtype, num_cols);
CHECK_EQ(indptr->shape[0], num_rows + 1);
}

/*! \brief Return a copy of this matrix on the given device context. */
inline CSRMatrix CopyTo(const DLContext& ctx) const {
  // The matrix already lives on the requested context: hand back this matrix as-is.
  if (ctx == indptr->ctx) {
    return *this;
  }
  // A null data array is passed through unchanged; otherwise it is copied
  // to the target context together with the indptr and indices arrays.
  const auto data_on_ctx = aten::IsNullArray(data) ? data : data.CopyTo(ctx);
  return CSRMatrix(num_rows, num_cols,
                   indptr.CopyTo(ctx), indices.CopyTo(ctx),
                   data_on_ctx,
                   sorted);
}
};

Expand Down Expand Up @@ -134,6 +145,9 @@ inline bool CSRHasData(CSRMatrix csr) {
return !IsNullArray(csr.data);
}

/*! \brief Whether the column indices of each row are sorted. */
bool CSRIsSorted(CSRMatrix csr);

/*! \brief Get data. The return type is an ndarray due to possible duplicate entries. */
runtime::NDArray CSRGetData(CSRMatrix , int64_t row, int64_t col);
/*!
Expand All @@ -155,6 +169,15 @@ CSRMatrix CSRTranspose(CSRMatrix csr);

/*!
* \brief Convert CSR matrix to COO matrix.
*
* Complexity: O(nnz)
*
* - If data_as_order is false, the column and data arrays of the
* result COO are equal to the indices and data arrays of the
* input CSR. The result COO is also row sorted.
* - If the input CSR is further sorted, the result COO is also
* column sorted.
*
* \param csr Input csr matrix
* \param data_as_order If true, the data array in the input csr matrix contains the order
* by which the resulting COO tuples are stored. In this case, the
Expand All @@ -166,9 +189,8 @@ COOMatrix CSRToCOO(CSRMatrix csr, bool data_as_order);

/*!
* \brief Slice rows of the given matrix and return.
* \param csr CSR matrix
* \param start Start row id (inclusive)
* \param end End row id (exclusive)
*
* The sliced row IDs are relabeled to starting from zero.
*
* Examples:
* num_rows = 4
Expand All @@ -182,6 +204,11 @@ COOMatrix CSRToCOO(CSRMatrix csr, bool data_as_order);
* num_cols = 4
* indptr = [0, 1, 1]
* indices = [2]
*
* \param csr CSR matrix
* \param start Start row id (inclusive)
* \param end End row id (exclusive)
* \return sliced rows stored in a CSR matrix
*/
CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end);
CSRMatrix CSRSliceRows(CSRMatrix csr, runtime::NDArray rows);
Expand All @@ -192,6 +219,8 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, runtime::NDArray rows);
* In numpy notation, given matrix M, row index array I, col index array J
* This function returns the submatrix M[I, J].
*
* The sliced row and column IDs are relabeled to starting from zero.
*
* \param csr The input csr matrix
* \param rows The row index to select
* \param cols The col index to select
Expand All @@ -203,7 +232,10 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
bool CSRHasDuplicate(CSRMatrix csr);

/*!
* \brief Sort the column index at each row in the ascending order.
* \brief Sort the column index at each row in ascending order in-place.
*
* Only the indices and data arrays (if available) will be mutated. The indptr array
* stays the same.
*
* Examples:
* num_rows = 4
Expand All @@ -218,6 +250,22 @@ bool CSRHasDuplicate(CSRMatrix csr);
*/
void CSRSort_(CSRMatrix* csr);

/*!
* \brief Sort the column index at each row in ascending order.
*
* Return a new CSR matrix with sorted column indices and data arrays.
*/
inline CSRMatrix CSRSort(CSRMatrix csr) {
  if (!csr.sorted) {
    // The indptr array is passed through as-is, while indices (and data, when
    // present) are cloned so that the in-place sort works on copies.
    auto data_copy = CSRHasData(csr) ? csr.data.Clone() : csr.data;
    CSRMatrix sorted_csr(csr.num_rows, csr.num_cols,
                         csr.indptr, csr.indices.Clone(),
                         data_copy,
                         csr.sorted);
    CSRSort_(&sorted_csr);
    return sorted_csr;
  }
  // Already flagged as sorted: return the input matrix unchanged.
  return csr;
}

/*!
* \brief Reorder the rows and columns according to the new row and column order.
* \param csr The input csr matrix.
Expand Down
4 changes: 4 additions & 0 deletions include/dgl/aten/macro.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,4 +252,8 @@
CHECK_LE((val), 0x7FFFFFFFL) << "int32 overflow for argument " << (#val) << "."; \
} while (0);

/*!
 * \brief Check that VAR has exactly one dimension and that IS_INT32(VAR) or
 *        IS_INT64(VAR) holds; otherwise the CHECK fails with a message that
 *        names the offending argument.
 *
 * NOTE(review): IS_INT32/IS_INT64 are defined outside this hunk; presumably
 * they test the array's integer dtype -- confirm against their definitions.
 */
#define CHECK_IS_ID_ARRAY(VAR) \
CHECK((VAR)->ndim == 1 && (IS_INT32(VAR) || IS_INT64(VAR))) \
<< "Expected argument " << (#VAR) << " to be an 1D integer array.";

#endif // DGL_ATEN_MACRO_H_
1 change: 1 addition & 0 deletions include/dgl/graph_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <vector>
#include <utility>
#include <algorithm>
#include <memory>

#include "./runtime/object.h"
#include "array.h"
Expand Down
1 change: 1 addition & 0 deletions include/dgl/immutable_graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <utility>
#include <tuple>
#include <algorithm>
#include <memory>
#include "runtime/ndarray.h"
#include "graph_interface.h"
#include "lazy.h"
Expand Down
1 change: 1 addition & 0 deletions include/dgl/nodeflow.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <vector>
#include <string>
#include <memory>

#include "./runtime/object.h"
#include "graph_interface.h"
Expand Down
13 changes: 13 additions & 0 deletions include/dgl/runtime/ndarray.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <string>
#include <utility>
#include <vector>
#include <memory>

#include "c_runtime_api.h"
#include "dlpack/dlpack.h"
Expand Down Expand Up @@ -157,6 +158,10 @@ class NDArray {
* \return The array under another context.
*/
inline NDArray CopyTo(const DLContext& ctx) const;
/*!
* \brief Return a new array with a copy of the content.
*/
inline NDArray Clone() const;
/*!
* \brief Load NDArray from stream
* \param stream The input data stream
Expand Down Expand Up @@ -410,6 +415,12 @@ inline NDArray NDArray::CopyTo(const DLContext& ctx) const {
return ret;
}

inline NDArray NDArray::Clone() const {
  // Cloning requires an underlying array to exist.
  CHECK(data_ != nullptr);
  // Copy to the array's own context to obtain a new array with the same content.
  const DLContext& own_ctx = (*this)->ctx;
  return CopyTo(own_ctx);
}

inline int NDArray::use_count() const {
if (data_ == nullptr) return 0;
return data_->ref_counter_.load(std::memory_order_relaxed);
Expand Down Expand Up @@ -627,6 +638,8 @@ dgl::runtime::NDArray operator <= (int64_t lhs, const dgl::runtime::NDArray& a2)
dgl::runtime::NDArray operator == (int64_t lhs, const dgl::runtime::NDArray& a2);
dgl::runtime::NDArray operator != (int64_t lhs, const dgl::runtime::NDArray& a2);

std::ostream& operator << (std::ostream& os, dgl::runtime::NDArray array);

///////////////// Operator overloading for DLDataType /////////////////

/*! \brief Check whether two data types are the same.*/
Expand Down
1 change: 1 addition & 0 deletions include/dgl/runtime/packed_func.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <string>
#include <limits>
#include <memory>
#include <utility>
#include <type_traits>
#include "c_runtime_api.h"
#include "module.h"
Expand Down
1 change: 1 addition & 0 deletions include/dgl/runtime/smart_ptr_serializer.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <dgl/graph_serializer.h>
#include <dmlc/io.h>
#include <dmlc/serializer.h>
#include <memory>

namespace dmlc {
namespace serializer {
Expand Down
Loading

0 comments on commit 870da74

Please sign in to comment.