Skip to content

Commit

Permalink
[Kernel] Matrix toSimple (dmlc#1756)
Browse files Browse the repository at this point in the history
* Matrix to simple

* Pass test

* new impl

* Fix test

* Fix lint

* trigger

* upd

* Fix comments

Co-authored-by: Ubuntu <[email protected]>
  • Loading branch information
classicsong and Ubuntu authored Jul 9, 2020
1 parent 167216a commit 29e6c93
Show file tree
Hide file tree
Showing 11 changed files with 442 additions and 55 deletions.
25 changes: 25 additions & 0 deletions include/dgl/aten/coo.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <dmlc/serializer.h>
#include <vector>
#include <utility>
#include <tuple>
#include "./types.h"
#include "./array_ops.h"
#include "./spmat.h"
Expand Down Expand Up @@ -406,6 +407,30 @@ COOMatrix COORowWiseTopk(
COOMatrix DisjointUnionCoo(
const std::vector<COOMatrix>& coos);

/*!
 * \brief Convert a COOMatrix that may contain duplicate entries into a simple
 *        one in which every (row, col) pair appears at most once.
 *
 * Reading the example below, each entry of A is the number of edges stored
 * for that (row, col) pair and each entry of B is 1 if at least one edge exists:
 *
 *   A = [[0, 0, 0],
 *        [3, 0, 2],
 *        [1, 1, 0],
 *        [0, 0, 4]]
 *
 *   B, cnt, edge_map = COOToSimple(A)
 *
 *   B = [[0, 0, 0],
 *        [1, 0, 1],
 *        [1, 1, 0],
 *        [0, 0, 1]]
 *   cnt = [3, 2, 1, 1, 4]
 *   edge_map = [0, 0, 0, 1, 1, 2, 3, 4, 4, 4, 4]
 *
 * cnt has one element per edge of B; edge_map has one element per edge of A.
 * (The edge_map shown presumably assumes the edges of A are numbered in
 * row-major order -- TODO confirm.)
 *
 * \param coo The input COO matrix.
 * \return A tuple of
 *         - the simplified COOMatrix,
 *         - the count recording the number of duplicated edges from the
 *           original graph,
 *         - the edge mapping from the edge IDs of original graph to those of
 *           the returned graph.
 */
std::tuple<COOMatrix, IdArray, IdArray> COOToSimple(const COOMatrix& coo);

/*!
* \brief Split a COOMatrix into multiple disjoint components.
*
Expand Down
25 changes: 25 additions & 0 deletions include/dgl/aten/csr.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <dmlc/io.h>
#include <dmlc/serializer.h>
#include <vector>
#include <tuple>
#include "./types.h"
#include "./array_ops.h"
#include "./spmat.h"
Expand Down Expand Up @@ -401,6 +402,30 @@ COOMatrix CSRRowWiseTopk(
CSRMatrix DisjointUnionCsr(
const std::vector<CSRMatrix>& csrs);

/*!
 * \brief Convert a CSRMatrix that may contain duplicate entries into a simple
 *        one in which every (row, col) pair appears at most once.
 *
 * Reading the example below, each entry of A is the number of edges stored
 * for that (row, col) pair and each entry of B is 1 if at least one edge exists:
 *
 *   A = [[0, 0, 0],
 *        [3, 0, 2],
 *        [1, 1, 0],
 *        [0, 0, 4]]
 *
 *   B, cnt, edge_map = CSRToSimple(A)
 *
 *   B = [[0, 0, 0],
 *        [1, 0, 1],
 *        [1, 1, 0],
 *        [0, 0, 1]]
 *   cnt = [3, 2, 1, 1, 4]
 *   edge_map = [0, 0, 0, 1, 1, 2, 3, 4, 4, 4, 4]
 *
 * cnt has one element per edge of B; edge_map has one element per edge of A.
 * (The edge_map shown presumably assumes the edges of A are numbered in
 * row-major order -- TODO confirm.)
 *
 * \param csr The input CSR matrix.
 * \return A tuple of
 *         - the simplified CSRMatrix,
 *         - the count recording the number of duplicated edges from the
 *           original graph,
 *         - the edge mapping from the edge IDs of original graph to those of
 *           the returned graph.
 */
std::tuple<CSRMatrix, IdArray, IdArray> CSRToSimple(const CSRMatrix& csr);

/*!
* \brief Split a CSRMatrix into multiple disjoint components.
*
Expand Down
4 changes: 3 additions & 1 deletion python/dgl/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -1214,6 +1214,8 @@ def to_simple(g, return_counts='count', writeback_mapping=None):
This function does not preserve node and edge features.
TODO(xiangsx): Don't save writeback_mapping into g, but put it into return value.
Parameters
----------
g : DGLHeteroGraph
Expand All @@ -1234,7 +1236,7 @@ def to_simple(g, return_counts='count', writeback_mapping=None):
Examples
--------
Consider the following graph
>>> g = dgl.graph([(0, 1), (1, 3), (2, 2), (1, 3), (1, 4), (1, 4)])
>>> g = dgl.graph(([0, 1, 2, 1, 1, 1], [1, 3, 2, 3, 4, 4]))
>>> sg = dgl.to_simple(g, return_counts='weights', writeback_mapping='new_eid')
The returned graph would have duplicate edges connecting (1, 3) and (1, 4) removed:
Expand Down
57 changes: 57 additions & 0 deletions src/array/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,17 @@ COOMatrix CSRRowWiseTopk(
return ret;
}

std::tuple<CSRMatrix, IdArray, IdArray>
CSRToSimple(const CSRMatrix& csr) {
std::tuple<CSRMatrix, IdArray, IdArray> ret;

CSRMatrix sorted_csr = (CSRIsSorted(csr)) ? csr : CSRSort(csr);
ATEN_CSR_SWITCH(csr, XPU, IdType, "CSRToSimple", {
ret = impl::CSRToSimple<XPU, IdType>(sorted_csr);
});
return ret;
}

///////////////////////// COO routines //////////////////////////

bool COOIsNonZero(COOMatrix coo, int64_t row, int64_t col) {
Expand Down Expand Up @@ -678,6 +689,52 @@ std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo) {
return ret;
}

std::tuple<COOMatrix, IdArray, IdArray>
COOToSimple(const COOMatrix& coo) {
// coo column sorted
const COOMatrix sorted_coo = COOSort(coo, true);
const IdArray eids_shuffled = COOHasData(sorted_coo) ?
sorted_coo.data :
Range(0, sorted_coo.row->shape[0], sorted_coo.row->dtype.bits, sorted_coo.row->ctx);
const auto &coalesced_result = COOCoalesce(sorted_coo);
const COOMatrix &coalesced_adj = coalesced_result.first;
const IdArray &count = coalesced_result.second;

/*
* eids_shuffled actually already contains the mapping from old edge space to the
* new one:
*
* * eids_shuffled[0:count[0]] indicates the original edge IDs that coalesced into new
* edge #0.
* * eids_shuffled[count[0]:count[0] + count[1]] indicates those that coalesced into
* new edge #1.
* * eids_shuffled[count[0] + count[1]:count[0] + count[1] + count[2]] indicates those
* that coalesced into new edge #2.
* * etc.
*
* Here, we need to translate eids_shuffled to an array "eids_remapped" such that
* eids_remapped[i] indicates the new edge ID the old edge #i is mapped to. The
* translation can simply be achieved by (in numpy code):
*
* new_eid_for_eids_shuffled = np.range(len(count)).repeat(count)
* eids_remapped = np.zeros_like(new_eid_for_eids_shuffled)
* eids_remapped[eids_shuffled] = new_eid_for_eids_shuffled
*/
const IdArray new_eids = Range(
0, coalesced_adj.row->shape[0], coalesced_adj.row->dtype.bits, coalesced_adj.row->ctx);
const IdArray eids_remapped = Scatter(Repeat(new_eids, count), eids_shuffled);

COOMatrix ret = COOMatrix(
coalesced_adj.num_rows,
coalesced_adj.num_cols,
coalesced_adj.row,
coalesced_adj.col,
NullArray(),
true,
true);
return std::make_tuple(ret, count, eids_remapped);
}

///////////////////////// Graph Traverse routines //////////////////////////
Frontiers BFSNodesFrontiers(const CSRMatrix& csr, IdArray source) {
Frontiers ret;
Expand Down
2 changes: 2 additions & 0 deletions src/array/array_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ template <DLDeviceType XPU, typename IdType, typename DType>
COOMatrix CSRRowWiseTopk(
CSRMatrix mat, IdArray rows, int64_t k, NDArray weight, bool ascending);

/*!
 * \brief Device- and ID-type-specific implementation of CSRToSimple.
 * \tparam XPU Device type the implementation is instantiated for.
 * \tparam IdType Integer type used for the matrix index arrays.
 * \param csr The input CSR matrix (taken by value).
 * \return A tuple of a CSRMatrix and two IdArrays.
 */
template <DLDeviceType XPU, typename IdType>
std::tuple<CSRMatrix, IdArray, IdArray> CSRToSimple(CSRMatrix csr);

///////////////////////////////////////////////////////////////////////////////////////////

Expand Down
75 changes: 75 additions & 0 deletions src/array/cpu/csr_to_simple.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*!
* Copyright (c) 2020 by Contributors
* \file array/cpu/csr_to_simple.cc
* \brief CSR to simple: remove duplicate entries from a CSR matrix
*/
#include <dgl/array.h>
#include <numeric>
#include <algorithm>
#include <vector>

namespace dgl {
namespace aten {
namespace impl {

/*!
 * \brief CPU kernel that collapses duplicate column indices within each row.
 *
 * The matrix is sorted first if it is not flagged as sorted, so that equal
 * column indices of a row sit next to each other; each run of equal indices
 * then becomes a single output entry.
 *
 * \return (simplified CSR without a data array,
 *          length of each run, i.e. how many input entries each output entry
 *          replaces,
 *          for every input edge ID the output entry it was merged into)
 */
template <DLDeviceType XPU, typename IdType>
std::tuple<CSRMatrix, IdArray, IdArray> CSRToSimple(CSRMatrix csr) {
  if (!csr.sorted)
    csr = CSRSort(csr);

  const IdType* in_indptr = static_cast<IdType*>(csr.indptr->data);
  const IdType* in_indices = static_cast<IdType*>(csr.indices->data);
  const int64_t indptr_len = csr.indptr->shape[0];

  std::vector<IdType> out_indptr;
  std::vector<IdType> out_indices;
  std::vector<IdType> run_lengths;
  out_indptr.resize(indptr_len);
  out_indptr[0] = 0;

  for (int64_t row = 0; row + 1 < indptr_len; ++row) {
    const int64_t row_begin = in_indptr[row];
    const int64_t row_end = in_indptr[row + 1];

    // Empty row: the output row is empty as well.
    if (row_begin == row_end) {
      out_indptr[row + 1] = out_indptr[row];
      continue;
    }

    // Walk the row one run of identical column indices at a time.
    int64_t unique_in_row = 0;
    int64_t pos = row_begin;
    do {
      int64_t run = 1;
      while (pos + run < row_end &&
             in_indices[pos + run - 1] == in_indices[pos + run]) {
        ++run;
      }
      out_indices.push_back(in_indices[pos]);
      run_lengths.push_back(run);
      ++unique_in_row;
      pos += run;
    } while (pos < row_end);

    out_indptr[row + 1] = out_indptr[row] + unique_in_row;
  }

  // No data array on the result; the trailing `true` is presumably the
  // "sorted" flag (TODO confirm against the CSRMatrix constructor).
  CSRMatrix simple_csr = CSRMatrix(
      csr.num_rows,
      csr.num_cols,
      IdArray::FromVector(out_indptr),
      IdArray::FromVector(out_indices),
      NullArray(),
      true);

  const IdArray edge_count = IdArray::FromVector(run_lengths);
  const IdArray new_eids = Range(
      0, simple_csr.indices->shape[0], sizeof(IdType) * 8, csr.indptr->ctx);

  // Without a data array the position of an entry is its edge ID, so the
  // repeated new IDs are already the mapping; otherwise route each value
  // through csr.data to index the mapping by original edge ID.
  IdArray eids_remapped;
  if (CSRHasData(csr)) {
    eids_remapped = Scatter(Repeat(new_eids, edge_count), csr.data);
  } else {
    eids_remapped = Repeat(new_eids, edge_count);
  }

  return std::make_tuple(simple_csr, edge_count, eids_remapped);
}

template std::tuple<CSRMatrix, IdArray, IdArray> CSRToSimple<kDLCPU, int32_t>(CSRMatrix);
template std::tuple<CSRMatrix, IdArray, IdArray> CSRToSimple<kDLCPU, int64_t>(CSRMatrix);

} // namespace impl
} // namespace aten
} // namespace dgl
44 changes: 4 additions & 40 deletions src/graph/transform/to_simple.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <dgl/packed_func_ext.h>
#include <vector>
#include <utility>
#include "../heterograph.h"
#include "../unit_graph.h"
#include "../../c_api_common.h"

Expand All @@ -24,51 +25,14 @@ std::tuple<HeteroGraphPtr, std::vector<IdArray>, std::vector<IdArray>>
ToSimpleGraph(const HeteroGraphPtr graph) {
const int64_t num_etypes = graph->NumEdgeTypes();
const auto metagraph = graph->meta_graph();
const auto &ugs = std::dynamic_pointer_cast<HeteroGraph>(graph)->relation_graphs();

std::vector<IdArray> counts(num_etypes), edge_maps(num_etypes);
std::vector<HeteroGraphPtr> rel_graphs(num_etypes);

for (int64_t etype = 0; etype < num_etypes; ++etype) {
const auto vtypes = graph->GetEndpointTypes(etype);
const COOMatrix adj = graph->GetCOOMatrix(etype);
const COOMatrix sorted_adj = COOSort(adj, true);
const IdArray eids_shuffled = sorted_adj.data;
const auto &coalesced_result = COOCoalesce(sorted_adj);
const COOMatrix &coalesced_adj = coalesced_result.first;
const IdArray &count = coalesced_result.second;

/*
* eids_shuffled actually already contains the mapping from old edge space to the
* new one:
*
* * eids_shuffled[0:count[0]] indicates the original edge IDs that coalesced into new
* edge #0.
* * eids_shuffled[count[0]:count[0] + count[1]] indicates those that coalesced into
* new edge #1.
* * eids_shuffled[count[0] + count[1]:count[0] + count[1] + count[2]] indicates those
* that coalesced into new edge #2.
* * etc.
*
* Here, we need to translate eids_shuffled to an array "eids_remapped" such that
* eids_remapped[i] indicates the new edge ID the old edge #i is mapped to. The
* translation can simply be achieved by (in numpy code):
*
* new_eid_for_eids_shuffled = np.range(len(count)).repeat(count)
* eids_remapped = np.zeros_like(new_eid_for_eids_shuffled)
* eids_remapped[eids_shuffled] = new_eid_for_eids_shuffled
*/
const IdArray new_eids = Range(
0, coalesced_adj.row->shape[0], coalesced_adj.row->dtype.bits, coalesced_adj.row->ctx);
const IdArray eids_remapped = Scatter(Repeat(new_eids, count), eids_shuffled);

edge_maps[etype] = eids_remapped;
counts[etype] = count;
rel_graphs[etype] = UnitGraph::CreateFromCOO(
vtypes.first == vtypes.second ? 1 : 2,
coalesced_adj.num_rows,
coalesced_adj.num_cols,
coalesced_adj.row,
coalesced_adj.col);
const auto result = ugs[etype]->ToSimple();
std::tie(rel_graphs[etype], counts[etype], edge_maps[etype]) = result;
}

const HeteroGraphPtr result = CreateHeteroGraph(
Expand Down
40 changes: 40 additions & 0 deletions src/graph/unit_graph.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1571,4 +1571,44 @@ UnitGraphPtr UnitGraph::Reverse() const {
return UnitGraphPtr(new UnitGraph(meta_graph(), new_incsr, new_outcsr, new_coo));
}

/*!
 * \brief Build a graph with the same meta graph in which duplicate edges
 *        are merged.
 *
 * Exactly one sparse format, the one returned by
 * SelectFormat(SparseFormat::kAny), is simplified; the other two formats are
 * left as nullptr in the returned graph.
 *
 * \return A tuple of
 *         - the simplified graph,
 *         - the count array produced by the underlying COOToSimple /
 *           CSRToSimple call,
 *         - the edge mapping array produced by that same call.
 */
std::tuple<UnitGraphPtr, IdArray, IdArray>
UnitGraph::ToSimple() const {
  CSRPtr new_incsr = nullptr, new_outcsr = nullptr;
  COOPtr new_coo = nullptr;
  IdArray count;
  IdArray edge_map;

  const auto avail_fmt = SelectFormat(SparseFormat::kAny);
  switch (avail_fmt) {
    case SparseFormat::kCOO: {
      auto ret = aten::COOToSimple(coo_->adj());
      count = std::get<1>(ret);
      edge_map = std::get<2>(ret);
      new_coo = COOPtr(new COO(coo_->meta_graph(), std::get<0>(ret)));
      break;
    }
    // NOTE(review): kCSR is served from in_csr_ and kCSC from out_csr_ below.
    // Confirm this pairing matches how SelectFormat() maps formats to
    // in_csr_/out_csr_; if it is reversed, the pointer dereferenced in these
    // branches may be null.
    case SparseFormat::kCSR: {
      auto ret = aten::CSRToSimple(in_csr_->adj());
      count = std::get<1>(ret);
      edge_map = std::get<2>(ret);
      new_incsr = CSRPtr(new CSR(in_csr_->meta_graph(), std::get<0>(ret)));
      break;
    }
    case SparseFormat::kCSC: {
      auto ret = aten::CSRToSimple(out_csr_->adj());
      count = std::get<1>(ret);
      edge_map = std::get<2>(ret);
      new_outcsr = CSRPtr(new CSR(out_csr_->meta_graph(), std::get<0>(ret)));
      break;
    }
    default:
      LOG(FATAL) << "At least one of COO, CSR or CSC adj should exist.";
      break;
  }

  return std::make_tuple(
      UnitGraphPtr(new UnitGraph(meta_graph(), new_incsr, new_outcsr, new_coo)),
      count,
      edge_map);
}

} // namespace dgl
8 changes: 8 additions & 0 deletions src/graph/unit_graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <string>
#include <vector>
#include <memory>
#include <tuple>

#include "../c_api_common.h"

Expand Down Expand Up @@ -269,6 +270,13 @@ class UnitGraph : public BaseHeteroGraph {
/*! \return the reversed graph */
UnitGraphPtr Reverse() const;

/*!
 * \brief Create a simple (no multi-edge) version of this graph.
 * \return A tuple of
 *         - the simple (no-multi-edge) graph,
 *         - the count recording the number of duplicated edges from the
 *           original graph,
 *         - the edge mapping from the edge IDs of original graph to those of
 *           the returned graph.
 */
std::tuple<UnitGraphPtr, IdArray, IdArray>ToSimple() const;

private:
friend class Serializer;
friend class HeteroGraph;
Expand Down
6 changes: 3 additions & 3 deletions tests/compute/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,10 +721,10 @@ def test_cast():
# test_remove_self_loop()
# test_add_self_loop()
# test_partition_with_halo()
test_metis_partition()
# test_metis_partition()
# test_compact()
# test_to_simple()
test_to_simple("int32")
# test_in_subgraph("int32")
# test_out_subgraph()
test_to_block("int32")
# test_to_block("int32")
# test_remove_edges()
Loading

0 comments on commit 29e6c93

Please sign in to comment.