updated code

HipGraph · Oct 21, 2021 · 144aaf6 · 144aaf6
1 parent ec7f7d8
commit 144aaf6
Show file tree

Hide file tree

Showing 5 changed files with 74 additions and 28 deletions.
diff --git a/README.md b/README.md
@@ -1,13 +1,48 @@
-# MarkovGNNR
+# MarkovGNN
+MarkovGNN: Graph Neural Networks using Markov Diffusion. This repository is only for WWW2022 submission.
 
+## System requirements
+Users will need to install the following tools (CPU version).
+```
+PyTorch: 1.7.0
+PyTorch Geometric: 1.6.1
+PyTorch Sparse: 0.6.8
+PyTorch Scatter: 2.0.5
+PyTorch Cluster: 1.5.8
+PyTorch Spline Conv: 1.2.0
+NetworkX: 2.2
+scikit-learn: 0.23.2
+Matplotlib: 3.0.3
+```
 
 ## How to run
-A sample of command to run GNN models. More will be added.
+A list of sample commands to run the MarkovGCN models.
 ```
-python main.py --edgelist datasets/input2f/email.edgelist --label datasets/input2f/email.nodes.labels
-python main.py --edgelist datasets/input2f/usaairports.edgelist --label datasets/input2f/usaairports.nodes.labels --oneindexed 1
-python main.py --edgelist datasets/input2f/yeast.edgelist --label datasets/input2f/yeast.nodes.labels --oneindexed 1 --onelabeled 1
-python main.py --edgelist datasets/input3f/squirrel_edges.txt --label datasets/input3f/squirrel_labels.txt --feature datasets/input3f/squirrel_features.txt
+python main.py --edgelist datasets/input2f/email.edgelist --label datasets/input2f/email.nodes.labels --eps 0.005 --epoch 200 --alpha 0.1 --nlayers 3
+
+python main.py --edgelist datasets/input2f/usaairports.edgelist --label datasets/input2f/usaairports.nodes.labels --oneindexed 1 --epoch 200 --alpha 1.0 --eps 0.09 --lrate 0.01 --nlayers 4 --normrow 0 --inflate 1.5 --markov_agg
+
+python main.py --edgelist datasets/input2f/yeast.edgelist --label datasets/input2f/yeast.nodes.labels --oneindexed 1 --onelabeled 1 --eps 0.25 --epoch 200 --inflate 1.5 --lrate 0.05 --alpha 0.7 --markov_agg --nlayers 3
+
+python main.py --edgelist datasets/input3f/chameleon_edges.txt --label datasets/input3f/chameleon_labels.txt --feature datasets/input3f/chameleon_features.txt --epoch 200 --alpha 0.2 --nlayers 2 --eps 0.06 --inflate 1.8 --droprate 0.7 --markov_agg
+
+python main.py --edgelist datasets/input3f/squirrel_edges.txt --label datasets/input3f/squirrel_labels.txt --feature datasets/input3f/squirrel_features.txt --epoch 200 --eps 0.05 --droprate 0.25 --markov_agg --nlayers 6
 
+python main.py --eps 0.03 --droprate 0.85 --epoch 300 --alpha 0.05 --nlayers 2 --lrate 0.005 --inflate 1.8 --markov_agg
 
+python main.py --eps 0.03 --droprate 0.85 --epoch 300 --alpha 0.05 --nlayers 2 --lrate 0.001 --inflate 3.5 --markov_agg --dataset Citeseer
+
+python main.py --edgelist datasets/input3f/actor_edges.txt --label datasets/input3f/actor_labels.txt --feature datasets/input3f/actor_features.txt --epoch 200  --alpha 0.4 --markov_agg --nlayers 4
+
+python main.py --edgelist datasets/input3f/actor_edges.txt --label datasets/input3f/actor_labels.txt --feature datasets/input3f/actor_features.txt --epoch 200  --alpha 0.8 --markov_agg --nlayers 5
+```
+
+## Parameters
+There are several options to run the method which are outlined in the main.py file.
+```
+--markov_dense -> markov process uses dense matrix multiplication (sparse matrix multiplication is the default option)
+--markov_agg -> the i-th layer uses the markov matrix from the i-th iteration; combined with a higher threshold, this option gives a better runtime
+--use_gcn -> use vanilla GCN model
 ```
+
+Please create an issue if you face any problems running this method. We hope to respond anonymously.
diff --git a/main.py b/main.py
@@ -67,7 +67,7 @@ def helper(data, args):
                 (edge_index, edge_weight) = markov_process_disj_sparse(data, eps, inflate, nlayers, normrow == 1, args.debug == 1)
         if False:
             print("layer-wise edge shape", edge_index)
-        model = MarkovGCNR(ndim, nlayers, len(set(data.y.tolist())), data.x, edge_index, edge_weight, droprate, useleakyrelu==1, alpha)
+        model = MarkovGCNR(ndim, nlayers, len(set(data.y.tolist())), data.x, edge_index, edge_weight, droprate, useleakyrelu==1, alpha, args.addbias == 1)
     #define an optimizer
     optimizerdict = []
     for l in range(nlayers-1):
@@ -99,7 +99,8 @@ def helper(data, args):
     parser.add_argument('--droprate', default = 0.5, required = False, type = float, help = 'Dropout  Rate')
     parser.add_argument('--epoch', default = 100, required=False, type=int, help='Number of epoch.')
     parser.add_argument('--debug', default = 1, required=False, type=int, help='Disable debug mode.')
-
+    parser.add_argument('--addbias', default = 1, required=False, type=int, help='Add bias.')
+
     args = parser.parse_args()
     edgelistf = args.edgelist
     labelf = args.label

diff --git a/markov.py b/markov.py
@@ -7,8 +7,9 @@
 from torch_sparse import spspmm
 from torch_geometric.utils import add_remaining_self_loops, to_dense_adj, dense_to_sparse
 from torch_scatter import scatter_add
-from utils import computeHomophily, newEdges
+from utils import computeHomophily, newEdges, mixingCommunityScore
 
+debug_on = False
 
 def markov_normalization(edge_index, edge_weight, num_nodes, ntype = 'col'):
     if ntype == 'col':
@@ -59,8 +60,9 @@ def markov_process_agg_sparse(data, eps, inflate, nlayers, row_normalization = T
         # store layer-wise edges
         medge_index.append(ei)
         medge_weight.append(ew)
-        if debug:
+        if debug_on:
             print("layer ", i+1, "(after sparsification) edge_index size:", ei.shape, "homophily:", computeHomophily(data, ei))
+            print("Community Mixing Param:", mixingCommunityScore(data, ei), "New Edges:", newEdges(data, ei))
     if nlayers > len(medge_index):
         print("Use less number of layers for the given", eps, " threshold, maximum:", len(medge_index), "layers")
         sys.exit(1)
@@ -85,7 +87,7 @@ def markov_process_agg(data, eps, inflate, nlayers, row_normalization = True, ke
         A = torch.mm(A, A)
         A = torch.pow(A, inflate)
         (ei, ew) = dense_to_sparse(A)
-        if debug:
+        if debug_on:
             print("layer ", i+1, " (after mul and pow) edge_index size:", ei.shape)
         # normalization
         if row_normalization:
@@ -119,7 +121,7 @@ def markov_process_agg(data, eps, inflate, nlayers, row_normalization = True, ke
         else:
             edge_index2, edge_weight2 = markov_normalization(ei, ew, A.shape[0], 'col')
         A = to_dense_adj(edge_index = edge_index2, batch = None, edge_attr = edge_weight2, max_num_nodes = int(data.x.shape[0]))[0]
-        if debug:
+        if debug_on:
             print("layer ", i+1, "(after sparsification) edge_index size:", edge_index2.shape)
         medge_index.append(edge_index2)
         medge_weight.append(edge_weight2)
@@ -167,7 +169,7 @@ def markov_process_disj_sparse(data, eps, inflate, nlayers, row_normalization =
         # store layer-wise edges
         medge_index.append(ei)
         medge_weight.append(ew)
-        if debug:
+        if debug_on:
             print("layer ", i+1, "(after sparsification) edge_index size:", ei.shape)
         if ei[0].shape == prev_edge_index[0].shape:
             print("early stopping markov process due to converged number of edges.")
@@ -212,7 +214,7 @@ def markov_process_disj(data, eps, inflate, nlayers, row_normalization = True, k
         A = torch.mm(A, A)
         A = torch.pow(A, inflate)
         (ei, ew) = dense_to_sparse(A)
-        if debug:
+        if debug_on:
             print("layer ", i+1, " (after mul and pow) edge_index size:", ei.shape)
         # normalization
         if row_normalization:
@@ -246,7 +248,7 @@ def markov_process_disj(data, eps, inflate, nlayers, row_normalization = True, k
         else:
             edge_index2, edge_weight2 = markov_normalization(ei, ew, A.shape[0], 'col')
         A = to_dense_adj(edge_index = edge_index2, batch = None, edge_attr = edge_weight2, max_num_nodes = int(data.x.shape[0]))[0]
-        if debug:
+        if debug_on:
             print("layer ", i+1, "(after sparsification) edge_index size:", edge_index2.shape)
         medge_index.append(edge_index2)
         medge_weight.append(edge_weight2)

diff --git a/models.py b/models.py
@@ -9,7 +9,7 @@
 import community as comm
 
 class MarkovGCNR(torch.nn.Module):
-    def __init__(self, ndim, nlayers, ntargets, features, edges, weights = None, droprate = 0.5, useleakyrelu = False, alpha = 0.5):
+    def __init__(self, ndim, nlayers, ntargets, features, edges, weights = None, droprate = 0.5, useleakyrelu = False, alpha = 0.5, addbias = True):
         super(MarkovGCNR, self).__init__()
         self.convs = []
         self.ndim = ndim
@@ -21,10 +21,10 @@ def __init__(self, ndim, nlayers, ntargets, features, edges, weights = None, dro
         self.droprate = droprate
         self.useleakyrelu = useleakyrelu
         self.alpha = alpha
-        self.convs.append(GCNConv(self.features.shape[1], self.ndim, cached=True))
+        self.convs.append(GCNConv(self.features.shape[1], self.ndim, cached=True, bias = addbias))
         for l in range(nlayers-2):
-            self.convs.append(GCNConv(self.ndim, self.ndim, cached=True))
-        self.convs.append(GCNConv(self.ndim, self.ntargets, cached=True))
+            self.convs.append(GCNConv(self.ndim, self.ndim, cached=True, bias = addbias))
+        self.convs.append(GCNConv(self.ndim, self.ntargets, cached=True, bias = addbias))
 
     def forward(self):
         assert len(self.edges) == self.nlayers
@@ -53,7 +53,7 @@ def inference(self):
         return xs
 
 class GCN(torch.nn.Module):
-    def __init__(self, ndim, nlayers, ntargets, features, edges, weights = None, droprate = 0.5, alpha = 0.5):
+    def __init__(self, ndim, nlayers, ntargets, features, edges, weights = None, droprate = 0.5, alpha = 0.5, addbias = True):
         super(GCN, self).__init__()
         self.convs = []
         self.ndim = ndim
@@ -63,10 +63,10 @@ def __init__(self, ndim, nlayers, ntargets, features, edges, weights = None, dro
         self.ntargets = ntargets
         self.features = features
         self.droprate = droprate
-        self.convs.append(GCNConv(self.features.shape[1], self.ndim, cached=True))
+        self.convs.append(GCNConv(self.features.shape[1], self.ndim, cached=True, bias = addbias))
         for l in range(self.nlayers-2):
-            self.convs.append(GCNConv(self.ndim, self.ndim, cached=True))
-        self.convs.append(GCNConv(self.ndim, self.ntargets, cached=True))
+            self.convs.append(GCNConv(self.ndim, self.ndim, cached=True, bias = addbias))
+        self.convs.append(GCNConv(self.ndim, self.ntargets, cached=True, bias = addbias))
 
     def forward(self):
         x = F.dropout(self.features, p=self.droprate, training=self.training)

diff --git a/utils.py b/utils.py
@@ -8,6 +8,7 @@
 import torch
 from networkx.algorithms import community
 import networkx as nx
+from torch_geometric.datasets import Amazon
 
 def readInput3f(inputf, labelf, featuref, oneIndexed = False, onelabeled = False, debug = True):
     # inputf: input file name with path
@@ -177,7 +178,10 @@ def readInput2f(inputf, labelf, oneIndexed = False, onelabeled = False, debug =
 
 def loadPyGDataset(dataset_name = 'Cora'):
     path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset_name)
-    dataset = Planetoid(path, dataset_name, num_train_per_class=30, transform=T.NormalizeFeatures())
+    if dataset_name.lower() in ('cora', 'citeseer', 'pubmed'):
+        dataset = Planetoid(path, dataset_name, num_train_per_class=30, transform=T.NormalizeFeatures())
+    else:
+        dataset = Amazon(path, dataset_name, transform=T.NormalizeFeatures())
     data = dataset[0]
     return data
 
@@ -193,18 +197,22 @@ def computeHomophily(data, ei = None):
     return nominator / len(edges)
 
 # compute community mixing
-def mixingCommunityScore(data):
-    G = nx.Graph(data.edge_index.t().tolist())
+def mixingCommunityScore(data, ei = None):
+    if ei is None:
+        edges = data.edge_index
+    else:
+        edges = ei
+    G = nx.Graph(edges.t().tolist())
     comm = community.greedy_modularity_communities(G)
     print("#communities detected:", len(comm))
     gd = dict()
     for com in range(len(comm)):
         for node in list(comm[com]):
             gd[node] = com
     count = 0
-    for edge in data.edge_index.t():
+    for edge in edges.t():
         count += gd[edge[0].item()] != gd[edge[1].item()]
-    return count / len(data.edge_index.t())
+    return count / len(edges.t())
 
 # compute new edges percentage
 def newEdges(data, edges):