Add a local Reddit dataset loader (reddit.py) and a --download option to work around the broken Reddit download link in PyG

alokpathy · alokpathy · commit c2c2e42db32e · 2020-10-15T03:24:13.000-04:00
diff --git a/gcn_distr.py b/gcn_distr.py
@@ -8,7 +8,8 @@
 import torch.distributed as dist
 
 from torch_geometric.data import Data, Dataset
-from torch_geometric.datasets import Planetoid, PPI, Reddit
+from torch_geometric.datasets import Planetoid, PPI
+from reddit import Reddit
 from torch_geometric.nn import GCNConv, ChebConv  # noqa
 from torch_geometric.utils import add_remaining_self_loops, to_dense_adj, dense_to_sparse, to_scipy_sparse_matrix
 import torch_geometric.transforms as T
@@ -60,6 +61,7 @@
 acc_per_rank = 0
 run_count = 0
 run = 0
+download = False
 
 def start_time(group, rank, subset=False, src=None):
     global barrier_time
@@ -675,21 +677,22 @@ def main():
     print(socket.gethostname())
     seed = 0
 
-    mp.set_start_method('spawn', force=True)
-    outputs = None
-    os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"]
-    dist.init_process_group(backend='nccl')
-    rank = dist.get_rank()
-    size = dist.get_world_size()
-    print("Processes: " + str(size))
-
-    # device = torch.device('cpu')
-    devid = rank_to_devid(rank, acc_per_rank)
-    device = torch.device('cuda:{}'.format(devid))
-    torch.cuda.set_device(device)
-    curr_devid = torch.cuda.current_device()
-    # print(f"curr_devid: {curr_devid}", flush=True)
-    devcount = torch.cuda.device_count()
+    if not download:
+        mp.set_start_method('spawn', force=True)
+        outputs = None
+        os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"]
+        dist.init_process_group(backend='nccl')
+        rank = dist.get_rank()
+        size = dist.get_world_size()
+        print("Processes: " + str(size))
+
+        # device = torch.device('cpu')
+        devid = rank_to_devid(rank, acc_per_rank)
+        device = torch.device('cuda:{}'.format(devid))
+        torch.cuda.set_device(device)
+        curr_devid = torch.cuda.current_device()
+        # print(f"curr_devid: {curr_devid}", flush=True)
+        devcount = torch.cuda.device_count()
 
     if graphname == "Cora":
         path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', graphname)
@@ -755,6 +758,9 @@ def main():
         inputs.requires_grad = True
         data.y = data.y.to(device)
 
+    if download:
+        exit()
+
     if normalization:
         adj_matrix, _ = add_remaining_self_loops(edge_index, num_nodes=inputs.size(0))
     else:
@@ -778,6 +784,7 @@ def main():
     parser.add_argument("--normalization", type=str)
     parser.add_argument("--activations", type=str)
     parser.add_argument("--accuracy", type=str)
+    parser.add_argument("--download", type=bool)
 
     args = parser.parse_args()
     print(args)
@@ -792,10 +799,12 @@ def main():
     normalization = args.normalization == "True"
     activations = args.activations == "True"
     accuracy = args.accuracy == "True"
+    download = args.download
 
-    if (epochs is None) or (graphname is None) or (timing is None) or (mid_layer is None) or (run_count is None):
-        print(f"Error: missing argument {epochs} {graphname} {timing} {mid_layer} {run_count}")
-        exit()
+    if not download:
+        if (epochs is None) or (graphname is None) or (timing is None) or (mid_layer is None) or (run_count is None):
+            print(f"Error: missing argument {epochs} {graphname} {timing} {mid_layer} {run_count}")
+            exit()
 
     print(f"Arguments: epochs: {epochs} graph: {graphname} timing: {timing} mid: {mid_layer} norm: {normalization} act: {activations} acc: {accuracy} runs: {run_count}")
     
diff --git a/gcn_distr_15d.py b/gcn_distr_15d.py
@@ -8,7 +8,8 @@
 import torch.distributed as dist
 
 from torch_geometric.data import Data, Dataset
-from torch_geometric.datasets import Planetoid, PPI, Reddit
+from torch_geometric.datasets import Planetoid, PPI
+from reddit import Reddit
 from torch_geometric.nn import GCNConv, ChebConv  # noqa
 from torch_geometric.utils import add_remaining_self_loops, to_dense_adj, dense_to_sparse, to_scipy_sparse_matrix
 import torch_geometric.transforms as T
@@ -61,6 +62,7 @@
 run_count = 0
 run = 0
 replication = 0
+download = False
 
 def start_time(group, rank, subset=False, src=None):
     global barrier_time
@@ -715,22 +717,23 @@ def main():
     print(socket.gethostname())
     seed = 0
 
-    mp.set_start_method('spawn', force=True)
-    outputs = None
-    os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"]
+    if not download:
+        mp.set_start_method('spawn', force=True)
+        outputs = None
+        os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"]
 
-    dist.init_process_group(backend='nccl')
-    rank = dist.get_rank()
-    size = dist.get_world_size()
-    print("Processes: " + str(size))
+        dist.init_process_group(backend='nccl')
+        rank = dist.get_rank()
+        size = dist.get_world_size()
+        print("Processes: " + str(size))
 
-    # device = torch.device('cpu')
-    devid = rank_to_devid(rank, acc_per_rank)
-    device = torch.device('cuda:{}'.format(devid))
-    torch.cuda.set_device(device)
-    curr_devid = torch.cuda.current_device()
-    # print(f"curr_devid: {curr_devid}", flush=True)
-    devcount = torch.cuda.device_count()
+        # device = torch.device('cpu')
+        devid = rank_to_devid(rank, acc_per_rank)
+        device = torch.device('cuda:{}'.format(devid))
+        torch.cuda.set_device(device)
+        curr_devid = torch.cuda.current_device()
+        # print(f"curr_devid: {curr_devid}", flush=True)
+        devcount = torch.cuda.device_count()
 
     if graphname == "Cora":
         path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', graphname)
@@ -798,6 +801,9 @@ def main():
         inputs.requires_grad = True
         data.y = data.y.to(device)
 
+    if download:
+        exit()
+
     if normalization:
         adj_matrix, _ = add_remaining_self_loops(edge_index, num_nodes=inputs.size(0))
     else:
@@ -822,6 +828,7 @@ def main():
     parser.add_argument("--normalization", type=str)
     parser.add_argument("--activations", type=str)
     parser.add_argument("--accuracy", type=str)
+    parser.add_argument("--download", type=bool)
 
     args = parser.parse_args()
     print(args)
@@ -837,10 +844,12 @@ def main():
     activations = args.activations == "True"
     accuracy = args.accuracy == "True"
     replication = args.replication
+    download = args.download
 
-    if (epochs is None) or (graphname is None) or (timing is None) or (mid_layer is None) or (run_count is None):
-        print(f"Error: missing argument {epochs} {graphname} {timing} {mid_layer} {run_count}")
-        exit()
+    if not download:
+        if (epochs is None) or (graphname is None) or (timing is None) or (mid_layer is None) or (run_count is None):
+            print(f"Error: missing argument {epochs} {graphname} {timing} {mid_layer} {run_count}")
+            exit()
 
     print(f"Arguments: epochs: {epochs} graph: {graphname} timing: {timing} mid: {mid_layer} norm: {normalization} act: {activations} acc: {accuracy} runs: {run_count} rep: {replication}")
     
diff --git a/gcn_distr_2d.py b/gcn_distr_2d.py
@@ -9,7 +9,8 @@
 import torch.distributed as dist
 
 from torch_geometric.data import Data, Dataset
-from torch_geometric.datasets import Planetoid, PPI, Reddit
+from torch_geometric.datasets import Planetoid, PPI
+from reddit import Reddit
 from torch_geometric.nn import GCNConv, ChebConv  # noqa
 from torch_geometric.utils import (
         add_remaining_self_loops, 
@@ -90,6 +91,7 @@
 no_occur_val = 42.1234
 run_count = 0
 run = 0
+download = False
 
 def sync_and_sleep(rank, device):
     torch.cuda.synchronize(device=device)
@@ -1450,6 +1452,9 @@ def main(P, correctness_check, acc_per_rank):
         data.y = torch.rand(n).uniform_(0, num_classes - 1)
         data.train_mask = torch.ones(n).long()
 
+    if download:
+        exit()
+
     os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"]
     dist.init_process_group(backend='nccl')
     # dist.init_process_group('gloo', init_method='env://')
@@ -1523,6 +1528,7 @@ def main(P, correctness_check, acc_per_rank):
     parser.add_argument("--normalization", type=str)
     parser.add_argument("--activations", type=str)
     parser.add_argument("--accuracy", type=str)
+    parser.add_argument("--download", type=bool)
     args = parser.parse_args()
     print(args)
     P = args.processes
@@ -1547,10 +1553,12 @@ def main(P, correctness_check, acc_per_rank):
     normalization = args.normalization == "True"
     activations = args.activations == "True"
     accuracy = args.accuracy == "True"
+    download = args.download
 
-    if (epochs is None) or (graphname is None) or (timing is None) or (mid_layer is None) or (run_count is None):
-        print(f"Error: missing argument {epochs} {graphname} {timing} {mid_layer}")
-        exit()
+    if not download:
+        if (epochs is None) or (graphname is None) or (timing is None) or (mid_layer is None) or (run_count is None):
+            print(f"Error: missing argument {epochs} {graphname} {timing} {mid_layer}")
+            exit()
 
     print(f"Arguments: epochs: {epochs} graph: {graphname} timing: {timing} mid: {mid_layer} norm: {normalization} act: {activations} acc: {accuracy}")
     
diff --git a/gcn_distr_3d.py b/gcn_distr_3d.py
@@ -9,7 +9,8 @@
 import torch.distributed as dist
 
 from torch_geometric.data import Data, Dataset
-from torch_geometric.datasets import Planetoid, PPI, Reddit
+from torch_geometric.datasets import Planetoid, PPI
+from reddit import Reddit
 from torch_geometric.nn import GCNConv, ChebConv  # noqa
 from torch_geometric.utils import (
         add_remaining_self_loops, 
@@ -1772,32 +1773,18 @@ def main(P, correctness_check, acc_per_rank):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--processes', metavar='P', type=int,
-                        help='Number of processes')
-    parser.add_argument('--correctness', metavar='C', type=str,
-                        help='Run correctness check')
     parser.add_argument("--accperrank", type=int)
     parser.add_argument("--epochs", type=int)
     parser.add_argument("--graphname", type=str)
     parser.add_argument("--timing", type=str)
     parser.add_argument("--midlayer", type=int)
-    parser.add_argument("--local_rank", type=int)
     args = parser.parse_args()
     print(args)
-    P = args.processes
-    correctness_check = args.correctness
-    if P is None:
-        P = 1
 
     acc_per_rank = args.accperrank
     if acc_per_rank is None:
         acc_per_rank = 1
 
-    if correctness_check is None or correctness_check == "nocheck":
-        correctness_check = False
-    else:
-        correctness_check = True
-
     epochs = args.epochs
     graphname = args.graphname
     timing = args.timing == "True"
diff --git a/reddit.py b/reddit.py
@@ -0,0 +1,70 @@
+import os
+import os.path as osp
+
+import torch
+import numpy as np
+import scipy.sparse as sp
+from torch_sparse import coalesce
+from torch_geometric.data import (InMemoryDataset, Data, download_url,
+                                  extract_zip)
+
+
+class Reddit(InMemoryDataset):
+    r"""The Reddit dataset from the `"Inductive Representation Learning on
+    Large Graphs" <https://arxiv.org/abs/1706.02216>`_ paper, containing
+    Reddit posts belonging to different communities.
+
+    Args:
+        root (string): Root directory where the dataset should be saved.
+        transform (callable, optional): A function/transform that takes in an
+            :obj:`torch_geometric.data.Data` object and returns a transformed
+            version. The data object will be transformed before every access.
+            (default: :obj:`None`)
+        pre_transform (callable, optional): A function/transform that takes in
+            an :obj:`torch_geometric.data.Data` object and returns a
+            transformed version. The data object will be transformed before
+            being saved to disk. (default: :obj:`None`)
+    """
+
+    url = 'https://data.dgl.ai/dataset/reddit.zip'
+
+    def __init__(self, root, transform=None, pre_transform=None):
+        super(Reddit, self).__init__(root, transform, pre_transform)
+        self.data, self.slices = torch.load(self.processed_paths[0])
+
+    @property
+    def raw_file_names(self):
+        return ['reddit_data.npz', 'reddit_graph.npz']
+
+    @property
+    def processed_file_names(self):
+        return 'data.pt'
+
+    def download(self):
+        """Download ``url`` into ``raw_dir``, unzip it, delete the zip."""
+        path = download_url(self.url, self.raw_dir)
+        extract_zip(path, self.raw_dir)
+        os.unlink(path)
+
+    def process(self):
+        """Build one ``Data`` graph from the raw ``.npz`` files and save it."""
+        # ``with`` closes the NpzFile handle that a bare np.load leaves open.
+        with np.load(osp.join(self.raw_dir, 'reddit_data.npz')) as raw:
+            x = torch.from_numpy(raw['feature']).to(torch.float)
+            y = torch.from_numpy(raw['label']).to(torch.long)
+            split = torch.from_numpy(raw['node_types'])
+        adj = sp.load_npz(osp.join(self.raw_dir, 'reddit_graph.npz'))
+        row = torch.from_numpy(adj.row).to(torch.long)
+        col = torch.from_numpy(adj.col).to(torch.long)
+        edge_index = torch.stack([row, col], dim=0)
+        edge_index, _ = coalesce(edge_index, None, x.size(0), x.size(0))
+
+        data = Data(x=x, edge_index=edge_index, y=y)
+        data.train_mask = split == 1  # node_types: 1=train, 2=val, 3=test
+        data.val_mask = split == 2
+        data.test_mask = split == 3
+        data = data if self.pre_transform is None else self.pre_transform(data)
+        torch.save(self.collate([data]), self.processed_paths[0])
+
+    def __repr__(self):
+        return '{}()'.format(self.__class__.__name__)