[Model] SIGN for OGB dataset (dmlc#2316)

* sign for ogbn products, arxiv, mag * texts * fix * update ogb folder readme * use dgl nightly build Co-authored-by: Mufei Li <[email protected]>
zeta1999 · Nov 3, 2020 · b8cc26e · b8cc26e
1 parent c6890c2
commit b8cc26e
Show file tree

Hide file tree

Showing 5 changed files with 397 additions and 2 deletions.
diff --git a/examples/pytorch/ogb/README.md b/examples/pytorch/ogb/README.md
@@ -4,7 +4,12 @@ This directory lists the submissions made from DGL Team to the OGB Leaderboard.
 
 Currently it contains:
 
-* OGB-Products
+* OGBN-Products
   * GraphSAGE with Neighbor Sampling
-* OGB-Proteins
+  * SIGN
+* OGBN-Proteins
   * MWE-GCN and MWE-DGCN ([GCN models for graphs with multi-dimensionally weighted edges](https://cims.nyu.edu/~chenzh/files/GCN_with_edge_weights.pdf))
+* OGBN-Arxiv
+  * SIGN
+* OGBN-Mag
+  * SIGN
diff --git a/examples/pytorch/ogb/sign/.gitignore b/examples/pytorch/ogb/sign/.gitignore
@@ -0,0 +1 @@
+dataset
diff --git a/examples/pytorch/ogb/sign/README.md b/examples/pytorch/ogb/sign/README.md
@@ -0,0 +1,50 @@
+SIGN: Scalable Inception Graph Neural Network
+==========================
+Paper: [https://arxiv.org/abs/2004.11198](https://arxiv.org/abs/2004.11198)
+
+
+Dependencies
+------------
+- pytorch 1.5
+- dgl 0.5 nightly build
+    - `pip install --pre dgl`
+- ogb 1.2.3
+
+
+How to run
+-------------
+### ogbn-products
+```bash
+python3 sign.py --dataset ogbn-products --eval-ev 10 --R 5 --input-d 0.3 --num-h 512 \
+    --dr 0.4 --lr 0.001 --batch-size 50000 --num-runs 10
+```
+
+### ogbn-arxiv
+```bash
+python3 sign.py --dataset ogbn-arxiv --eval-ev 10 --R 5 --input-d 0.1 --num-h 512 \
+    --dr 0.5 --lr 0.001 --eval-b 100000 --num-runs 10
+```
+
+### ogbn-mag
+ogbn-mag is a heterogeneous graph and the task is to predict the publishing
+venue of papers. Since the SIGN model is designed for homogeneous graphs, we
+simply ignore heterogeneous information (i.e. node and edge types) and treat
+the graph as a homogeneous one. For node types that don't have input features,
+we featurize them with the average of their neighbors' features.
+
+```bash
+python3 sign.py --dataset ogbn-mag --eval-ev 10 --R 5 --input-d 0 --num-h 512 \
+    --dr 0.5 --lr 0.001 --batch-size 50000 --num-runs 10
+```
+
+
+Results
+----------
+The table below shows the mean and standard deviation (over 10 runs) of
+accuracy. Experiments were performed on a Tesla T4 (15GB) GPU on Oct 29.
+
+| Dataset         | Test Accuracy   | Validation Accuracy   | # Params    |
+| :-------------: | :-------------: | :-------------------: | :---------: |
+| ogbn-products   | 0.8052±0.0016   | 0.9299±0.0004         | 3,483,703   |
+| ogbn-arxiv      | 0.7195±0.0011   | 0.7323±0.0006         | 3,566,128   |
+| ogbn-mag        | 0.4046±0.0012   | 0.4068±0.0010         | 3,724,645   |
diff --git a/examples/pytorch/ogb/sign/dataset.py b/examples/pytorch/ogb/sign/dataset.py
@@ -0,0 +1,90 @@
+import torch
+import numpy as np
+import dgl
+import dgl.function as fn
+from ogb.nodeproppred import DglNodePropPredDataset, Evaluator
+
+
+def get_ogb_evaluator(dataset):
+    """
+    Get evaluator from Open Graph Benchmark based on dataset
+    """
+    evaluator = Evaluator(name=dataset)
+    return lambda preds, labels: evaluator.eval({
+        "y_true": labels.view(-1, 1),
+        "y_pred": preds.view(-1, 1),
+    })["acc"]
+
+
+def convert_mag_to_homograph(g, device):
+    """
+    Featurize node types that don't have input features (i.e. author,
+    institution, field_of_study) by averaging their neighbor features.
+    Then convert the graph to a undirected homogeneous graph.
+    """
+    src_writes, dst_writes = g.all_edges(etype="writes")
+    src_topic, dst_topic = g.all_edges(etype="has_topic")
+    src_aff, dst_aff = g.all_edges(etype="affiliated_with")
+    new_g = dgl.heterograph({
+        ("paper", "written", "author"): (dst_writes, src_writes),
+        ("paper", "has_topic", "field"): (src_topic, dst_topic),
+        ("author", "aff", "inst"): (src_aff, dst_aff)
+    })
+    new_g = new_g.to(device)
+    new_g.nodes["paper"].data["feat"] = g.nodes["paper"].data["feat"]
+    new_g["written"].update_all(fn.copy_u("feat", "m"), fn.mean("m", "feat"))
+    new_g["has_topic"].update_all(fn.copy_u("feat", "m"), fn.mean("m", "feat"))
+    new_g["aff"].update_all(fn.copy_u("feat", "m"), fn.mean("m", "feat"))
+    g.nodes["author"].data["feat"] = new_g.nodes["author"].data["feat"]
+    g.nodes["institution"].data["feat"] = new_g.nodes["inst"].data["feat"]
+    g.nodes["field_of_study"].data["feat"] = new_g.nodes["field"].data["feat"]
+
+    # Convert to homogeneous graph
+    # Get DGL type id for paper type
+    target_type_id = g.get_ntype_id("paper")
+    g = dgl.to_homogeneous(g, ndata=["feat"])
+    g = dgl.add_reverse_edges(g, copy_ndata=True)
+    # Mask for paper nodes
+    g.ndata["target_mask"] = g.ndata[dgl.NTYPE] == target_type_id
+    return g
+
+
+def load_dataset(name, device):
+    """
+    Load dataset and move graph and features to device
+    """
+    if name not in ["ogbn-products", "ogbn-arxiv", "ogbn-mag"]:
+        raise RuntimeError("Dataset {} is not supported".format(name))
+    dataset = DglNodePropPredDataset(name=name)
+    splitted_idx = dataset.get_idx_split()
+    train_nid = splitted_idx["train"]
+    val_nid = splitted_idx["valid"]
+    test_nid = splitted_idx["test"]
+    g, labels = dataset[0]
+    g = g.to(device)
+    if name == "ogbn-arxiv":
+        g = dgl.add_reverse_edges(g, copy_ndata=True)
+        g = dgl.add_self_loop(g)
+        g.ndata['feat'] = g.ndata['feat'].float()
+    elif name == "ogbn-mag":
+        # MAG is a heterogeneous graph. The task is to make prediction for
+        # paper nodes
+        labels = labels["paper"]
+        train_nid = train_nid["paper"]
+        val_nid = val_nid["paper"]
+        test_nid = test_nid["paper"]
+        g = convert_mag_to_homograph(g, device)
+    else:
+        g.ndata['feat'] = g.ndata['feat'].float()
+    n_classes = dataset.num_classes
+    labels = labels.squeeze()
+    evaluator = get_ogb_evaluator(name)
+
+    print(f"# Nodes: {g.number_of_nodes()}\n"
+          f"# Edges: {g.number_of_edges()}\n"
+          f"# Train: {len(train_nid)}\n"
+          f"# Val: {len(val_nid)}\n"
+          f"# Test: {len(test_nid)}\n"
+          f"# Classes: {n_classes}")
+
+    return g, labels, n_classes, train_nid, val_nid, test_nid, evaluator