[DGL-Go] Inference for Node Prediction Pipeline (full & ns) (dmlc#4095)

* Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update
anirband · Jun 21, 2022 · 31e4a89 · 31e4a89
1 parent 6922658
commit 31e4a89
Show file tree

Hide file tree

Showing 43 changed files with 719 additions and 186 deletions.
diff --git a/dglgo/README.md b/dglgo/README.md
@@ -10,7 +10,23 @@ experiments.
 ## Installation and get started
 
 DGL-Go requires DGL v0.8+ so please make sure DGL is updated properly.
-Install DGL-Go by `pip install dglgo` and type `dgl` in your console:
+
+### Install the latest stable version
+
+```
+pip install dglgo
+```
+
+### Install from source for experimental features
+
+```
+python setup.py install
+```
+
+### Get started
+
+Type `dgl` in your console:
+
 ```
 Usage: dgl [OPTIONS] COMMAND [ARGS]...
 
@@ -63,14 +79,16 @@ generate a configurate file `cora_sage.yaml` which includes:
 * Training hyperparameters (e.g., learning rate, loss function, etc.).
 
 Different choices of task, model and datasets may give very different options,
-so DGL-Go also adds a comment for what each option does in the file.
+so DGL-Go also adds a comment per option for explanation.
 At this point you can also change options to explore optimization potentials.
 
-Below shows the configuration file generated by the command above.
+The snippet below shows the configuration file generated by the command above.
 
 ```yaml
 version: 0.0.1
-pipeline_name: nodepred
+pipeline:
+  name: nodepred
+  mode: train
 device: cpu
 data:
   name: cora
@@ -94,7 +112,7 @@ general_pipeline:
     lr: 0.01
     weight_decay: 0.0005
   loss: CrossEntropyLoss
-  save_path: model.pth        # Path to save the model
+  save_path: results          # Directory to save the experiment results
   num_runs: 1                 # Number of experiments to run
 ```
 
@@ -216,14 +234,18 @@ class GraphSAGE(nn.Module):
 
         for i in range(num_layers):
             in_hidden = hidden_size if i > 0 else in_size
-            out_hidden = hidden_size if i < num_layers - 1 else data_info["out_size"]
-            self.layers.append(dgl.nn.SAGEConv( in_hidden, out_hidden, aggregator_type))
+            out_hidden = hidden_size if i < num_layers - \
+                1 else data_info["out_size"]
+            self.layers.append(
+                dgl.nn.SAGEConv(
+                    in_hidden,
+                    out_hidden,
+                    aggregator_type))
 
     def forward(self, graph, node_feat, edge_feat=None):
         if self.embed_size > 0:
             dgl_warning(
-                "The embedding for node feature is used, and input node_feat is ignored, due to the provided embed_size.",
-                norepeat=True)
+                "The embedding for node feature is used, and input node_feat is ignored, due to the provided embed_size.")
             h = self.embed.weight
         else:
             h = node_feat
@@ -272,6 +294,7 @@ def train(cfg, pipeline_cfg, device, data, model, optimizer, loss_fcn):
               format(epoch, loss.item(), train_acc, val_acc))
 
     stopper.load_checkpoint(model)
+    stopper.close()
 
     model.eval()
     with torch.no_grad():
@@ -280,40 +303,10 @@ def train(cfg, pipeline_cfg, device, data, model, optimizer, loss_fcn):
     return test_acc
 
 
-def main():
-    cfg = {
-        'version': '0.0.1',
-        'device': 'cuda:0',
-        'model': {
-            'embed_size': -1,
-            'hidden_size': 16,
-            'num_layers': 2,
-            'activation': 'relu',
-            'dropout': 0.5,
-            'aggregator_type': 'gcn'},
-        'general_pipeline': {
-            'early_stop': {
-                'patience': 100,
-                'checkpoint_path': 'checkpoint.pth'},
-            'num_epochs': 200,
-            'eval_period': 5,
-            'optimizer': {
-                'lr': 0.01,
-                'weight_decay': 0.0005},
-            'loss': 'CrossEntropyLoss',
-            'save_path': 'model.pth',
-            'num_runs': 10}}
+def main(run, cfg, data):
     device = cfg['device']
     pipeline_cfg = cfg['general_pipeline']
-    # load data
-    data = AsNodePredDataset(CoraGraphDataset())
     # create model
-    model_cfg = cfg["model"]
-    cfg["model"]["data_info"] = {
-        "in_size": model_cfg['embed_size'] if model_cfg['embed_size'] > 0 else data[0].ndata['feat'].shape[1],
-        "out_size": data.num_classes,
-        "num_nodes": data[0].num_nodes()
-    }
     model = GraphSAGE(**cfg["model"])
     model = model.to(device)
     loss = torch.nn.CrossEntropyLoss()
@@ -322,10 +315,36 @@ def main():
         **pipeline_cfg["optimizer"])
     # train
     test_acc = train(cfg, pipeline_cfg, device, data, model, optimizer, loss)
-    torch.save(model.state_dict(), pipeline_cfg["save_path"])
+    torch.save({'cfg': cfg, 'model': model.state_dict()},
+               os.path.join(pipeline_cfg["save_path"], 'run_{}.pth'.format(run)))
+
     return test_acc
 
-...
+if __name__ == '__main__':
+    ...
+
+    # load data
+    data = AsNodePredDataset(CoraGraphDataset())
+
+    # Record the dataset-dependent sizes in the model config; main() builds
+    # the model directly from cfg["model"].
+    model_cfg = cfg["model"]
+    cfg["model"]["data_info"] = {
+        "in_size": model_cfg['embed_size'] if model_cfg['embed_size'] > 0 else data[0].ndata['feat'].shape[1],
+        "out_size": data.num_classes,
+        "num_nodes": data[0].num_nodes()
+    }
+
+    # exist_ok=True: re-running the script must not crash because the results
+    # directory is already there (existing run_{i}.pth files are overwritten).
+    os.makedirs(cfg['general_pipeline']["save_path"], exist_ok=True)
+
+    all_acc = []
+    num_runs = 1
+    for run in range(num_runs):
+        print(f'Run experiment #{run}')
+        test_acc = main(run, cfg, data)
+        print("Test Accuracy {:.4f}".format(test_acc))
+        all_acc.append(test_acc)
+    # Summarize the test accuracy over all runs
+    avg_acc = np.round(np.mean(all_acc), 6)
+    std_acc = np.round(np.std(all_acc), 6)
+    print(f'Accuracy across {num_runs} runs: {avg_acc} ± {std_acc}')
 ```
 
 You can see that everything is collected into one Python script which includes the
@@ -396,6 +415,15 @@ all the available pipelines.
 A: Currently not supported. We will enable this feature soon. Please stay tuned!
 
 **Q: After training a model on some dataset, how can I apply it to another one?**
-A: The `save_path` option in the generated configuration file allows you to specify where
-to save the model after training. You can then modify the script generated by `dgl export`
-to load the the model checkpoint and evaluate it on another dataset.
+A: The `save_path` option in the generated configuration file allows you to specify the directory to save the experiment results. After training, `{save_path}/run_{i}.pth` will be the checkpoint for the i-th run, consisting of the training configuration and trained model state dict. You can then use `dgl apply` as follows.
+
+```
+dgl configure-apply X --data Y --cpt {save_path}/run_{i}.pth --cfg Z
+dgl apply --cfg Z
+```
+
+- `X` is the pipeline name as in `dgl configure`.
+- `Y` is the dataset to apply the model to; it can be omitted if you are applying the trained model to the training dataset.
+- `Z` is the configuration file and a default value will be used if not specified.
+
+You can also use `dgl export --cfg Z` to generate a Python script for further modification.
diff --git a/dglgo/dglgo/apply_pipeline/__init__.py b/dglgo/dglgo/apply_pipeline/__init__.py
@@ -0,0 +1,2 @@
+from .nodepred import ApplyNodepredPipeline
+from .nodepred_sample import ApplyNodepredNsPipeline
diff --git a/dglgo/dglgo/apply_pipeline/nodepred/__init__.py b/dglgo/dglgo/apply_pipeline/nodepred/__init__.py
@@ -0,0 +1 @@
+from .gen import *
diff --git a/dglgo/dglgo/apply_pipeline/nodepred/gen.py b/dglgo/dglgo/apply_pipeline/nodepred/gen.py
@@ -0,0 +1,113 @@
+import ruamel.yaml
+import torch
+import typer
+
+from copy import deepcopy
+from jinja2 import Template
+from pathlib import Path
+from pydantic import Field
+from typing import Optional
+
+from ...utils.factory import ApplyPipelineFactory, PipelineBase, DataFactory, NodeModelFactory
+from ...utils.yaml_dump import deep_convert_dict, merge_comment
+
+@ApplyPipelineFactory.register("nodepred")
+class ApplyNodepredPipeline(PipelineBase):
+    """Inference ("apply") pipeline for node prediction.
+
+    Generates an inference configuration file from a training checkpoint and
+    renders a standalone inference script from ``nodepred.jinja-py``.
+    """
+
+    def __init__(self):
+        self.pipeline = {
+            "name": "nodepred",
+            "mode": "apply"
+        }
+
+    @classmethod
+    def setup_user_cfg_cls(cls):
+        """Build the ``UserConfig`` subclass for this pipeline and store it on ``cls``."""
+        from ...utils.enter_config import UserConfig
+        class ApplyNodePredUserConfig(UserConfig):
+            data: DataFactory.filter("nodepred").get_pydantic_config() = Field(..., discriminator="name")
+
+        cls.user_cfg_cls = ApplyNodePredUserConfig
+
+    @property
+    def user_cfg_cls(self):
+        return self.__class__.user_cfg_cls
+
+    def get_cfg_func(self):
+        """Return the typer-style callback that writes the inference configuration file."""
+        def config(
+            data: DataFactory.filter("nodepred").get_dataset_enum() = typer.Option(None, help="input data name"),
+            cfg: Optional[str] = typer.Option(None, help="output configuration file path"),
+            cpt: str = typer.Option(..., help="input checkpoint file path")
+        ):
+            # Training configuration stored in the checkpoint.
+            # map_location="cpu": a checkpoint written by a GPU run must still
+            # be readable on a CPU-only machine; only the config is needed here.
+            # NOTE: torch.load unpickles the file, so only use trusted checkpoints.
+            train_cfg = torch.load(cpt, map_location="cpu")["cfg"]
+            if data is None:
+                print("data is not specified, use the training dataset")
+                data = train_cfg["data_name"]
+            else:
+                data = data.name
+            if cfg is None:
+                cfg = "_".join(["apply", "nodepred", data, train_cfg["model_name"]]) + ".yaml"
+
+            self.__class__.setup_user_cfg_cls()
+            generated_cfg = {
+                "pipeline_name": self.pipeline["name"],
+                "pipeline_mode": self.pipeline["mode"],
+                "device": train_cfg["device"],
+                "data": {"name": data},
+                "cpt_path": cpt,
+                "general_pipeline": {"save_path": "apply_results"}
+            }
+            output_cfg = self.user_cfg_cls(**generated_cfg).dict()
+            output_cfg = deep_convert_dict(output_cfg)
+            # Not applicable for inference; tolerate data configs without it
+            output_cfg['data'].pop('split_ratio', None)
+            comment_dict = {
+                "device": "Torch device name, e.g., cpu or cuda or cuda:0",
+                "cpt_path": "Path to the checkpoint file",
+                "general_pipeline": {"save_path": "Directory to save the inference results"}
+            }
+            comment_dict = merge_comment(output_cfg, comment_dict)
+
+            yaml = ruamel.yaml.YAML()
+            # Close the file deterministically so the YAML is flushed to disk
+            with Path(cfg).open("w") as f:
+                yaml.dump(comment_dict, f)
+            print("Configuration file is generated at {}".format(Path(cfg).absolute()))
+
+        return config
+
+    @classmethod
+    def gen_script(cls, user_cfg_dict):
+        """Render the inference script for ``user_cfg_dict`` and return it as a string."""
+        # Validate the user configuration (instantiating the config class is the check)
+        cls.setup_user_cfg_cls()
+        cls.user_cfg_cls(**user_cfg_dict)
+
+        # Training configuration stored in the checkpoint; load on CPU so this
+        # works regardless of the device the checkpoint was saved from.
+        train_cfg = torch.load(user_cfg_dict["cpt_path"], map_location="cpu")["cfg"]
+
+        # Dict for code rendering
+        render_cfg = deepcopy(user_cfg_dict)
+        model_name = train_cfg["model_name"]
+        model_code = NodeModelFactory.get_source_code(model_name)
+        render_cfg["model_code"] = model_code
+        render_cfg["model_class_name"] = NodeModelFactory.get_model_class_name(model_name)
+        render_cfg.update(DataFactory.get_generated_code_dict(user_cfg_dict["data"]["name"]))
+
+        # Dict for defining cfg in the rendered code
+        generated_user_cfg = deepcopy(user_cfg_dict)
+        generated_user_cfg["data"].pop("name")
+        generated_user_cfg.pop("pipeline_name")
+        generated_user_cfg.pop("pipeline_mode")
+        # model arch configuration
+        generated_user_cfg["model"] = train_cfg["model"]
+
+        render_cfg["user_cfg_str"] = f"cfg = {str(generated_user_cfg)}"
+        render_cfg["user_cfg"] = user_cfg_dict
+
+        file_current_dir = Path(__file__).resolve().parent
+        with open(file_current_dir / "nodepred.jinja-py", "r") as f:
+            template = Template(f.read())
+
+        return template.render(**render_cfg)
+
+    @staticmethod
+    def get_description() -> str:
+        return "Node classification pipeline for inference"
diff --git a/dglgo/dglgo/apply_pipeline/nodepred/nodepred.jinja-py b/dglgo/dglgo/apply_pipeline/nodepred/nodepred.jinja-py
@@ -0,0 +1,67 @@
+import torch
+import dgl
+import os
+import csv
+
+from dgl.data import AsNodePredDataset
+{{ data_import_code }}
+
+{{ model_code }}
+
+def infer(device, data, model):
+    """Run one full-graph forward pass in eval mode and return the logits.
+
+    Only the first graph of ``data`` is used. Its self-loops are removed and
+    re-added before the graph and the model are moved to ``device``.
+    """
+    # Only infer on the first graph
+    graph = dgl.add_self_loop(dgl.remove_self_loop(data[0])).to(device)
+
+    # Either feature may be missing, in which case None is passed to the model
+    feats_n = graph.ndata.get('feat', None)
+    feats_e = graph.edata.get('feat', None)
+
+    net = model.to(device)
+    net.eval()
+
+    with torch.no_grad():
+        return net(graph, feats_n, feats_e)
+
+def main():
+    """Load the checkpointed model, predict a label per node and dump them to CSV."""
+    {{ user_cfg_str }}
+
+    # Fall back to CPU when CUDA is not available on this machine
+    device = cfg['device']
+    if not torch.cuda.is_available():
+        device = 'cpu'
+
+    # load data
+    data = AsNodePredDataset({{data_initialize_code}})
+    # validation: the inference data must be compatible with the trained model
+    if cfg['model']['embed_size'] > 0:
+        model_num_nodes = cfg['model']['data_info']['num_nodes']
+        data_num_nodes = data[0].num_nodes()
+        assert model_num_nodes == data_num_nodes, \
+            'Training and inference need to be on the same dataset when node embeddings were learned from scratch'
+    else:
+        model_in_size = cfg['model']['data_info']['in_size']
+        data_in_size = data[0].ndata['feat'].shape[1]
+        # Adjacent literals instead of a backslash inside the string, which
+        # would embed the source indentation in the message.
+        assert model_in_size == data_in_size, (
+            'Expect the training data and inference data to have the same number of input node '
+            'features, got {:d} and {:d}'.format(model_in_size, data_in_size))
+
+    model = {{ model_class_name }}(**cfg['model'])
+    model.load_state_dict(torch.load(cfg['cpt_path'], map_location='cpu')['model'])
+    logits = infer(device, data, model)
+    pred = logits.argmax(dim=1).cpu()
+
+    # Dump the results
+    # exist_ok=True: re-running must not fail because the directory exists
+    os.makedirs(cfg['general_pipeline']["save_path"], exist_ok=True)
+    file_path = os.path.join(cfg['general_pipeline']["save_path"], 'output.csv')
+    # newline='' is what the csv module expects; otherwise extra blank rows
+    # appear on platforms that translate line endings.
+    with open(file_path, 'w', newline='') as f:
+        writer = csv.writer(f)
+        writer.writerow(['node id', 'predicted label'])
+        writer.writerows(
+            [node_id, label] for node_id, label in enumerate(pred.tolist())
+        )
+    print('Saved inference results to {}'.format(file_path))
+
+# Run inference only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()
diff --git a/dglgo/dglgo/apply_pipeline/nodepred_sample/__init__.py b/dglgo/dglgo/apply_pipeline/nodepred_sample/__init__.py
@@ -0,0 +1 @@
+from .gen import *
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from .nodepred import ApplyNodepredPipeline
		from .nodepred_sample import ApplyNodepredNsPipeline