[DGL-Go] Inference for Graph Prediction Pipeline (dmlc#4157)
* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update
mufeili authored Jun 28, 2022
1 parent a5d8460 commit 150e927
Showing 8 changed files with 211 additions and 10 deletions.
9 changes: 4 additions & 5 deletions dglgo/README.md
@@ -70,8 +70,8 @@ dgl configure nodepred --data cora --model sage --cfg cora_sage.yaml
```

Note that `nodepred` is the name of DGL-Go *pipeline*. For now, you can think of
-pipeline as training task: `nodepred` is for node prediction task; other
-options include `linkpred` for link prediction task, etc. The command will
+pipeline as training task: `nodepred` is for the node multiclass classification task; other
+options include `linkpred` for the link prediction task, `graphpred` for graph binary classification, etc. The command will
generate a configuration file `cora_sage.yaml` which includes:

* Options for the selected dataset (i.e., `cora` here).
@@ -86,9 +86,8 @@ The snippet below shows the configuration file generated by the command above.

```yaml
version: 0.0.1
-pipeline:
-  name: nodepred
-  mode: train
+pipeline_name: nodepred
+pipeline_mode: train
device: cpu
data:
  name: cora
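The YAML change above flattens the old nested `pipeline:` block into top-level `pipeline_name` / `pipeline_mode` keys. As a quick illustration only (not part of this commit), the generated file can be read back with `ruamel.yaml`, the same loader the pipeline code uses:

```python
# Illustration only: load the config generated by `dgl configure nodepred ...`
# and check the flat keys introduced in this commit.
import ruamel.yaml

yaml = ruamel.yaml.YAML()
with open("cora_sage.yaml") as f:   # file name taken from the README example above
    cfg = yaml.load(f)

assert cfg["pipeline_name"] == "nodepred"
assert cfg["pipeline_mode"] == "train"
print(cfg["data"]["name"])  # cora
```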
1 change: 1 addition & 0 deletions dglgo/dglgo/apply_pipeline/__init__.py
@@ -1,2 +1,3 @@
from .nodepred import ApplyNodepredPipeline
from .nodepred_sample import ApplyNodepredNsPipeline
+from .graphpred import ApplyGraphpredPipeline
1 change: 1 addition & 0 deletions dglgo/dglgo/apply_pipeline/graphpred/__init__.py
@@ -0,0 +1 @@
from .gen import *
124 changes: 124 additions & 0 deletions dglgo/dglgo/apply_pipeline/graphpred/gen.py
@@ -0,0 +1,124 @@
import ruamel.yaml
import torch
import typer

from copy import deepcopy
from jinja2 import Template
from pathlib import Path
from pydantic import BaseModel, Field
from typing import Optional

from ...utils.factory import ApplyPipelineFactory, PipelineBase, DataFactory, GraphModelFactory
from ...utils.yaml_dump import deep_convert_dict, merge_comment

pipeline_comments = {
    "batch_size": "Graph batch size",
    "num_workers": "Number of workers for data loading",
    "save_path": "Directory to save the inference results"
}

class ApplyGraphpredPipelineCfg(BaseModel):
    batch_size: int = 32
    num_workers: int = 4
    save_path: str = "apply_results"

@ApplyPipelineFactory.register("graphpred")
class ApplyGraphpredPipeline(PipelineBase):
    def __init__(self):
        self.pipeline = {
            "name": "graphpred",
            "mode": "apply"
        }

    @classmethod
    def setup_user_cfg_cls(cls):
        from ...utils.enter_config import UserConfig
        class ApplyGraphPredUserConfig(UserConfig):
            data: DataFactory.filter("graphpred").get_pydantic_config() = Field(..., discriminator="name")
            general_pipeline: ApplyGraphpredPipelineCfg = ApplyGraphpredPipelineCfg()

        cls.user_cfg_cls = ApplyGraphPredUserConfig

    @property
    def user_cfg_cls(self):
        return self.__class__.user_cfg_cls

    def get_cfg_func(self):
        def config(
            data: DataFactory.filter("graphpred").get_dataset_enum() = typer.Option(None, help="input data name"),
            cfg: Optional[str] = typer.Option(None, help="output configuration file path"),
            cpt: str = typer.Option(..., help="input checkpoint file path")
        ):
            # Training configuration
            train_cfg = torch.load(cpt)["cfg"]
            if data is None:
                print("data is not specified, use the training dataset")
                data = train_cfg["data_name"]
            else:
                data = data.name
            if cfg is None:
                cfg = "_".join(["apply", "graphpred", data, train_cfg["model_name"]]) + ".yaml"

            self.__class__.setup_user_cfg_cls()
            generated_cfg = {
                "pipeline_name": self.pipeline["name"],
                "pipeline_mode": self.pipeline["mode"],
                "device": train_cfg["device"],
                "data": {"name": data},
                "cpt_path": cpt,
                "general_pipeline": {"batch_size": train_cfg["general_pipeline"]["eval_batch_size"],
                                     "num_workers": train_cfg["general_pipeline"]["num_workers"]}
            }
            output_cfg = self.user_cfg_cls(**generated_cfg).dict()
            output_cfg = deep_convert_dict(output_cfg)
            # Not applicable for inference
            output_cfg['data'].pop('split_ratio')
            comment_dict = {
                "device": "Torch device name, e.g., cpu or cuda or cuda:0",
                "cpt_path": "Path to the checkpoint file",
                "general_pipeline": pipeline_comments
            }
            comment_dict = merge_comment(output_cfg, comment_dict)

            yaml = ruamel.yaml.YAML()
            yaml.dump(comment_dict, Path(cfg).open("w"))
            print("Configuration file is generated at {}".format(Path(cfg).absolute()))

        return config

    @classmethod
    def gen_script(cls, user_cfg_dict):
        # Check validation
        cls.setup_user_cfg_cls()
        cls.user_cfg_cls(**user_cfg_dict)

        # Training configuration
        train_cfg = torch.load(user_cfg_dict["cpt_path"])["cfg"]

        # Dict for code rendering
        render_cfg = deepcopy(user_cfg_dict)
        model_name = train_cfg["model_name"]
        model_code = GraphModelFactory.get_source_code(model_name)
        render_cfg["model_code"] = model_code
        render_cfg["model_class_name"] = GraphModelFactory.get_model_class_name(model_name)
        render_cfg.update(DataFactory.get_generated_code_dict(user_cfg_dict["data"]["name"]))

        # Dict for defining cfg in the rendered code
        generated_user_cfg = deepcopy(user_cfg_dict)
        generated_user_cfg.pop("pipeline_name")
        generated_user_cfg.pop("pipeline_mode")
        # model arch configuration
        generated_user_cfg["model"] = train_cfg["model"]

        render_cfg["user_cfg_str"] = f"cfg = {str(generated_user_cfg)}"
        render_cfg["user_cfg"] = user_cfg_dict

        file_current_dir = Path(__file__).resolve().parent
        with open(file_current_dir / "graphpred.jinja-py", "r") as f:
            template = Template(f.read())

        return template.render(**render_cfg)

    @staticmethod
    def get_description() -> str:
        return "Graph classification pipeline for inference on binary classification"
77 changes: 77 additions & 0 deletions dglgo/dglgo/apply_pipeline/graphpred/graphpred.jinja-py
@@ -0,0 +1,77 @@
import torch
import os
import csv

from tqdm import tqdm
from dgl.data import AsGraphPredDataset
from dgl.dataloading import GraphDataLoader
{{ data_import_code }}

{{ model_code }}

def infer(device, loader, model):
    model = model.to(device)
    model.eval()
    all_pred = []

    with torch.no_grad():
        for _, (g, labels) in enumerate(tqdm(loader, desc="Iteration")):
            g = g.to(device)
            node_feat = g.ndata['feat']
            edge_feat = g.edata['feat']
            pred = model(g, node_feat, edge_feat)
            pred = (pred.sigmoid() >= 0.5).long()
            all_pred.append(pred)

    return torch.cat(all_pred, dim=0)

def main():
    {{ user_cfg_str }}

    device = cfg['device']
    if not torch.cuda.is_available():
        device = 'cpu'
    pipeline_cfg = cfg['general_pipeline']

    # load data
    data = AsGraphPredDataset({{data_initialize_code}})
    data_loader = GraphDataLoader(data, batch_size=pipeline_cfg['batch_size'],
                                  num_workers=pipeline_cfg['num_workers'], shuffle=False)

    # validation
    train_data_name = cfg['model']['data_info']['name']
    infer_data_name = cfg['data']['name']
    if train_data_name.startswith('ogbg-mol'):
        assert infer_data_name.startswith('ogbg-mol'), 'Expect the inference data name to start \
            with ogbg-mol, got {}'.format(infer_data_name)
    else:
        assert train_data_name == infer_data_name, 'Expect the training and inference data to \
            have the same name, got {} and {}'.format(train_data_name, infer_data_name)
    model_node_feat_size = cfg['model']['data_info']['node_feat_size']
    model_edge_feat_size = cfg['model']['data_info']['edge_feat_size']
    data_node_feat_size = data.node_feat_size
    data_edge_feat_size = data.edge_feat_size
    assert model_node_feat_size == data_node_feat_size, 'Expect the training data and inference \
        data to have the same number of input node features, got {:d} and {:d}'.format(model_node_feat_size, data_node_feat_size)
    assert model_edge_feat_size == data_edge_feat_size, 'Expect the training data and inference \
        data to have the same number of input edge features, got {:d} and {:d}'.format(model_edge_feat_size, data_edge_feat_size)

    model = {{ model_class_name }}(**cfg['model'])
    model.load_state_dict(torch.load(cfg['cpt_path'], map_location='cpu')['model'])
    pred = infer(device, data_loader, model).detach().cpu()

    # Dump the results
    os.makedirs(cfg['general_pipeline']["save_path"])
    file_path = os.path.join(cfg['general_pipeline']["save_path"], 'output.csv')
    header = ['graph id']
    header.extend(['task_{:d}'.format(i) for i in range(cfg['model']['data_info']['out_size'])])
    with open(file_path, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows([
            [i] + pred[i].tolist() for i in range(len(pred))
        ])
    print('Saved inference results to {}'.format(file_path))

if __name__ == '__main__':
    main()
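To make the label decoding in `infer()` above concrete: the model outputs one logit per task, and `(pred.sigmoid() >= 0.5).long()` turns each logit into a 0/1 label. A standalone toy example (the 2-graph, 2-task shape is made up for illustration; the real task count comes from `out_size` in the config):

```python
import torch

# Made-up logits for 2 graphs x 2 binary tasks.
logits = torch.tensor([[ 1.2, -0.3],
                       [-2.0,  0.0]])
labels = (logits.sigmoid() >= 0.5).long()
print(labels)
# tensor([[1, 0],
#         [0, 1]])  <- sigmoid(0.0) == 0.5, so a zero logit counts as positive
```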
2 changes: 1 addition & 1 deletion dglgo/dglgo/pipeline/graphpred/gen.py
@@ -125,4 +125,4 @@ def gen_script(cls, user_cfg_dict):

    @staticmethod
    def get_description() -> str:
-        return "Graph property prediction pipeline"
+        return "Graph property prediction pipeline on binary classification"
2 changes: 1 addition & 1 deletion dglgo/dglgo/pipeline/graphpred/graphpred.jinja-py
@@ -128,7 +128,7 @@ if __name__ == '__main__':
    }
    if cfg["model_name"] == 'pna':
        in_deg = torch.cat([g.in_degrees() for (g, _) in data[data.train_idx]])
-        cfg["model"]["data_info"]["delta"] = torch.mean(torch.log(in_deg + 1))
+        cfg["model"]["data_info"]["delta"] = torch.mean(torch.log(in_deg + 1)).item()

    os.makedirs(cfg['general_pipeline']["save_path"])

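The `.item()` added above converts the PNA `delta` statistic from a 0-dim tensor to a plain Python float. This matters because the training `cfg` is stored in the checkpoint and later embedded into the generated inference script via `str(generated_user_cfg)` in `gen_script` above, where a tensor would not survive as a valid literal. A small illustration with made-up in-degrees:

```python
import torch

in_deg = torch.tensor([2., 3., 5.])           # made-up in-degrees
delta = torch.mean(torch.log(in_deg + 1))

print(repr(delta))         # tensor(1.4256) -- would be embedded verbatim, not a valid literal
print(repr(delta.item()))  # ~1.4256 as a plain Python float, safe to embed and serialize
```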
5 changes: 2 additions & 3 deletions dglgo/tests/cfg.yml
@@ -1,7 +1,6 @@
version: 0.0.1
-pipeline:
-  name: nodepred
-  mode: train
+pipeline_name: nodepred
+pipeline_mode: train
device: cpu
data:
  name: cora
