Skip to content

Commit

Permalink
Fix input data source (dmlc#1612)
Browse files Browse the repository at this point in the history
Co-authored-by: Ubuntu <[email protected]>
Co-authored-by: Jinjing Zhou <[email protected]>
  • Loading branch information
3 people authored Jun 10, 2020
1 parent 484bbcc commit 8531ee6
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 35 deletions.
23 changes: 13 additions & 10 deletions examples/pytorch/deepwalk/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,27 +12,30 @@ The implementation includes multi-processing training with CPU and mixed trainin
- PyTorch 1.5.0
- DGL 0.4.3

## How to run the code

Format of a network file:
## Input data
Currently, we support two built-in datasets: youtube and blog. Use --data\_file youtube to select the youtube dataset and --data\_file blog to select the blog dataset.
The data is available at https://data.dgl.ai/dataset/DeepWalk/youtube.zip and https://data.dgl.ai/dataset/DeepWalk/blog.zip.
The youtube.zip archive includes youtube-net.txt, youtube-vocab.txt and youtube-label.txt; the blog.zip archive includes blog-net.txt, blog-vocab.txt and blog-label.txt.

For other datasets, please pass the full path of the network file to the trainer through --data\_file. The network file should have the following format:
```
1(node id) 2(node id)
1 3
1 4
2 4
...
```

## How to run the code
To run the code:
```
python3 deepwalk.py --net_file net.txt --emb_file emb.txt --adam --mix --lr 0.2 --num_procs 4 --batch_size 100 --negative 5
python3 deepwalk.py --data_file youtube --output_emb_file emb.txt --adam --mix --lr 0.2 --gpus 0 1 2 3 --batch_size 100 --negative 5
```

## How to save the embedding

Functions:
```
SkipGramModel.save_embedding(dataset, file_name)
SkipGramModel.save_embedding_txt(dataset, file_name)
```
By default, the trained embedding is saved to the file given by --output\_emb\_file FILE\_NAME as a numpy object.
To save the trained embedding in raw format (txt format), please use the --save\_in\_txt argument.

## Evaluation

Expand Down Expand Up @@ -60,4 +63,4 @@ Parameters.
Speeding-up with mixed CPU & multi-GPU. The used parameters are the same as above.
| #GPUs | 1 | 2 | 4 |
|----------|-------|-------|-------|
| Time (s) |1419.64| 952.04|428.89 |
| Time (s) |1419.64| 952.04|428.89 |
52 changes: 31 additions & 21 deletions examples/pytorch/deepwalk/deepwalk.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ def __init__(self, args):
""" Initializing the trainer with the input arguments """
self.args = args
self.dataset = DeepwalkDataset(
net_file=args.net_file,
net_file=args.data_file,
map_file=args.map_file,
walk_length=args.walk_length,
window_size=args.window_size,
num_walks=args.num_walks,
batch_size=args.batch_size,
negative=args.negative,
num_procs=args.num_procs,
gpus=args.gpus,
fast_neg=args.fast_neg,
)
self.emb_size = len(self.dataset.net)
Expand All @@ -36,7 +36,6 @@ def init_device_emb(self):
"""
choices = sum([self.args.only_gpu, self.args.only_cpu, self.args.mix])
assert choices == 1, "Must choose only *one* training mode in [only_cpu, only_gpu, mix]"
assert self.args.num_procs >= 1, "The number of process must be larger than 1"
choices = sum([self.args.sgd, self.args.adam, self.args.avg_sgd])
assert choices == 1, "Must choose only *one* gradient descent strategy in [sgd, avg_sgd, adam]"

Expand All @@ -63,17 +62,21 @@ def init_device_emb(self):
torch.set_num_threads(self.args.num_threads)
if self.args.only_gpu:
print("Run in 1 GPU")
self.emb_model.all_to_device(0)
assert self.args.gpus[0] >= 0
self.emb_model.all_to_device(self.args.gpus[0])
elif self.args.mix:
print("Mix CPU with %d GPU" % self.args.num_procs)
if self.args.num_procs == 1:
self.emb_model.set_device(0)
print("Mix CPU with %d GPU" % len(self.args.gpus))
if len(self.args.gpus) == 1:
assert self.args.gpus[0] >= 0, 'mix CPU with GPU should have abaliable GPU'
self.emb_model.set_device(self.args.gpus[0])
else:
print("Run in %d CPU process" % self.args.num_procs)
print("Run in CPU process")
self.args.gpus = [torch.device('cpu')]


def train(self):
""" train the embedding """
if self.args.num_procs > 1:
if len(self.args.gpus) > 1:
self.fast_train_mp()
else:
self.fast_train()
Expand All @@ -86,17 +89,19 @@ def fast_train_mp(self):
start_all = time.time()
ps = []

np_ = self.args.num_procs
for i in range(np_):
p = mp.Process(target=self.fast_train_sp, args=(i,))
for i in range(len(self.args.gpus)):
p = mp.Process(target=self.fast_train_sp, args=(self.args.gpus[i],))
ps.append(p)
p.start()

for p in ps:
p.join()

print("Used time: %.2fs" % (time.time()-start_all))
self.emb_model.save_embedding(self.dataset, self.args.emb_file)
if self.args.save_in_txt:
self.emb_model.save_embedding_txt(self.dataset, self.args.output_emb_file)
else:
self.emb_model.save_embedding(self.dataset, self.args.output_emb_file)

@thread_wrapped_func
def fast_train_sp(self, gpu_id):
Expand Down Expand Up @@ -198,14 +203,19 @@ def fast_train(self):
start = time.time()

print("Training used time: %.2fs" % (time.time()-start_all))
self.emb_model.save_embedding(self.dataset, self.args.emb_file)
if self.args.save_in_txt:
self.emb_model.save_embedding_txt(self.dataset, self.args.output_emb_file)
else:
self.emb_model.save_embedding(self.dataset, self.args.output_emb_file)

if __name__ == '__main__':
parser = argparse.ArgumentParser(description="DeepWalk")
parser.add_argument('--net_file', type=str,
help="path of the txt network file")
parser.add_argument('--emb_file', type=str, default="emb.npy",
help='path of the npy embedding file')
parser.add_argument('--data_file', type=str,
help="path of the txt network file, builtin dataset include youtube-net and blog-net")
parser.add_argument('--save_in_txt', default=False, action="store_true",
help='Whether save dat in txt format or npy')
parser.add_argument('--output_emb_file', type=str, default="emb.npy",
help='path of the output npy embedding file')
parser.add_argument('--map_file', type=str, default="nodeid_to_index.pickle",
help='path of the mapping dict that maps node ids to embedding index')
parser.add_argument('--dim', default=128, type=int,
Expand Down Expand Up @@ -246,11 +256,11 @@ def fast_train(self):
help="average gradients of sgd for embedding updation")
parser.add_argument('--num_threads', default=2, type=int,
help="number of threads used for each CPU-core/GPU")
parser.add_argument('--num_procs', default=1, type=int,
help="number of GPUs/CPUs when mixed training")
parser.add_argument('--gpus', type=int, default=[-1], nargs='+',
help='a list of active gpu ids, e.g. 0')
args = parser.parse_args()

start_time = time.time()
trainer = DeepwalkTrainer(args)
trainer.train()
print("Total used time: %.2f" % (time.time() - start_time))
print("Total used time: %.2f" % (time.time() - start_time))
19 changes: 15 additions & 4 deletions examples/pytorch/deepwalk/reading_data.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import os
import numpy as np
import scipy.sparse as sp
import pickle
import torch
from torch.utils.data import DataLoader
from dgl.data.utils import download, _get_dgl_url, get_download_dir, extract_archive
import random
import time
import dgl
from utils import shuffle_walks
np.random.seed(3141592653)
#np.random.seed(3141592653)

def ReadTxtNet(file_path="", undirected=True):
""" Read the txt network file.
Expand All @@ -24,6 +26,15 @@ def ReadTxtNet(file_path="", undirected=True):
node2id dict : a dict mapping the nodes to their embedding indices
id2node dict : a dict mapping nodes embedding indices to the nodes
"""
if file_path == 'youtube' or file_path == 'blog':
name = file_path
dir = get_download_dir()
zip_file_path='{}/{}.zip'.format(dir, name)
download(_get_dgl_url(os.path.join('dataset/DeepWalk/', '{}.zip'.format(file_path))), path=zip_file_path)
extract_archive(zip_file_path,
'{}/{}'.format(dir, name))
file_path = "{}/{}/{}-net.txt".format(dir, name, name)

node2id = {}
id2node = {}
cid = 0
Expand Down Expand Up @@ -97,7 +108,7 @@ def __init__(self,
num_walks=10,
batch_size=32,
negative=5,
num_procs=4,
gpus=[0],
fast_neg=True,
):
""" This class has the following functions:
Expand All @@ -121,7 +132,7 @@ def __init__(self,
self.num_walks = num_walks
self.batch_size = batch_size
self.negative = negative
self.num_procs = num_procs
self.num_procs = len(gpus)
self.fast_neg = fast_neg
self.net, self.node2id, self.id2node, self.sm = ReadTxtNet(net_file)
self.save_mapping(map_file)
Expand Down Expand Up @@ -175,4 +186,4 @@ def __init__(self, G, seeds, walk_length):
def sample(self, seeds):
walks = dgl.contrib.sampling.random_walk(self.G, seeds,
1, self.walk_length-1)
return walks
return walks

0 comments on commit 8531ee6

Please sign in to comment.