Fix input data source (dmlc#1612)

Co-authored-by: Ubuntu <[email protected]> Co-authored-by: Jinjing Zhou <[email protected]>
teju85 · Jun 10, 2020 · 8531ee6 · 8531ee6
1 parent 484bbcc
commit 8531ee6
Show file tree

Hide file tree

Showing 3 changed files with 59 additions and 35 deletions.
diff --git a/examples/pytorch/deepwalk/README.md b/examples/pytorch/deepwalk/README.md
@@ -12,27 +12,30 @@ The implementation includes multi-processing training with CPU and mixed trainin
 - PyTorch 1.5.0
 - DGL 0.4.3
 
-## How to run the code
 
-Format of a network file:
+## Input data
+Currently, we support two built-in datasets: youtube and blog. Use --data\_file youtube to select the youtube dataset and --data\_file blog to select the blog dataset.
+The data is available at https://data.dgl.ai/dataset/DeepWalk/youtube.zip and https://data.dgl.ai/dataset/DeepWalk/blog.zip
+The youtube.zip archive contains youtube-net.txt, youtube-vocab.txt and youtube-label.txt; the blog.zip archive contains blog-net.txt, blog-vocab.txt and blog-label.txt.
+
+For other datasets, pass the full path of the network file to the trainer through --data\_file. The network file should follow this format:
 ```
 1(node id) 2(node id)
 1 3
+1 4
+2 4
 ...
 ```
 
+## How to run the code
 To run the code:
 ```
-python3 deepwalk.py --net_file net.txt --emb_file emb.txt --adam --mix --lr 0.2 --num_procs 4 --batch_size 100 --negative 5
+python3 deepwalk.py --data_file youtube --output_emb_file emb.txt --adam --mix --lr 0.2 --gpus 0 1 2 3 --batch_size 100 --negative 5
 ```
 
 ## How to save the embedding
-
-Functions:
-```
-SkipGramModel.save_embedding(dataset, file_name)
-SkipGramModel.save_embedding_txt(dataset, file_name)
-```
+By default, the trained embedding is saved to the file given by --output\_emb\_file FILE\_NAME as a numpy object.
+To save the trained embedding in raw text (txt) format instead, pass the --save\_in\_txt argument.
 
 ## Evaluation
 
@@ -60,4 +63,4 @@ Parameters.
 Speeding-up with mixed CPU & multi-GPU. The used parameters are the same as above.
 |  #GPUs   |   1   |   2   |   4   |
 |----------|-------|-------|-------|
-| Time (s) |1419.64| 952.04|428.89 |
+| Time (s) |1419.64| 952.04|428.89 |
diff --git a/examples/pytorch/deepwalk/deepwalk.py b/examples/pytorch/deepwalk/deepwalk.py
@@ -17,14 +17,14 @@ def __init__(self, args):
         """ Initializing the trainer with the input arguments """
         self.args = args
         self.dataset = DeepwalkDataset(
-            net_file=args.net_file,
+            net_file=args.data_file,
             map_file=args.map_file,
             walk_length=args.walk_length,
             window_size=args.window_size,
             num_walks=args.num_walks,
             batch_size=args.batch_size,
             negative=args.negative,
-            num_procs=args.num_procs,
+            gpus=args.gpus,
             fast_neg=args.fast_neg,
             )
         self.emb_size = len(self.dataset.net)
@@ -36,7 +36,6 @@ def init_device_emb(self):
         """
         choices = sum([self.args.only_gpu, self.args.only_cpu, self.args.mix])
         assert choices == 1, "Must choose only *one* training mode in [only_cpu, only_gpu, mix]"
-        assert self.args.num_procs >= 1, "The number of process must be larger than 1"
         choices = sum([self.args.sgd, self.args.adam, self.args.avg_sgd])
         assert choices == 1, "Must choose only *one* gradient descent strategy in [sgd, avg_sgd, adam]"
 
@@ -63,17 +62,21 @@ def init_device_emb(self):
         torch.set_num_threads(self.args.num_threads)
         if self.args.only_gpu:
             print("Run in 1 GPU")
-            self.emb_model.all_to_device(0)
+            assert self.args.gpus[0] >= 0
+            self.emb_model.all_to_device(self.args.gpus[0])
         elif self.args.mix:
-            print("Mix CPU with %d GPU" % self.args.num_procs)
-            if self.args.num_procs == 1:
-                self.emb_model.set_device(0)
+            print("Mix CPU with %d GPU" % len(self.args.gpus))
+            if len(self.args.gpus) == 1:
+                assert self.args.gpus[0] >= 0, 'mix CPU with GPU should have abaliable GPU'
+                self.emb_model.set_device(self.args.gpus[0])
         else:
-            print("Run in %d CPU process" % self.args.num_procs)
+            print("Run in CPU process")
+            self.args.gpus = [torch.device('cpu')]
+
 
     def train(self):
         """ train the embedding """
-        if self.args.num_procs > 1:
+        if len(self.args.gpus) > 1:
             self.fast_train_mp()
         else:
             self.fast_train()
@@ -86,17 +89,19 @@ def fast_train_mp(self):
         start_all = time.time()
         ps = []
 
-        np_ = self.args.num_procs
-        for i in range(np_):
-            p = mp.Process(target=self.fast_train_sp, args=(i,))
+        for i in range(len(self.args.gpus)):
+            p = mp.Process(target=self.fast_train_sp, args=(self.args.gpus[i],))
             ps.append(p)
             p.start()
 
         for p in ps:
             p.join()
 
         print("Used time: %.2fs" % (time.time()-start_all))
-        self.emb_model.save_embedding(self.dataset, self.args.emb_file)
+        if self.args.save_in_txt:
+            self.emb_model.save_embedding_txt(self.dataset, self.args.output_emb_file)
+        else:
+            self.emb_model.save_embedding(self.dataset, self.args.output_emb_file)
 
     @thread_wrapped_func
     def fast_train_sp(self, gpu_id):
@@ -198,14 +203,19 @@ def fast_train(self):
                         start = time.time()
 
         print("Training used time: %.2fs" % (time.time()-start_all))
-        self.emb_model.save_embedding(self.dataset, self.args.emb_file)
+        if self.args.save_in_txt:
+            self.emb_model.save_embedding_txt(self.dataset, self.args.output_emb_file)
+        else:
+            self.emb_model.save_embedding(self.dataset, self.args.output_emb_file)
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="DeepWalk")
-    parser.add_argument('--net_file', type=str, 
-            help="path of the txt network file")
-    parser.add_argument('--emb_file', type=str, default="emb.npy",
-            help='path of the npy embedding file')
+    parser.add_argument('--data_file', type=str, 
+            help="path of the txt network file, builtin dataset include youtube-net and blog-net") 
+    parser.add_argument('--save_in_txt', default=False, action="store_true",
+            help='Whether save dat in txt format or npy')
+    parser.add_argument('--output_emb_file', type=str, default="emb.npy",
+            help='path of the output npy embedding file')
     parser.add_argument('--map_file', type=str, default="nodeid_to_index.pickle",
             help='path of the mapping dict that maps node ids to embedding index')
     parser.add_argument('--dim', default=128, type=int, 
@@ -246,11 +256,11 @@ def fast_train(self):
             help="average gradients of sgd for embedding updation")
     parser.add_argument('--num_threads', default=2, type=int, 
             help="number of threads used for each CPU-core/GPU")
-    parser.add_argument('--num_procs', default=1, type=int, 
-            help="number of GPUs/CPUs when mixed training")
+    parser.add_argument('--gpus', type=int, default=[-1], nargs='+', 
+            help='a list of active gpu ids, e.g. 0')
     args = parser.parse_args()
 
     start_time = time.time()
     trainer = DeepwalkTrainer(args)
     trainer.train()
-    print("Total used time: %.2f" % (time.time() - start_time))
+    print("Total used time: %.2f" % (time.time() - start_time))
diff --git a/examples/pytorch/deepwalk/reading_data.py b/examples/pytorch/deepwalk/reading_data.py
@@ -1,13 +1,15 @@
+import os
 import numpy as np
 import scipy.sparse as sp
 import pickle
 import torch
 from torch.utils.data import DataLoader
+from dgl.data.utils import download, _get_dgl_url, get_download_dir, extract_archive
 import random
 import time
 import dgl
 from utils import shuffle_walks
-np.random.seed(3141592653)
+#np.random.seed(3141592653)
 
 def ReadTxtNet(file_path="", undirected=True):
     """ Read the txt network file. 
@@ -24,6 +26,15 @@ def ReadTxtNet(file_path="", undirected=True):
     node2id dict : a dict mapping the nodes to their embedding indices 
     id2node dict : a dict mapping nodes embedding indices to the nodes
     """
+    if file_path == 'youtube' or file_path == 'blog':
+        name = file_path
+        dir = get_download_dir()
+        zip_file_path='{}/{}.zip'.format(dir, name)
+        download(_get_dgl_url(os.path.join('dataset/DeepWalk/', '{}.zip'.format(file_path))), path=zip_file_path)
+        extract_archive(zip_file_path,
+                        '{}/{}'.format(dir, name))
+        file_path = "{}/{}/{}-net.txt".format(dir, name, name)
+
     node2id = {}
     id2node = {}
     cid = 0
@@ -97,7 +108,7 @@ def __init__(self,
             num_walks=10,
             batch_size=32,
             negative=5,
-            num_procs=4,
+            gpus=[0],
             fast_neg=True,
             ):
         """ This class has the following functions:
@@ -121,7 +132,7 @@ def __init__(self,
         self.num_walks = num_walks
         self.batch_size = batch_size
         self.negative = negative
-        self.num_procs = num_procs
+        self.num_procs = len(gpus)
         self.fast_neg = fast_neg
         self.net, self.node2id, self.id2node, self.sm = ReadTxtNet(net_file)
         self.save_mapping(map_file)
@@ -175,4 +186,4 @@ def __init__(self, G, seeds, walk_length):
     def sample(self, seeds):
         walks = dgl.contrib.sampling.random_walk(self.G, seeds, 
             1, self.walk_length-1)
-        return walks
+        return walks