Commit efa5a13

reduce the training time (d2l-ai#558)

cheungdaven authored and astonzhang committed Dec 9, 2019
1 parent 808e085 commit efa5a13
Showing 11 changed files with 466 additions and 459 deletions.
16 changes: 12 additions & 4 deletions chapter_recommender-systems/autorec.md
@@ -27,11 +27,12 @@ $$

where $\| \cdot \|_{\mathcal{O}}$ means that only the contribution of observed ratings is considered; that is, only weights associated with observed inputs are updated during backpropagation.
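Aside (illustrative, not part of this commit): the masked objective can be implemented by multiplying the squared error with a 0/1 mask derived from the rating matrix, so unobserved entries contribute neither loss nor gradient. A minimal sketch, assuming 0 marks an unobserved rating:

```python
# Minimal sketch of the masked L2 objective; all values are made up.
from mxnet import np, npx
npx.set_np()

R = np.array([[5, 0, 3], [0, 4, 0]])                  # 0 = unobserved rating
R_hat = np.array([[4.8, 1.2, 2.9], [0.3, 3.7, 1.1]])  # model reconstruction
mask = np.sign(R)                                     # 1 where observed, else 0
loss = (((R - R_hat) * mask) ** 2).sum()              # masked entries add nothing
```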

-```{.python .input n=1}
+```{.python .input n=2}
import d2l
from mxnet import autograd, init, gluon, np, npx
from mxnet.gluon import nn
import mxnet as mx
+import sys
npx.set_np()
```
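Aside (illustrative, not part of this commit; the chapter's own `AutoRec` class is elided from this diff): at its core, AutoRec is an encoder-decoder pair of dense layers that reconstructs a rating vector. A hedged sketch of the idea:

```python
# Sketch of an AutoRec-style autoencoder: encode a rating vector, then decode
# a reconstruction of it. Layer sizes and activation are assumptions here.
from mxnet.gluon import nn

class TinyAutoRec(nn.Block):
    def __init__(self, num_hidden, num_inputs, **kwargs):
        super().__init__(**kwargs)
        self.encoder = nn.Dense(num_hidden, activation='sigmoid')
        self.decoder = nn.Dense(num_inputs)

    def forward(self, x):
        return self.decoder(self.encoder(x))
```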

@@ -87,14 +88,17 @@ _, _, _, train_inter_mat = d2l.load_data_ml100k(train_data, num_users,
num_items)
_, _, _, test_inter_mat = d2l.load_data_ml100k(test_data, num_users,
num_items)
+num_workers = 0 if sys.platform.startswith("win") else 4
train_iter = gluon.data.DataLoader(train_inter_mat, shuffle=True,
last_batch="rollover", batch_size=128)
last_batch="rollover", batch_size=256,
num_workers=num_workers)
test_iter = gluon.data.DataLoader(np.array(train_inter_mat), shuffle=False,
last_batch="keep", batch_size=1024)
last_batch="keep", batch_size=1024,
num_workers=num_workers)
# Model initialization, training, and evaluation
net = AutoRec(500, num_users)
net.initialize(ctx=ctx, force_reinit=True, init=mx.init.Normal(0.01))
-lr, num_epochs, wd, optimizer = 0.001, 50, 1e-5, 'adam'
+lr, num_epochs, wd, optimizer = 0.002, 25, 1e-5, 'adam'
loss = gluon.loss.L2Loss()
trainer = gluon.Trainer(net.collect_params(), optimizer,
{"learning_rate": lr, 'wd': wd})
@@ -118,3 +122,7 @@ d2l.train_recsys_rating(net, train_iter, test_iter, loss, trainer, num_epochs,
## References

* Sedhain, Suvash, et al. "AutoRec: Autoencoders meet collaborative filtering." Proceedings of the 24th International Conference on World Wide Web. ACM, 2015.

+```{.python .input}
+```
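Aside: the `num_workers` guard this commit adds before each `DataLoader` is a common workaround for platforms where multiprocess data loading is problematic (notably Windows with MXNet); the pattern in isolation:

```python
# Use multiprocess data loading except on Windows, where workers must be 0.
import sys
num_workers = 0 if sys.platform.startswith("win") else 4
```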
14 changes: 9 additions & 5 deletions chapter_recommender-systems/deepfm.md
@@ -30,12 +30,13 @@ $$
where $\sigma$ is the sigmoid function. The architecture of DeepFM is illustrated below.
![Illustration of the DeepFM model](../img/rec-deepfm.svg)
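Aside (illustrative, not part of this commit): the prediction rule above simply sums the FM score and the MLP score before applying the sigmoid. A toy sketch with made-up scores:

```python
# Toy sketch of y_hat = sigmoid(y_FM + y_MLP); the inputs are made-up scores.
from mxnet import np, npx
npx.set_np()

y_fm = np.array([0.7])     # score from the FM component
y_mlp = np.array([-0.2])   # score from the MLP component
y_hat = npx.sigmoid(y_fm + y_mlp)
```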

-```{.python .input n=1}
+```{.python .input n=2}
import d2l
from mxnet import autograd, init, gluon, np, npx
from mxnet.gluon.data import Dataset
from mxnet.gluon import nn
import mxnet as mx
+import sys
npx.set_np()
```

@@ -72,24 +73,27 @@ class DeepFM(nn.Block):
## Training and Evaluating the Model
The data loading process is the same as that of FM. We set the MLP component of DeepFM to a three-layer dense network with a pyramid structure (30-20-10). All other hyperparameters remain the same as for FM.
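Aside (illustrative, not part of this commit): a pyramid MLP of that shape could be assembled in Gluon as below; the dropout rate and the final scalar layer are assumptions for illustration, not taken from the chapter's `DeepFM` class:

```python
# Sketch of a 30-20-10 pyramid MLP; the dropout rate is an assumed value.
from mxnet.gluon import nn

mlp = nn.Sequential()
for dim in (30, 20, 10):
    mlp.add(nn.Dense(dim, activation='relu'))
    mlp.add(nn.Dropout(0.1))
mlp.add(nn.Dense(1))  # scalar score fed into the sigmoid
```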

-```{.python .input n=3}
+```{.python .input n=4}
batch_size = 2048
d2l.read_data_ctr()
train_data = d2l.CTRDataset(data_path="../data/ctr/train.csv")
test_data = d2l.CTRDataset(data_path="../data/ctr/test.csv",
feat_mapper=train_data.feat_mapper,
defaults=train_data.defaults)
field_dims = train_data.field_dims
+num_workers = 0 if sys.platform.startswith("win") else 4
train_iter = gluon.data.DataLoader(train_data, shuffle=True,
last_batch="rollover",
-                                   batch_size=batch_size)
+                                   batch_size=batch_size,
+                                   num_workers=num_workers)
test_iter = gluon.data.DataLoader(test_data, shuffle=False,
last_batch="rollover",
-                                  batch_size=batch_size)
+                                  batch_size=batch_size,
+                                  num_workers=num_workers)
ctx = d2l.try_all_gpus()
net = DeepFM(field_dims, num_factors=10, mlp_dims=[30, 20, 10])
net.initialize(init.Xavier(), ctx=ctx)
-lr, num_epochs, optimizer = 0.01, 50, 'adam'
+lr, num_epochs, optimizer = 0.01, 30, 'adam'
trainer = gluon.Trainer(net.collect_params(), optimizer,
{'learning_rate': lr})
loss = gluon.loss.SigmoidBinaryCrossEntropyLoss()
12 changes: 8 additions & 4 deletions chapter_recommender-systems/fm.md
@@ -32,11 +32,12 @@ With this reformulation, the model complexity is decreased greatly. Moreover, f

To learn the FM model, we can use the MSE loss for regression tasks, the cross-entropy loss for classification tasks, and the BPR loss for ranking tasks. Standard optimizers such as SGD and Adam are viable for optimization.
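Aside (illustrative, not part of this commit): the reformulation referenced above computes all pairwise interactions cheaply via the square-of-sum minus sum-of-squares identity. A minimal sketch with assumed shapes:

```python
# Pairwise FM term: 0.5 * ((sum_i v_i)^2 - sum_i v_i^2), summed over factors.
from mxnet import np, npx
npx.set_np()

v = np.random.normal(size=(7, 4))       # 7 active features, k = 4 factors
square_of_sum = v.sum(axis=0) ** 2      # shape (4,)
sum_of_square = (v ** 2).sum(axis=0)    # shape (4,)
pairwise = 0.5 * (square_of_sum - sum_of_square).sum()
```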

-```{.python .input n=1}
+```{.python .input n=2}
import d2l
from mxnet import autograd, init, gluon, np, npx
from mxnet.gluon import nn
import mxnet as mx
+import sys
npx.set_np()
```

@@ -70,12 +71,15 @@ train_data = d2l.CTRDataset(data_path="../data/ctr/train.csv")
test_data = d2l.CTRDataset(data_path="../data/ctr/test.csv",
feat_mapper=train_data.feat_mapper,
defaults=train_data.defaults)
+num_workers = 0 if sys.platform.startswith("win") else 4
train_iter = gluon.data.DataLoader(train_data, shuffle=True,
last_batch="rollover",
-                                   batch_size=batch_size)
+                                   batch_size=batch_size,
+                                   num_workers=num_workers)
test_iter = gluon.data.DataLoader(test_data, shuffle=False,
last_batch="rollover",
-                                  batch_size=batch_size)
+                                  batch_size=batch_size,
+                                  num_workers=num_workers)
```

## Train the Model
@@ -85,7 +89,7 @@ Afterwards, we train the model. The learning rate is set to 0.01 and the embeddi
ctx = d2l.try_all_gpus()
net = FM(train_data.field_dims, num_factors=20)
net.initialize(init.Xavier(), ctx=ctx)
-lr, num_epochs, optimizer = 0.02, 50, 'adam'
+lr, num_epochs, optimizer = 0.02, 30, 'adam'
trainer = gluon.Trainer(net.collect_params(), optimizer,
{'learning_rate': lr})
loss = gluon.loss.SigmoidBinaryCrossEntropyLoss()
15 changes: 8 additions & 7 deletions chapter_recommender-systems/mf.md
@@ -9,14 +9,14 @@ a lot of attention to the field of recommender system research. Subsequently, th

Matrix factorization is a class of collaborative filtering models. Specifically, the model factorizes the user-item interaction matrix (e.g., rating matrix) into the product of two lower-rank matrices, capturing the low-rank structure of the user-item interactions.

-Let $\mathbf{R} \in \mathbb{R}^{m \times n}$ denote the interaction matrix with $m$ users and $n$ items, and the values of $\mathbf{R}$ represent explicit ratings. The user-item interaction will be factorized into a user latent matrix $\mathbf{P} \in \mathbb{R}^{m \times k}$ and an item latent matrix $\mathbf{Q} \in \mathbb{R}^{n \times k}$, where $k \ll m, n$, is the latent factor size. For a given item $i$, the elements of $\mathbf{Q}_i$ measure the extent to which the item possesses those characteristics such as the genres and languages of a movie. For a given user $u$, the elements of $\mathbf{P}_u$ measure the extent of interest the user has in items' corresponding characteristics. These latent factors might measure obvious dimensions as mentioned in those examples or are completely uninterpretable. The predicted ratings can be estimated by
+Let $\mathbf{R} \in \mathbb{R}^{m \times n}$ denote the interaction matrix with $m$ users and $n$ items, and the values of $\mathbf{R}$ represent explicit ratings. The user-item interaction will be factorized into a user latent matrix $\mathbf{P} \in \mathbb{R}^{m \times k}$ and an item latent matrix $\mathbf{Q} \in \mathbb{R}^{n \times k}$, where $k \ll m, n$, is the latent factor size. Let $\mathbf{p}_u$ denote the $u^\mathrm{th}$ row of $\mathbf{P}$ and $\mathbf{q}_i$ denote the $i^\mathrm{th}$ row of $\mathbf{Q}$. For a given item $i$, the elements of $\mathbf{q}_i$ measure the extent to which the item possesses those characteristics such as the genres and languages of a movie. For a given user $u$, the elements of $\mathbf{p}_u$ measure the extent of interest the user has in items' corresponding characteristics. These latent factors might measure obvious dimensions as mentioned in those examples or are completely uninterpretable. The predicted ratings can be estimated by

$$\hat{\mathbf{R}} = \mathbf{PQ}^\top$$

where $\hat{\mathbf{R}}\in \mathbb{R}^{m \times n}$ is the predicted rating matrix, which has the same shape as $\mathbf{R}$. One major problem of this prediction rule is that user/item biases cannot be modeled. For example, some users tend to give higher ratings, or some items always get lower ratings due to poor quality. Such biases are commonplace in real-world applications. To capture them, user-specific and item-specific bias terms are introduced. Specifically, the predicted rating that user $u$ gives to item $i$ is calculated by

$$
-\hat{\mathbf{R}}_{ui} = \mathbf{P}_u\mathbf{Q}^\top_i + b_u + b_i
+\hat{\mathbf{R}}_{ui} = \mathbf{p}_u\mathbf{q}^\top_i + b_u + b_i
$$
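Aside (illustrative, not part of this commit): a toy evaluation of this biased prediction rule, with all names and sizes assumed:

```python
# r_hat = p_u . q_i + b_u + b_i, on randomly initialized factors.
from mxnet import np, npx
npx.set_np()

m, n, k = 4, 5, 3
P = np.random.normal(scale=0.1, size=(m, k))  # user latent matrix
Q = np.random.normal(scale=0.1, size=(n, k))  # item latent matrix
b_user = np.zeros(m)                          # user bias terms
b_item = np.zeros(n)                          # item bias terms
u, i = 1, 2
r_hat = np.dot(P[u], Q[i]) + b_user[u] + b_item[i]
```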

Then, we train the matrix factorization model by minimizing the mean squared error between predicted rating scores and real rating scores. The objective function is defined as follows:
@@ -42,6 +42,7 @@ import d2l
from mxnet import autograd, init, gluon, np, npx
from mxnet.gluon import nn
import mxnet as mx
+import sys
npx.set_np()
```

@@ -102,7 +103,7 @@ def train_recsys_rating(net, train_iter, test_iter, loss, trainer, num_epochs,
ctx_list=d2l.try_all_gpus(), evaluator=None,
**kwargs):
num_batches, timer = len(train_iter), d2l.Timer()
-    animator = d2l.Animator(xlabel='epoch', xlim=[0, num_epochs], ylim=[0, 2],
+    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 2],
legend=['train loss','test RMSE'])
for epoch in range(num_epochs):
metric, l = d2l.Accumulator(3), 0.
@@ -135,15 +136,15 @@ def train_recsys_rating(net, train_iter, test_iter, loss, trainer, num_epochs,
% (metric[2] * num_epochs / timer.sum(), ctx_list))
```

-Finally, let us put all things together and train the model. Here, we set the latent factor dimension to 50.
+Finally, let us put all things together and train the model. Here, we set the latent factor dimension to 30.

```{.python .input n=5}
ctx = d2l.try_all_gpus()
num_users, num_items, train_iter, test_iter = d2l.split_and_load_ml100k(
-    test_ratio=0.1, batch_size=128)
-net = MF(50, num_users, num_items)
+    test_ratio=0.1, batch_size=512)
+net = MF(30, num_users, num_items)
net.initialize(ctx=ctx, force_reinit=True, init=mx.init.Normal(0.01))
-lr, num_epochs, wd, optimizer = 0.001, 25, 1e-5, 'adam'
+lr, num_epochs, wd, optimizer = 0.002, 20, 1e-5, 'adam'
loss = gluon.loss.L2Loss()
trainer = gluon.Trainer(net.collect_params(), optimizer,
{"learning_rate": lr, 'wd': wd})
23 changes: 13 additions & 10 deletions chapter_recommender-systems/neumf.md
@@ -36,13 +36,14 @@ The following figure illustrates the model architecture of NeuMF.

![Illustration of the NeuMF model](../img/rec-neumf.svg)

-```{.python .input n=1}
+```{.python .input n=2}
import d2l
from mxnet import autograd, init, gluon, np, npx
from mxnet.gluon import nn
import mxnet as mx
import math
import random
+import sys
npx.set_np()
```

@@ -160,9 +161,9 @@ The training function is defined below. We train the model in the pairwise manne
# Saved in the d2l package for later use
def train_ranking(net, train_iter, test_iter, loss, trainer, test_seq_iter,
num_users, num_items, num_epochs, ctx_list, evaluator,
-                  negative_sampler, candidates, eval_step=2):
+                  negative_sampler, candidates, eval_step=1):
num_batches, timer, hit_rate, auc = len(train_iter), d2l.Timer(), 0, 0
-    animator = d2l.Animator(xlabel='epoch', xlim=[0, num_epochs], ylim=[0, 1],
+    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 1],
legend=['test hit rate', 'test AUC'])
for epoch in range(num_epochs):
metric, l = d2l.Accumulator(3), 0.
@@ -182,7 +183,7 @@ def train_ranking(net, train_iter, test_iter, loss, trainer, test_seq_iter,
metric.add(l, values[0].shape[0], values[0].size)
timer.stop()
with autograd.predict_mode():
-            if (epoch + 1) % eval_step == 1:
+            if (epoch + 1) % eval_step == 0:
hit_rate, auc = evaluator(net, test_iter, test_seq_iter,
candidates, num_users, num_items,
ctx_list)
@@ -197,31 +198,33 @@ def train_ranking(net, train_iter, test_iter, loss, trainer, test_seq_iter,
Now, we can load the MovieLens 100k dataset and train the model. Since the MovieLens dataset contains only explicit ratings, we binarize them to zeros and ones, at some loss of accuracy. If a user rated an item, we treat the implicit feedback as one, and otherwise as zero; the act of rating an item can be regarded as a form of implicit feedback. Here, we split the dataset in the `seq-aware` mode, where each user's most recently interacted item is left out for testing.
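Aside (illustrative, not part of this commit): the binarization described here amounts to taking the sign of the rating entries:

```python
# Any observed rating becomes implicit feedback 1; missing entries stay 0.
from mxnet import np, npx
npx.set_np()

ratings = np.array([0, 3, 5, 0, 1])
implicit = np.sign(ratings)   # -> array([0., 1., 1., 0., 1.])
```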

```{.python .input n=7}
-batch_size = 256
+batch_size = 1024
df, num_users, num_items = d2l.read_data_ml100k()
train_data, test_data = d2l.split_data_ml100k(df, num_users, num_items,
'seq-aware')
users_train, items_train, ratings_train, candidates = d2l.load_data_ml100k(
train_data, num_users, num_items, feedback="implicit" )
users_test, items_test, ratings_test, test_iter = d2l.load_data_ml100k(
test_data, num_users, num_items, feedback="implicit")
+num_workers = 0 if sys.platform.startswith("win") else 4
train_iter = gluon.data.DataLoader(gluon.data.ArrayDataset(
np.array(users_train), np.array(items_train)), batch_size, True,
last_batch="rollover")
last_batch="rollover",
num_workers=num_workers)
```

-We then create and initialize the model. We use a four-layer MLP with constant hidden size 20.
+We then create and initialize the model. We use a three-layer MLP with constant hidden size 10.

```{.python .input n=8}
ctx = d2l.try_all_gpus()
-net = NeuMF(30, num_users, num_items, mlp_layers=[10, 10, 10, 10])
+net = NeuMF(10, num_users, num_items, mlp_layers=[10, 10, 10])
net.initialize(ctx=ctx, force_reinit=True, init=mx.init.Normal(0.01))
```

The following code trains the model.

-```{.python .input}
-lr, num_epochs, wd, optimizer = 0.001, 15, 1e-5, 'adam'
+```{.python .input n=9}
+lr, num_epochs, wd, optimizer = 0.01, 10, 1e-5, 'adam'
loss = d2l.BPRLoss()
trainer = gluon.Trainer(net.collect_params(), optimizer,
{"learning_rate": lr, 'wd': wd})
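Aside (illustrative, not part of this commit): the `d2l.BPRLoss` used above implements Bayesian personalized ranking, which pushes the score of an observed item above that of a sampled negative. A minimal sketch of the same idea:

```python
# BPR loss sketch: -log(sigmoid(pos - neg)), averaged over the batch.
from mxnet import np, npx
npx.set_np()

def bpr_loss(pos_score, neg_score):
    return -np.log(npx.sigmoid(pos_score - neg_score)).mean()

print(bpr_loss(np.array([2.0, 1.5]), np.array([0.5, 1.0])))
```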
13 changes: 8 additions & 5 deletions chapter_recommender-systems/seqrec.md
@@ -43,12 +43,13 @@ The model can be learned with BPR or Hinge loss. The architecture of Caser is sh

We first import the required libraries.

-```{.python .input n=1}
+```{.python .input n=2}
import d2l
from mxnet import autograd, init, gluon, np, npx
from mxnet.gluon.data import Dataset
from mxnet.gluon import nn
import mxnet as mx
+import sys
npx.set_np()
```

@@ -165,8 +166,10 @@ users_test, items_test, ratings_test, test_iter = d2l.load_data_ml100k(
test_data, num_users, num_items, feedback="implicit")
train_seq_data = SeqDataset(users_train, items_train, L, num_users,
num_items)
+num_workers = 0 if sys.platform.startswith("win") else 4
train_iter = gluon.data.DataLoader(train_seq_data, batch_size, True,
last_batch="rollover")
last_batch="rollover",
num_workers=num_workers)
test_seq_iter = train_seq_data.test_seq
train_seq_data[0]
```
@@ -178,16 +181,16 @@ Now, let us train the model. We use the same setting as NeuMF, including learnin

```{.python .input n=5}
ctx = d2l.try_all_gpus()
-net = Caser(30, num_users, num_items, L)
+net = Caser(10, num_users, num_items, L)
net.initialize(ctx=ctx, force_reinit=True, init=mx.init.Normal(0.01))
-lr, num_epochs, wd, optimizer = 0.001, 16, 1e-5, 'adam'
+lr, num_epochs, wd, optimizer = 0.01, 6, 1e-5, 'adam'
loss = d2l.BPRLoss()
trainer = gluon.Trainer(net.collect_params(), optimizer,
{"learning_rate": lr, 'wd': wd})
d2l.train_ranking(net, train_iter, test_iter, loss, trainer, test_seq_iter,
num_users, num_items, num_epochs, ctx, d2l.evaluate_ranking,
-                  d2l.negative_sampler, candidates, eval_step=5)
+                  d2l.negative_sampler, candidates, eval_step=1)
```

## Summary
17 changes: 7 additions & 10 deletions d2l/d2l.py
@@ -928,8 +928,7 @@ def show_trace_2d(f, results):
def get_data_ch10(batch_size=10, n=1500):
data = np.genfromtxt('../data/airfoil_self_noise.dat', dtype=np.float32, delimiter='\t')
data = (data - data.mean(axis=0)) / data.std(axis=0)
-    data_iter = d2l.load_array((data[:n, :-1], data[:n, -1]),
-                               batch_size, is_train=True)
+    data_iter = d2l.load_array((data[:n, :-1], data[:n, -1]), batch_size, is_train=True)
return data_iter, data.shape[1]-1


@@ -963,14 +962,12 @@ def train_ch10(trainer_fn, states, hyperparams, data_iter,


# Defined in file: ./chapter_optimization/minibatch-sgd.md
-def train_gluon_ch10(trainer_name, trainer_hyperparams,
-                     data_iter, num_epochs=2):
+def train_gluon_ch10(tr_name, hyperparams, data_iter, num_epochs=2):
# Initialization
net = nn.Sequential()
net.add(nn.Dense(1))
net.initialize(init.Normal(sigma=0.01))
-    trainer = gluon.Trainer(
-        net.collect_params(), trainer_name, trainer_hyperparams)
+    trainer = gluon.Trainer(net.collect_params(), tr_name, hyperparams)
loss = gluon.loss.L2Loss()
animator = d2l.Animator(xlabel='epoch', ylabel='loss',
xlim=[0, num_epochs], ylim=[0.22, 0.35])
@@ -1489,7 +1486,7 @@ def train_recsys_rating(net, train_iter, test_iter, loss, trainer, num_epochs,
ctx_list=d2l.try_all_gpus(), evaluator=None,
**kwargs):
num_batches, timer = len(train_iter), d2l.Timer()
-    animator = d2l.Animator(xlabel='epoch', xlim=[0, num_epochs], ylim=[0, 2],
+    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 2],
legend=['train loss','test RMSE'])
for epoch in range(num_epochs):
metric, l = d2l.Accumulator(3), 0.
@@ -1600,9 +1597,9 @@ def evaluate_ranking(net, test_input, seq, candidates, num_users, num_items,
# Defined in file: ./chapter_recommender-systems/neumf.md
def train_ranking(net, train_iter, test_iter, loss, trainer, test_seq_iter,
num_users, num_items, num_epochs, ctx_list, evaluator,
-                  negative_sampler, candidates, eval_step=2):
+                  negative_sampler, candidates, eval_step=1):
num_batches, timer, hit_rate, auc = len(train_iter), d2l.Timer(), 0, 0
-    animator = d2l.Animator(xlabel='epoch', xlim=[0, num_epochs], ylim=[0, 1],
+    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 1],
legend=['test hit rate', 'test AUC'])
for epoch in range(num_epochs):
metric, l = d2l.Accumulator(3), 0.
@@ -1622,7 +1619,7 @@ def train_ranking(net, train_iter, test_iter, loss, trainer, test_seq_iter,
metric.add(l, values[0].shape[0], values[0].size)
timer.stop()
with autograd.predict_mode():
-            if (epoch + 1) % eval_step == 1:
+            if (epoch + 1) % eval_step == 0:
hit_rate, auc = evaluator(net, test_iter, test_seq_iter,
candidates, num_users, num_items,
ctx_list)
Binary file modified graffle/recsys/rec-deepfm.graffle
Binary file modified graffle/recsys/rec-neumf.graffle