Commit efa5a13

reduce the training time (d2l-ai#558)

cheungdaven authored and astonzhang committed Dec 9, 2019
1 parent 808e085 commit efa5a13
Showing 11 changed files with 466 additions and 459 deletions.
16 changes: 12 additions & 4 deletions chapter_recommender-systems/autorec.md
@@ -27,11 +27,12 @@ $$

where $\| \cdot \|_{\mathcal{O}}$ means that only the contribution of observed ratings is considered; that is, only weights associated with observed inputs are updated during backpropagation.
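Aside (illustrative, not part of this commit): the masked objective can be implemented by multiplying the squared error with a 0/1 mask derived from the rating matrix, so unobserved entries contribute neither loss nor gradient. A minimal sketch, assuming 0 marks an unobserved rating:

```python
# Minimal sketch of the masked L2 objective; all values are made up.
from mxnet import np, npx
npx.set_np()

R = np.array([[5, 0, 3], [0, 4, 0]])                  # 0 = unobserved rating
R_hat = np.array([[4.8, 1.2, 2.9], [0.3, 3.7, 1.1]])  # model reconstruction
mask = np.sign(R)                                     # 1 where observed, else 0
loss = (((R - R_hat) * mask) ** 2).sum()              # masked entries add nothing
```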

-```{.python .input n=1}
+```{.python .input n=2}
import d2l
from mxnet import autograd, init, gluon, np, npx
from mxnet.gluon import nn
import mxnet as mx
+import sys
npx.set_np()
```
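Aside (illustrative, not part of this commit; the chapter's own `AutoRec` class is elided from this diff): at its core, AutoRec is an encoder-decoder pair of dense layers that reconstructs a rating vector. A hedged sketch of the idea:

```python
# Sketch of an AutoRec-style autoencoder: encode a rating vector, then decode
# a reconstruction of it. Layer sizes and activation are assumptions here.
from mxnet.gluon import nn

class TinyAutoRec(nn.Block):
    def __init__(self, num_hidden, num_inputs, **kwargs):
        super().__init__(**kwargs)
        self.encoder = nn.Dense(num_hidden, activation='sigmoid')
        self.decoder = nn.Dense(num_inputs)

    def forward(self, x):
        return self.decoder(self.encoder(x))
```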

@@ -87,14 +88,17 @@ _, _, _, train_inter_mat = d2l.load_data_ml100k(train_data, num_users,
num_items)
_, _, _, test_inter_mat = d2l.load_data_ml100k(test_data, num_users,
num_items)
+num_workers = 0 if sys.platform.startswith("win") else 4
train_iter = gluon.data.DataLoader(train_inter_mat, shuffle=True,
last_batch="rollover", batch_size=128)
last_batch="rollover", batch_size=256,
num_workers=num_workers)
test_iter = gluon.data.DataLoader(np.array(train_inter_mat), shuffle=False,
last_batch="keep", batch_size=1024)
last_batch="keep", batch_size=1024,
num_workers=num_workers)
# Model initialization, training, and evaluation
net = AutoRec(500, num_users)
net.initialize(ctx=ctx, force_reinit=True, init=mx.init.Normal(0.01))
-lr, num_epochs, wd, optimizer = 0.001, 50, 1e-5, 'adam'
+lr, num_epochs, wd, optimizer = 0.002, 25, 1e-5, 'adam'
loss = gluon.loss.L2Loss()
trainer = gluon.Trainer(net.collect_params(), optimizer,
{"learning_rate": lr, 'wd': wd})
@@ -118,3 +122,7 @@ d2l.train_recsys_rating(net, train_iter, test_iter, loss, trainer, num_epochs,
## References

* Sedhain, Suvash, et al. "AutoRec: Autoencoders meet collaborative filtering." Proceedings of the 24th International Conference on World Wide Web. ACM, 2015.

+```{.python .input}
+```
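Aside: the `num_workers` guard this commit adds before each `DataLoader` is a common workaround for platforms where multiprocess data loading is problematic (notably Windows with MXNet); the pattern in isolation:

```python
# Use multiprocess data loading except on Windows, where workers must be 0.
import sys
num_workers = 0 if sys.platform.startswith("win") else 4
```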
14 changes: 9 additions & 5 deletions chapter_recommender-systems/deepfm.md
@@ -30,12 +30,13 @@ $$
where $\sigma$ is the sigmoid function. The architecture of DeepFM is illustrated below.
![Illustration of the DeepFM model](../img/rec-deepfm.svg)
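Aside (illustrative, not part of this commit): the prediction rule above simply sums the FM score and the MLP score before applying the sigmoid. A toy sketch with made-up scores:

```python
# Toy sketch of y_hat = sigmoid(y_FM + y_MLP); the inputs are made-up scores.
from mxnet import np, npx
npx.set_np()

y_fm = np.array([0.7])     # score from the FM component
y_mlp = np.array([-0.2])   # score from the MLP component
y_hat = npx.sigmoid(y_fm + y_mlp)
```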

-```{.python .input n=1}
+```{.python .input n=2}
import d2l
from mxnet import autograd, init, gluon, np, npx
from mxnet.gluon.data import Dataset
from mxnet.gluon import nn
import mxnet as mx
+import sys
npx.set_np()
```

@@ -72,24 +73,27 @@ class DeepFM(nn.Block):
## Training and Evaluating the Model
The data loading process is the same as that of FM. We set the MLP component of DeepFM to a three-layer dense network with a pyramid structure (30-20-10). All other hyperparameters remain the same as for FM.
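Aside (illustrative, not part of this commit): a pyramid MLP of that shape could be assembled in Gluon as below; the dropout rate and the final scalar layer are assumptions for illustration, not taken from the chapter's `DeepFM` class:

```python
# Sketch of a 30-20-10 pyramid MLP; the dropout rate is an assumed value.
from mxnet.gluon import nn

mlp = nn.Sequential()
for dim in (30, 20, 10):
    mlp.add(nn.Dense(dim, activation='relu'))
    mlp.add(nn.Dropout(0.1))
mlp.add(nn.Dense(1))  # scalar score fed into the sigmoid
```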

-```{.python .input n=3}
+```{.python .input n=4}
batch_size = 2048
d2l.read_data_ctr()
train_data = d2l.CTRDataset(data_path="../data/ctr/train.csv")
test_data = d2l.CTRDataset(data_path="../data/ctr/test.csv",
feat_mapper=train_data.feat_mapper,
defaults=train_data.defaults)
field_dims = train_data.field_dims
+num_workers = 0 if sys.platform.startswith("win") else 4
train_iter = gluon.data.DataLoader(train_data, shuffle=True,
last_batch="rollover",
-                                   batch_size=batch_size)
+                                   batch_size=batch_size,
+                                   num_workers=num_workers)
test_iter = gluon.data.DataLoader(test_data, shuffle=False,
last_batch="rollover",
-                                  batch_size=batch_size)
+                                  batch_size=batch_size,
+                                  num_workers=num_workers)
ctx = d2l.try_all_gpus()
net = DeepFM(field_dims, num_factors=10, mlp_dims=[30, 20, 10])
net.initialize(init.Xavier(), ctx=ctx)
-lr, num_epochs, optimizer = 0.01, 50, 'adam'
+lr, num_epochs, optimizer = 0.01, 30, 'adam'
trainer = gluon.Trainer(net.collect_params(), optimizer,
{'learning_rate': lr})
loss = gluon.loss.SigmoidBinaryCrossEntropyLoss()
12 changes: 8 additions & 4 deletions chapter_recommender-systems/fm.md
@@ -32,11 +32,12 @@ With this reformulation, the model complexity is decreased greatly. Moreover, f

To learn the FM model, we can use the MSE loss for regression tasks, the cross-entropy loss for classification tasks, and the BPR loss for ranking tasks. Standard optimizers such as SGD and Adam are viable for optimization.
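Aside (illustrative, not part of this commit): the reformulation referenced above computes all pairwise interactions cheaply via the square-of-sum minus sum-of-squares identity. A minimal sketch with assumed shapes:

```python
# Pairwise FM term: 0.5 * ((sum_i v_i)^2 - sum_i v_i^2), summed over factors.
from mxnet import np, npx
npx.set_np()

v = np.random.normal(size=(7, 4))       # 7 active features, k = 4 factors
square_of_sum = v.sum(axis=0) ** 2      # shape (4,)
sum_of_square = (v ** 2).sum(axis=0)    # shape (4,)
pairwise = 0.5 * (square_of_sum - sum_of_square).sum()
```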

-```{.python .input n=1}
+```{.python .input n=2}
import d2l
from mxnet import autograd, init, gluon, np, npx
from mxnet.gluon import nn
import mxnet as mx
+import sys
npx.set_np()
```

@@ -70,12 +71,15 @@ train_data = d2l.CTRDataset(data_path="../data/ctr/train.csv")
test_data = d2l.CTRDataset(data_path="../data/ctr/test.csv",
feat_mapper=train_data.feat_mapper,
defaults=train_data.defaults)
+num_workers = 0 if sys.platform.startswith("win") else 4
train_iter = gluon.data.DataLoader(train_data, shuffle=True,
last_batch="rollover",
-                                   batch_size=batch_size)
+                                   batch_size=batch_size,
+                                   num_workers=num_workers)
test_iter = gluon.data.DataLoader(test_data, shuffle=False,
last_batch="rollover",
-                                  batch_size=batch_size)
+                                  batch_size=batch_size,
+                                  num_workers=num_workers)
```

## Train the Model
@@ -85,7 +89,7 @@ Afterwards, we train the model. The learning rate is set to 0.01 and the embeddi
ctx = d2l.try_all_gpus()
net = FM(train_data.field_dims, num_factors=20)
net.initialize(init.Xavier(), ctx=ctx)
-lr, num_epochs, optimizer = 0.02, 50, 'adam'
+lr, num_epochs, optimizer = 0.02, 30, 'adam'
trainer = gluon.Trainer(net.collect_params(), optimizer,
{'learning_rate': lr})
loss = gluon.loss.SigmoidBinaryCrossEntropyLoss()
15 changes: 8 additions & 7 deletions chapter_recommender-systems/mf.md
@@ -9,14 +9,14 @@ a lot of attention to the field of recommender system research. Subsequently, th

Matrix factorization is a class of collaborative filtering models. Specifically, the model factorizes the user-item interaction matrix (e.g., rating matrix) into the product of two lower-rank matrices, capturing the low-rank structure of the user-item interactions.

-Let $\mathbf{R} \in \mathbb{R}^{m \times n}$ denote the interaction matrix with $m$ users and $n$ items, and the values of $\mathbf{R}$ represent explicit ratings. The user-item interaction will be factorized into a user latent matrix $\mathbf{P} \in \mathbb{R}^{m \times k}$ and an item latent matrix $\mathbf{Q} \in \mathbb{R}^{n \times k}$, where $k \ll m, n$, is the latent factor size. For a given item $i$, the elements of $\mathbf{Q}_i$ measure the extent to which the item possesses those characteristics such as the genres and languages of a movie. For a given user $u$, the elements of $\mathbf{P}_u$ measure the extent of interest the user has in items' corresponding characteristics. These latent factors might measure obvious dimensions as mentioned in those examples or are completely uninterpretable. The predicted ratings can be estimated by
+Let $\mathbf{R} \in \mathbb{R}^{m \times n}$ denote the interaction matrix with $m$ users and $n$ items, and the values of $\mathbf{R}$ represent explicit ratings. The user-item interaction will be factorized into a user latent matrix $\mathbf{P} \in \mathbb{R}^{m \times k}$ and an item latent matrix $\mathbf{Q} \in \mathbb{R}^{n \times k}$, where $k \ll m, n$, is the latent factor size. Let $\mathbf{p}_u$ denote the $u^\mathrm{th}$ row of $\mathbf{P}$ and $\mathbf{q}_i$ denote the $i^\mathrm{th}$ row of $\mathbf{Q}$. For a given item $i$, the elements of $\mathbf{q}_i$ measure the extent to which the item possesses those characteristics such as the genres and languages of a movie. For a given user $u$, the elements of $\mathbf{p}_u$ measure the extent of interest the user has in items' corresponding characteristics. These latent factors might measure obvious dimensions as mentioned in those examples or are completely uninterpretable. The predicted ratings can be estimated by

$$\hat{\mathbf{R}} = \mathbf{PQ}^\top$$

where $\hat{\mathbf{R}}\in \mathbb{R}^{m \times n}$ is the predicted rating matrix, which has the same shape as $\mathbf{R}$. One major problem of this prediction rule is that user/item biases cannot be modeled. For example, some users tend to give higher ratings, or some items always get lower ratings due to poor quality. Such biases are commonplace in real-world applications. To capture them, user-specific and item-specific bias terms are introduced. Specifically, the predicted rating that user $u$ gives to item $i$ is calculated by

$$
-\hat{\mathbf{R}}_{ui} = \mathbf{P}_u\mathbf{Q}^\top_i + b_u + b_i
+\hat{\mathbf{R}}_{ui} = \mathbf{p}_u\mathbf{q}^\top_i + b_u + b_i
$$
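Aside (illustrative, not part of this commit): a toy evaluation of this biased prediction rule, with all names and sizes assumed:

```python
# r_hat = p_u . q_i + b_u + b_i, on randomly initialized factors.
from mxnet import np, npx
npx.set_np()

m, n, k = 4, 5, 3
P = np.random.normal(scale=0.1, size=(m, k))  # user latent matrix
Q = np.random.normal(scale=0.1, size=(n, k))  # item latent matrix
b_user = np.zeros(m)                          # user bias terms
b_item = np.zeros(n)                          # item bias terms
u, i = 1, 2
r_hat = np.dot(P[u], Q[i]) + b_user[u] + b_item[i]
```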

Then, we train the matrix factorization model by minimizing the mean squared error between predicted rating scores and real rating scores. The objective function is defined as follows:
@@ -42,6 +42,7 @@ import d2l
from mxnet import autograd, init, gluon, np, npx
from mxnet.gluon import nn
import mxnet as mx
+import sys
npx.set_np()
```

@@ -102,7 +103,7 @@ def train_recsys_rating(net, train_iter, test_iter, loss, trainer, num_epochs,
ctx_list=d2l.try_all_gpus(), evaluator=None,
**kwargs):
num_batches, timer = len(train_iter), d2l.Timer()
-    animator = d2l.Animator(xlabel='epoch', xlim=[0, num_epochs], ylim=[0, 2],
+    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 2],
legend=['train loss','test RMSE'])
for epoch in range(num_epochs):
metric, l = d2l.Accumulator(3), 0.
@@ -135,15 +136,15 @@ def train_recsys_rating(net, train_iter, test_iter, loss, trainer, num_epochs,
% (metric[2] * num_epochs / timer.sum(), ctx_list))
```

-Finally, let us put all things together and train the model. Here, we set the latent factor dimension to 50.
+Finally, let us put all things together and train the model. Here, we set the latent factor dimension to 30.

```{.python .input n=5}
ctx = d2l.try_all_gpus()
num_users, num_items, train_iter, test_iter = d2l.split_and_load_ml100k(
-    test_ratio=0.1, batch_size=128)
-net = MF(50, num_users, num_items)
+    test_ratio=0.1, batch_size=512)
+net = MF(30, num_users, num_items)
net.initialize(ctx=ctx, force_reinit=True, init=mx.init.Normal(0.01))
-lr, num_epochs, wd, optimizer = 0.001, 25, 1e-5, 'adam'
+lr, num_epochs, wd, optimizer = 0.002, 20, 1e-5, 'adam'
loss = gluon.loss.L2Loss()
trainer = gluon.Trainer(net.collect_params(), optimizer,
{"learning_rate": lr, 'wd': wd})
23 changes: 13 additions & 10 deletions chapter_recommender-systems/neumf.md
@@ -36,13 +36,14 @@ The following figure illustrates the model architecture of NeuMF.

![Illustration of the NeuMF model](../img/rec-neumf.svg)

-```{.python .input n=1}
+```{.python .input n=2}
import d2l
from mxnet import autograd, init, gluon, np, npx
from mxnet.gluon import nn
import mxnet as mx
import math
import random
+import sys
npx.set_np()
```

@@ -160,9 +161,9 @@ The training function is defined below. We train the model in the pairwise manne
# Saved in the d2l package for later use
def train_ranking(net, train_iter, test_iter, loss, trainer, test_seq_iter,
num_users, num_items, num_epochs, ctx_list, evaluator,
-                  negative_sampler, candidates, eval_step=2):
+                  negative_sampler, candidates, eval_step=1):
num_batches, timer, hit_rate, auc = len(train_iter), d2l.Timer(), 0, 0
-    animator = d2l.Animator(xlabel='epoch', xlim=[0, num_epochs], ylim=[0, 1],
+    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 1],
legend=['test hit rate', 'test AUC'])
for epoch in range(num_epochs):
metric, l = d2l.Accumulator(3), 0.
@@ -182,7 +183,7 @@ def train_ranking(net, train_iter, test_iter, loss, trainer, test_seq_iter,
metric.add(l, values[0].shape[0], values[0].size)
timer.stop()
with autograd.predict_mode():
-            if (epoch + 1) % eval_step == 1:
+            if (epoch + 1) % eval_step == 0:
hit_rate, auc = evaluator(net, test_iter, test_seq_iter,
candidates, num_users, num_items,
ctx_list)
@@ -197,31 +198,33 @@ def train_ranking(net, train_iter, test_iter, loss, trainer, test_seq_iter,
Now, we can load the MovieLens 100k dataset and train the model. Since the MovieLens dataset contains only explicit ratings, we binarize them to zeros and ones, at some loss of accuracy. If a user rated an item, we treat the implicit feedback as one, and otherwise as zero; the act of rating an item can be regarded as a form of implicit feedback. Here, we split the dataset in the `seq-aware` mode, where each user's most recently interacted item is left out for testing.
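Aside (illustrative, not part of this commit): the binarization described here amounts to taking the sign of the rating entries:

```python
# Any observed rating becomes implicit feedback 1; missing entries stay 0.
from mxnet import np, npx
npx.set_np()

ratings = np.array([0, 3, 5, 0, 1])
implicit = np.sign(ratings)   # -> array([0., 1., 1., 0., 1.])
```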

```{.python .input n=7}
-batch_size = 256
+batch_size = 1024
df, num_users, num_items = d2l.read_data_ml100k()
train_data, test_data = d2l.split_data_ml100k(df, num_users, num_items,
'seq-aware')
users_train, items_train, ratings_train, candidates = d2l.load_data_ml100k(
train_data, num_users, num_items, feedback="implicit" )
users_test, items_test, ratings_test, test_iter = d2l.load_data_ml100k(
test_data, num_users, num_items, feedback="implicit")
+num_workers = 0 if sys.platform.startswith("win") else 4
train_iter = gluon.data.DataLoader(gluon.data.ArrayDataset(
np.array(users_train), np.array(items_train)), batch_size, True,
last_batch="rollover")
last_batch="rollover",
num_workers=num_workers)
```

-We then create and initialize the model. We use a four-layer MLP with constant hidden size 20.
+We then create and initialize the model. We use a three-layer MLP with constant hidden size 10.

```{.python .input n=8}
ctx = d2l.try_all_gpus()
-net = NeuMF(30, num_users, num_items, mlp_layers=[10, 10, 10, 10])
+net = NeuMF(10, num_users, num_items, mlp_layers=[10, 10, 10])
net.initialize(ctx=ctx, force_reinit=True, init=mx.init.Normal(0.01))
```

The following code trains the model.

-```{.python .input}
-lr, num_epochs, wd, optimizer = 0.001, 15, 1e-5, 'adam'
+```{.python .input n=9}
+lr, num_epochs, wd, optimizer = 0.01, 10, 1e-5, 'adam'
loss = d2l.BPRLoss()
trainer = gluon.Trainer(net.collect_params(), optimizer,
{"learning_rate": lr, 'wd': wd})
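Aside (illustrative, not part of this commit): the `d2l.BPRLoss` used above implements Bayesian personalized ranking, which pushes the score of an observed item above that of a sampled negative. A minimal sketch of the same idea:

```python
# BPR loss sketch: -log(sigmoid(pos - neg)), averaged over the batch.
from mxnet import np, npx
npx.set_np()

def bpr_loss(pos_score, neg_score):
    return -np.log(npx.sigmoid(pos_score - neg_score)).mean()

print(bpr_loss(np.array([2.0, 1.5]), np.array([0.5, 1.0])))
```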
13 changes: 8 additions & 5 deletions chapter_recommender-systems/seqrec.md
@@ -43,12 +43,13 @@ The model can be learned with BPR or Hinge loss. The architecture of Caser is sh

We first import the required libraries.

-```{.python .input n=1}
+```{.python .input n=2}
import d2l
from mxnet import autograd, init, gluon, np, npx
from mxnet.gluon.data import Dataset
from mxnet.gluon import nn
import mxnet as mx
+import sys
npx.set_np()
```

@@ -165,8 +166,10 @@ users_test, items_test, ratings_test, test_iter = d2l.load_data_ml100k(
test_data, num_users, num_items, feedback="implicit")
train_seq_data = SeqDataset(users_train, items_train, L, num_users,
num_items)
+num_workers = 0 if sys.platform.startswith("win") else 4
train_iter = gluon.data.DataLoader(train_seq_data, batch_size, True,
last_batch="rollover")
last_batch="rollover",
num_workers=num_workers)
test_seq_iter = train_seq_data.test_seq
train_seq_data[0]
```
@@ -178,16 +181,16 @@ Now, let us train the model. We use the same setting as NeuMF, including learnin

```{.python .input n=5}
ctx = d2l.try_all_gpus()
-net = Caser(30, num_users, num_items, L)
+net = Caser(10, num_users, num_items, L)
net.initialize(ctx=ctx, force_reinit=True, init=mx.init.Normal(0.01))
-lr, num_epochs, wd, optimizer = 0.001, 16, 1e-5, 'adam'
+lr, num_epochs, wd, optimizer = 0.01, 6, 1e-5, 'adam'
loss = d2l.BPRLoss()
trainer = gluon.Trainer(net.collect_params(), optimizer,
{"learning_rate": lr, 'wd': wd})
d2l.train_ranking(net, train_iter, test_iter, loss, trainer, test_seq_iter,
num_users, num_items, num_epochs, ctx, d2l.evaluate_ranking,
-                  d2l.negative_sampler, candidates, eval_step=5)
+                  d2l.negative_sampler, candidates, eval_step=1)
```

## Summary
17 changes: 7 additions & 10 deletions d2l/d2l.py
@@ -928,8 +928,7 @@ def show_trace_2d(f, results):
def get_data_ch10(batch_size=10, n=1500):
data = np.genfromtxt('../data/airfoil_self_noise.dat', dtype=np.float32, delimiter='\t')
data = (data - data.mean(axis=0)) / data.std(axis=0)
-    data_iter = d2l.load_array((data[:n, :-1], data[:n, -1]),
-                               batch_size, is_train=True)
+    data_iter = d2l.load_array((data[:n, :-1], data[:n, -1]), batch_size, is_train=True)
return data_iter, data.shape[1]-1


@@ -963,14 +962,12 @@ def train_ch10(trainer_fn, states, hyperparams, data_iter,


# Defined in file: ./chapter_optimization/minibatch-sgd.md
-def train_gluon_ch10(trainer_name, trainer_hyperparams,
-                     data_iter, num_epochs=2):
+def train_gluon_ch10(tr_name, hyperparams, data_iter, num_epochs=2):
# Initialization
net = nn.Sequential()
net.add(nn.Dense(1))
net.initialize(init.Normal(sigma=0.01))
-    trainer = gluon.Trainer(
-        net.collect_params(), trainer_name, trainer_hyperparams)
+    trainer = gluon.Trainer(net.collect_params(), tr_name, hyperparams)
loss = gluon.loss.L2Loss()
animator = d2l.Animator(xlabel='epoch', ylabel='loss',
xlim=[0, num_epochs], ylim=[0.22, 0.35])
@@ -1489,7 +1486,7 @@ def train_recsys_rating(net, train_iter, test_iter, loss, trainer, num_epochs,
ctx_list=d2l.try_all_gpus(), evaluator=None,
**kwargs):
num_batches, timer = len(train_iter), d2l.Timer()
-    animator = d2l.Animator(xlabel='epoch', xlim=[0, num_epochs], ylim=[0, 2],
+    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 2],
legend=['train loss','test RMSE'])
for epoch in range(num_epochs):
metric, l = d2l.Accumulator(3), 0.
@@ -1600,9 +1597,9 @@ def evaluate_ranking(net, test_input, seq, candidates, num_users, num_items,
# Defined in file: ./chapter_recommender-systems/neumf.md
def train_ranking(net, train_iter, test_iter, loss, trainer, test_seq_iter,
num_users, num_items, num_epochs, ctx_list, evaluator,
-                  negative_sampler, candidates, eval_step=2):
+                  negative_sampler, candidates, eval_step=1):
num_batches, timer, hit_rate, auc = len(train_iter), d2l.Timer(), 0, 0
-    animator = d2l.Animator(xlabel='epoch', xlim=[0, num_epochs], ylim=[0, 1],
+    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 1],
legend=['test hit rate', 'test AUC'])
for epoch in range(num_epochs):
metric, l = d2l.Accumulator(3), 0.
@@ -1622,7 +1619,7 @@ def train_ranking(net, train_iter, test_iter, loss, trainer, test_seq_iter,
metric.add(l, values[0].shape[0], values[0].size)
timer.stop()
with autograd.predict_mode():
-            if (epoch + 1) % eval_step == 1:
+            if (epoch + 1) % eval_step == 0:
hit_rate, auc = evaluator(net, test_iter, test_seq_iter,
candidates, num_users, num_items,
ctx_list)
Binary file modified graffle/recsys/rec-deepfm.graffle
Binary file modified graffle/recsys/rec-neumf.graffle