Fix code blocks newline issue in tab
astonzhang committed Feb 9, 2023
1 parent 55a605f commit fc271a9
Showing 20 changed files with 217 additions and 74 deletions.
@@ -90,9 +90,12 @@ if tab.selected('tensorflow'):
if tab.selected('jax'):
def epanechikov(x):
return jnp.maximum(1 - d2l.abs(x), 0)
```

```{.python .input}
%%tab all
kernels = (gaussian, boxcar, constant, epanechikov)
names = ('Gaussian', 'Boxcar', 'Constant', 'Epanechikov')
x = d2l.arange(-2.5, 2.5, 0.1)
for kernel, name, ax in zip(kernels, names, axes):
if tab.selected('pytorch', 'mxnet', 'tensorflow'):
@@ -179,7 +182,10 @@ def plot(x_train, y_train, x_val, y_val, kernels, names, attention=False):
ax.legend(['y_hat', 'y'])
if attention:
fig.colorbar(pcm, ax=axes, shrink=0.7)
```

```{.python .input}
%%tab all
plot(x_train, y_train, x_val, y_val, kernels, names)
```

@@ -206,7 +212,6 @@ def gaussian_with_width(sigma):
return (lambda x: d2l.exp(-x**2 / (2*sigma**2)))
kernels = [gaussian_with_width(sigma) for sigma in sigmas]
plot(x_train, y_train, x_val, y_val, kernels, names)
```

@@ -393,6 +393,18 @@ As a result,
the shape of the multi-head attention output
is (`batch_size`, `num_queries`, `num_hiddens`).

```{.python .input}
%%tab pytorch
num_hiddens, num_heads = 100, 5
attention = MultiHeadAttention(num_hiddens, num_heads, 0.5)
batch_size, num_queries, num_kvpairs = 2, 4, 6
valid_lens = d2l.tensor([3, 2])
X = d2l.ones((batch_size, num_queries, num_hiddens))
Y = d2l.ones((batch_size, num_kvpairs, num_hiddens))
d2l.check_shape(attention(X, Y, Y, valid_lens),
(batch_size, num_queries, num_hiddens))
```

```{.python .input}
%%tab mxnet
num_hiddens, num_heads = 100, 5
@@ -401,7 +413,7 @@ attention.initialize()
```

```{.python .input}
%%tab pytorch, jax
%%tab jax
num_hiddens, num_heads = 100, 5
attention = MultiHeadAttention(num_hiddens, num_heads, 0.5)
```
@@ -414,7 +426,7 @@ attention = MultiHeadAttention(num_hiddens, num_hiddens, num_hiddens,
```

```{.python .input}
%%tab mxnet, pytorch
%%tab mxnet
batch_size, num_queries, num_kvpairs = 2, 4, 6
valid_lens = d2l.tensor([3, 2])
X = d2l.ones((batch_size, num_queries, num_hiddens))
@@ -45,10 +45,6 @@ where weights are derived according to the compatibility between a query $\mathbf{q}$

What is quite remarkable is that the actual "code" to execute on the set of keys and values, namely the query, can be quite concise, even though the space to operate on is significant. This is a desirable property for a network layer as it does not require too many parameters to learn. Just as convenient is the fact that attention can operate on arbitrarily large databases without the need to change the way the attention pooling operation is performed.
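
To make this concrete, here is a minimal NumPy sketch of attention pooling (an editorial illustration, not code from this repository): a query is scored against every key, here with a dot product as the compatibility score, the scores are normalized into nonnegative weights that sum to 1, and those weights average the values.

```python
# Minimal sketch of attention pooling (illustration only, not d2l source code).
import numpy as np

def attention_pool(query, keys, values):
    """Weighted average of `values`, weighted by query-key compatibility."""
    scores = keys @ query                    # one compatibility score per key
    weights = np.exp(scores - scores.max())  # softmax: nonnegative weights...
    weights = weights / weights.sum()        # ...that sum to 1
    return weights @ values, weights

keys = np.random.randn(10, 4)    # a "database" of 10 keys, each 4-dimensional
values = np.random.randn(10, 3)  # one 3-dimensional value per key
query = np.random.randn(4)
output, weights = attention_pool(query, keys, values)
print(output.shape, weights.sum())  # (3,) and approximately 1.0
```
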

## Visualization

One of the benefits of the attention mechanism is that it can be quite intuitive, particularly when the weights are nonnegative and sum to $1$. In this case we might *interpret* large weights as a way for the model to select components of relevance. While this is a good intuition, it is important to remember that it is just that, an *intuition*. Regardless, we may want to visualize its effect on the given set of keys, when applying a variety of different queries. This function will come in handy later.

```{.python .input}
%%tab mxnet
from d2l import mxnet as d2l
@@ -74,6 +70,10 @@ from d2l import jax as d2l
from jax import numpy as jnp
```

## Visualization

One of the benefits of the attention mechanism is that it can be quite intuitive, particularly when the weights are nonnegative and sum to $1$. In this case we might *interpret* large weights as a way for the model to select components of relevance. While this is a good intuition, it is important to remember that it is just that, an *intuition*. Regardless, we may want to visualize its effect on the given set of keys, when applying a variety of different queries. This function will come in handy later.

We thus define the `show_heatmaps` function. Note that it does not take a matrix (of attention weights) as its input but rather a tensor with 4 axes, allowing for an array of different queries and weights. Consequently the input `matrices` has the shape (number of rows for display, number of columns for display, number of queries, number of keys). This will come in handy later on when we want to visualize the workings of :numref:`sec_multihead-attention` that is used to design Transformers.
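
As a rough illustration of this 4-axis layout (an editorial sketch with assumed names and defaults, not the repository's `show_heatmaps` implementation, which the next block defines), a helper can lay the heatmaps out on a grid indexed by the first two axes:

```python
# Sketch of a heatmap helper for a (rows, cols, num_queries, num_keys) tensor.
# Illustration only; the function name and defaults here are assumptions.
import matplotlib.pyplot as plt
import numpy as np

def heatmap_grid(matrices, xlabel='Keys', ylabel='Queries', cmap='Reds'):
    num_rows, num_cols = matrices.shape[0], matrices.shape[1]
    fig, axes = plt.subplots(num_rows, num_cols, squeeze=False,
                             sharex=True, sharey=True)
    for i in range(num_rows):
        for j in range(num_cols):
            pcm = axes[i][j].imshow(matrices[i, j], cmap=cmap)
            if i == num_rows - 1:
                axes[i][j].set_xlabel(xlabel)
            if j == 0:
                axes[i][j].set_ylabel(ylabel)
    fig.colorbar(pcm, ax=axes, shrink=0.6)  # one shared color scale
    plt.show()

# A single "identity" attention map: each of 10 queries attends to one key.
heatmap_grid(np.eye(10).reshape((1, 1, 10, 10)))
```
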

```{.python .input n=17}
@@ -77,6 +77,16 @@ computes the self-attention of a tensor
with shape (batch size, number of time steps or sequence length in tokens, $d$).
The output tensor has the same shape.

```{.python .input}
%%tab pytorch
num_hiddens, num_heads = 100, 5
attention = d2l.MultiHeadAttention(num_hiddens, num_heads, 0.5)
batch_size, num_queries, valid_lens = 2, 4, d2l.tensor([3, 2])
X = d2l.ones((batch_size, num_queries, num_hiddens))
d2l.check_shape(attention(X, X, X, valid_lens),
(batch_size, num_queries, num_hiddens))
```

```{.python .input}
%%tab mxnet
num_hiddens, num_heads = 100, 5
@@ -85,7 +95,7 @@ attention.initialize()
```

```{.python .input}
%%tab pytorch, jax
%%tab jax
num_hiddens, num_heads = 100, 5
attention = d2l.MultiHeadAttention(num_hiddens, num_heads, 0.5)
```
@@ -98,7 +108,7 @@ attention = d2l.MultiHeadAttention(num_hiddens, num_hiddens, num_hiddens,
```

```{.python .input}
%%tab mxnet, pytorch
%%tab mxnet
batch_size, num_queries, valid_lens = 2, 4, d2l.tensor([3, 2])
X = d2l.ones((batch_size, num_queries, num_hiddens))
d2l.check_shape(attention(X, X, X, valid_lens),
10 changes: 6 additions & 4 deletions chapter_attention-mechanisms-and-transformers/transformer.md
@@ -1233,7 +1233,8 @@ is (number of encoder layers, number of attention heads, `num_steps` or number of queries, `num_steps` or number of key-value pairs).
%%tab pytorch, mxnet, tensorflow
_, dec_attention_weights = model.predict_step(
data.build([engs[-1]], [fras[-1]]), d2l.try_gpu(), data.num_steps, True)
enc_attention_weights = d2l.reshape(d2l.concat(model.encoder.attention_weights, 0),
enc_attention_weights = d2l.concat(model.encoder.attention_weights, 0)
enc_attention_weights = d2l.reshape(enc_attention_weights,
(num_blks, num_heads, -1, data.num_steps))
d2l.check_shape(enc_attention_weights,
(num_blks, num_heads, data.num_steps, data.num_steps))
@@ -1244,7 +1245,8 @@ d2l.check_shape(enc_attention_weights,
_, (dec_attention_weights, enc_attention_weights) = model.predict_step(
trainer.state.params, data.build([engs[-1]], [fras[-1]]),
data.num_steps, True)
enc_attention_weights = d2l.reshape(d2l.concat(enc_attention_weights, 0),
enc_attention_weights = d2l.concat(enc_attention_weights, 0)
enc_attention_weights = d2l.reshape(enc_attention_weights,
(num_blks, num_heads, -1, data.num_steps))
d2l.check_shape(enc_attention_weights,
(num_blks, num_heads, data.num_steps, data.num_steps))
@@ -1308,8 +1310,8 @@ dec_attention_weights_2d = [head[0].tolist()
for attn in step for blk in attn for head in blk]
dec_attention_weights_filled = d2l.tensor(
pd.DataFrame(dec_attention_weights_2d).fillna(0.0).values)
dec_attention_weights = d2l.reshape(dec_attention_weights_filled, (
-1, 2, num_blks, num_heads, data.num_steps))
shape = (-1, 2, num_blks, num_heads, data.num_steps)
dec_attention_weights = d2l.reshape(dec_attention_weights_filled, shape)
dec_self_attention_weights, dec_inter_attention_weights = \
dec_attention_weights.permute(1, 2, 3, 0, 4)
```
3 changes: 3 additions & 0 deletions chapter_builders-guide/init-param.md
@@ -115,6 +115,7 @@ def init_normal(module):
if type(module) == nn.Linear:
nn.init.normal_(module.weight, mean=0, std=0.01)
nn.init.zeros_(module.bias)
net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]
```
@@ -162,6 +163,7 @@ def init_constant(module):
if type(module) == nn.Linear:
nn.init.constant_(module.weight, 1)
nn.init.zeros_(module.bias)
net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]
```
@@ -213,6 +215,7 @@ print(net[1].weight.data())
def init_xavier(module):
if type(module) == nn.Linear:
nn.init.xavier_uniform_(module.weight)
def init_42(module):
if type(module) == nn.Linear:
nn.init.constant_(module.weight, 42)
32 changes: 24 additions & 8 deletions chapter_builders-guide/use-gpu.md
@@ -153,13 +153,24 @@ from jax import numpy as jnp
```

```{.python .input}
%%tab all
%%tab pytorch
def cpu(): #@save
"""Get the CPU device."""
return torch.device('cpu')
def gpu(i=0): #@save
"""Get a GPU device."""
return torch.device(f'cuda:{i}')
cpu(), gpu(), gpu(1)
```

```{.python .input}
%%tab mxnet, tensorflow, jax
def cpu(): #@save
"""Get the CPU device."""
if tab.selected('mxnet'):
return npx.cpu()
if tab.selected('pytorch'):
return torch.device('cpu')
if tab.selected('tensorflow'):
return tf.device('/CPU:0')
if tab.selected('jax'):
@@ -169,8 +180,6 @@ def gpu(i=0): #@save
"""Get a GPU device."""
if tab.selected('mxnet'):
return npx.gpu(i)
if tab.selected('pytorch'):
return torch.device(f'cuda:{i}')
if tab.selected('tensorflow'):
return tf.device(f'/GPU:{i}')
if tab.selected('jax'):
@@ -182,13 +191,20 @@ cpu(), gpu(), gpu(1)
We can (**query the number of available GPUs.**)

```{.python .input}
%%tab all
%%tab pytorch
def num_gpus(): #@save
"""Get the number of available GPUs."""
return torch.cuda.device_count()
num_gpus()
```

```{.python .input}
%%tab mxnet, tensorflow, jax
def num_gpus(): #@save
"""Get the number of available GPUs."""
if tab.selected('mxnet'):
return npx.num_gpus()
if tab.selected('pytorch'):
return torch.cuda.device_count()
if tab.selected('tensorflow'):
return len(tf.config.experimental.list_physical_devices('GPU'))
if tab.selected('jax'):
@@ -144,6 +144,7 @@ def comp_conv2d(conv2d, X):
Y = conv2d(X)
# Strip the first two dimensions: examples and channels
return Y.reshape(Y.shape[2:])
# 1 row and column is padded on either side, so a total of 2 rows or columns
# are added
conv2d = nn.LazyConv2d(1, kernel_size=3, padding=1)
21 changes: 16 additions & 5 deletions chapter_linear-classification/softmax-regression-concise.md
@@ -80,7 +80,21 @@ the dataclass.
:end_tab:

```{.python .input}
%%tab pytorch, mxnet, tensorflow
%%tab pytorch
class SoftmaxRegression(d2l.Classifier): #@save
"""The softmax regression model."""
def __init__(self, num_outputs, lr):
super().__init__()
self.save_hyperparameters()
self.net = nn.Sequential(nn.Flatten(),
nn.LazyLinear(num_outputs))
def forward(self, X):
return self.net(X)
```

```{.python .input}
%%tab mxnet, tensorflow
class SoftmaxRegression(d2l.Classifier): #@save
"""The softmax regression model."""
def __init__(self, num_outputs, lr):
@@ -89,9 +103,6 @@ class SoftmaxRegression(d2l.Classifier): #@save
if tab.selected('mxnet'):
self.net = nn.Dense(num_outputs)
self.net.initialize()
if tab.selected('pytorch'):
self.net = nn.Sequential(nn.Flatten(),
nn.LazyLinear(num_outputs))
if tab.selected('tensorflow'):
self.net = tf.keras.models.Sequential()
self.net.add(tf.keras.layers.Flatten())
@@ -207,7 +218,7 @@ def loss(self, params, X, Y, state, averaged=True):

## Training

Next we train our model. As before, we use Fashion-MNIST images, flattened to 784-dimensional feature vectors.
Next we train our model. We use Fashion-MNIST images, flattened to 784-dimensional feature vectors.
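
For orientation, here is a rough sketch of how such a training run can be set up, assuming d2l's `FashionMNIST` data wrapper and `Trainer` class together with the `SoftmaxRegression` model defined above; the hyperparameters shown are illustrative.

```python
# Sketch of the training pattern (assumed d2l API; hyperparameters illustrative).
from d2l import torch as d2l  # PyTorch flavor of the d2l helpers

data = d2l.FashionMNIST(batch_size=256)            # 28x28 grayscale images, 10 classes
model = SoftmaxRegression(num_outputs=10, lr=0.1)  # class defined earlier in this file
trainer = d2l.Trainer(max_epochs=10)
trainer.fit(model, data)
```
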

```{.python .input}
%%tab all
4 changes: 2 additions & 2 deletions chapter_linear-classification/softmax-regression-scratch.md
@@ -205,8 +205,8 @@ before passing the data through our model.
%%tab all
@d2l.add_to_class(SoftmaxRegressionScratch)
def forward(self, X):
return softmax(d2l.matmul(d2l.reshape(
X, (-1, self.W.shape[0])), self.W) + self.b)
X = d2l.reshape(X, (-1, self.W.shape[0]))
return softmax(d2l.matmul(X, self.W) + self.b)
```

## The Cross-Entropy Loss
66 changes: 58 additions & 8 deletions chapter_linear-regression/oo-design.md
@@ -182,10 +182,64 @@ using type annotations. All Flax modules are Python 3.7 dataclasses.
:end_tab:

```{.python .input}
%%tab all
%%tab pytorch
class Module(d2l.nn_Module, d2l.HyperParameters): #@save
"""The base class of models."""
if tab.selected('pytorch', 'mxnet', 'tensorflow'):
def __init__(self, plot_train_per_epoch=2, plot_valid_per_epoch=1):
super().__init__()
self.save_hyperparameters()
self.board = ProgressBoard()
def loss(self, y_hat, y):
raise NotImplementedError
def forward(self, X):
assert hasattr(self, 'net'), 'Neural network is defined'
return self.net(X)
def plot(self, key, value, train):
"""Plot a point in animation."""
assert hasattr(self, 'trainer'), 'Trainer is not inited'
self.board.xlabel = 'epoch'
if train:
x = self.trainer.train_batch_idx / \
self.trainer.num_train_batches
n = self.trainer.num_train_batches / \
self.plot_train_per_epoch
else:
x = self.trainer.epoch + 1
n = self.trainer.num_val_batches / \
self.plot_valid_per_epoch
if tab.selected('mxnet', 'tensorflow'):
self.board.draw(x, d2l.numpy(value), (
'train_' if train else 'val_') + key, every_n=int(n))
if tab.selected('pytorch'):
self.board.draw(x, d2l.numpy(d2l.to(value, d2l.cpu())),
('train_' if train else 'val_') + key,
every_n=int(n))
if tab.selected('jax'):
self.board.draw(x, d2l.to(value, d2l.cpu()),
('train_' if train else 'val_') + key,
every_n=int(n))
def training_step(self, batch):
l = self.loss(self(*batch[:-1]), batch[-1])
self.plot('loss', l, train=True)
return l
def validation_step(self, batch):
l = self.loss(self(*batch[:-1]), batch[-1])
self.plot('loss', l, train=False)
def configure_optimizers(self):
raise NotImplementedError
```

```{.python .input}
%%tab mxnet, tensorflow, jax
class Module(d2l.nn_Module, d2l.HyperParameters): #@save
"""The base class of models."""
if tab.selected('mxnet', 'tensorflow'):
def __init__(self, plot_train_per_epoch=2, plot_valid_per_epoch=1):
super().__init__()
self.save_hyperparameters()
@@ -204,7 +258,7 @@ class Module(d2l.nn_Module, d2l.HyperParameters): #@save
def loss(self, y_hat, y):
raise NotImplementedError
if tab.selected('pytorch', 'mxnet', 'tensorflow'):
if tab.selected('mxnet', 'tensorflow'):
def forward(self, X):
assert hasattr(self, 'net'), 'Neural network is defined'
return self.net(X)
@@ -242,16 +296,12 @@ class Module(d2l.nn_Module, d2l.HyperParameters): #@save
if tab.selected('mxnet', 'tensorflow'):
self.board.draw(x, d2l.numpy(value), (
'train_' if train else 'val_') + key, every_n=int(n))
if tab.selected('pytorch'):
self.board.draw(x, d2l.numpy(d2l.to(value, d2l.cpu())),
('train_' if train else 'val_') + key,
every_n=int(n))
if tab.selected('jax'):
self.board.draw(x, d2l.to(value, d2l.cpu()),
('train_' if train else 'val_') + key,
every_n=int(n))
if tab.selected('pytorch', 'mxnet', 'tensorflow'):
if tab.selected('mxnet', 'tensorflow'):
def training_step(self, batch):
l = self.loss(self(*batch[:-1]), batch[-1])
self.plot('loss', l, train=True)
