Fix code blocks newline issue in tab
astonzhang committed Feb 9, 2023
1 parent 55a605f commit fc271a9
Showing 20 changed files with 217 additions and 74 deletions.
@@ -90,9 +90,12 @@ if tab.selected('tensorflow'):
if tab.selected('jax'):
def epanechikov(x):
return jnp.maximum(1 - d2l.abs(x), 0)
```

```{.python .input}
%%tab all
kernels = (gaussian, boxcar, constant, epanechikov)
names = ('Gaussian', 'Boxcar', 'Constant', 'Epanechikov')
x = d2l.arange(-2.5, 2.5, 0.1)
for kernel, name, ax in zip(kernels, names, axes):
if tab.selected('pytorch', 'mxnet', 'tensorflow'):
@@ -179,7 +182,10 @@ def plot(x_train, y_train, x_val, y_val, kernels, names, attention=False):
ax.legend(['y_hat', 'y'])
if attention:
fig.colorbar(pcm, ax=axes, shrink=0.7)
```

```{.python .input}
%%tab all
plot(x_train, y_train, x_val, y_val, kernels, names)
```

@@ -206,7 +212,6 @@ def gaussian_with_width(sigma):
return (lambda x: d2l.exp(-x**2 / (2*sigma**2)))
kernels = [gaussian_with_width(sigma) for sigma in sigmas]
plot(x_train, y_train, x_val, y_val, kernels, names)
```

@@ -393,6 +393,18 @@ As a result,
the shape of the multi-head attention output
is (`batch_size`, `num_queries`, `num_hiddens`).

```{.python .input}
%%tab pytorch
num_hiddens, num_heads = 100, 5
attention = MultiHeadAttention(num_hiddens, num_heads, 0.5)
batch_size, num_queries, num_kvpairs = 2, 4, 6
valid_lens = d2l.tensor([3, 2])
X = d2l.ones((batch_size, num_queries, num_hiddens))
Y = d2l.ones((batch_size, num_kvpairs, num_hiddens))
d2l.check_shape(attention(X, Y, Y, valid_lens),
(batch_size, num_queries, num_hiddens))
```

```{.python .input}
%%tab mxnet
num_hiddens, num_heads = 100, 5
@@ -401,7 +413,7 @@ attention.initialize()
```

```{.python .input}
%%tab pytorch, jax
%%tab jax
num_hiddens, num_heads = 100, 5
attention = MultiHeadAttention(num_hiddens, num_heads, 0.5)
```
@@ -414,7 +426,7 @@ attention = MultiHeadAttention(num_hiddens, num_hiddens, num_hiddens,
```

```{.python .input}
%%tab mxnet, pytorch
%%tab mxnet
batch_size, num_queries, num_kvpairs = 2, 4, 6
valid_lens = d2l.tensor([3, 2])
X = d2l.ones((batch_size, num_queries, num_hiddens))
@@ -45,10 +45,6 @@ where weights are derived according to the compatibility between a query $\mathbf{q}$

What is quite remarkable is that the actual "code" to execute on the set of keys and values, namely the query, can be quite concise, even though the space to operate on is significant. This is a desirable property for a network layer as it does not require too many parameters to learn. Just as convenient is the fact that attention can operate on arbitrarily large databases without the need to change the way the attention pooling operation is performed.
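
To make this concrete, here is a minimal NumPy sketch of attention pooling (an editorial illustration, not code from this repository): a query is scored against every key, here with a dot product as the compatibility score, the scores are normalized into nonnegative weights that sum to 1, and those weights average the values.

```python
# Minimal sketch of attention pooling (illustration only, not d2l source code).
import numpy as np

def attention_pool(query, keys, values):
    """Weighted average of `values`, weighted by query-key compatibility."""
    scores = keys @ query                    # one compatibility score per key
    weights = np.exp(scores - scores.max())  # softmax: nonnegative weights...
    weights = weights / weights.sum()        # ...that sum to 1
    return weights @ values, weights

keys = np.random.randn(10, 4)    # a "database" of 10 keys, each 4-dimensional
values = np.random.randn(10, 3)  # one 3-dimensional value per key
query = np.random.randn(4)
output, weights = attention_pool(query, keys, values)
print(output.shape, weights.sum())  # (3,) and approximately 1.0
```
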

## Visualization

One of the benefits of the attention mechanism is that it can be quite intuitive, particularly when the weights are nonnegative and sum to $1$. In this case we might *interpret* large weights as a way for the model to select components of relevance. While this is a good intuition, it is important to remember that it is just that, an *intuition*. Regardless, we may want to visualize its effect on the given set of keys, when applying a variety of different queries. This function will come in handy later.

```{.python .input}
%%tab mxnet
from d2l import mxnet as d2l
@@ -74,6 +70,10 @@ from d2l import jax as d2l
from jax import numpy as jnp
```

## Visualization

One of the benefits of the attention mechanism is that it can be quite intuitive, particularly when the weights are nonnegative and sum to $1$. In this case we might *interpret* large weights as a way for the model to select components of relevance. While this is a good intuition, it is important to remember that it is just that, an *intuition*. Regardless, we may want to visualize its effect on the given set of keys, when applying a variety of different queries. This function will come in handy later.

We thus define the `show_heatmaps` function. Note that it does not take a matrix (of attention weights) as its input but rather a tensor with 4 axes, allowing for an array of different queries and weights. Consequently the input `matrices` has the shape (number of rows for display, number of columns for display, number of queries, number of keys). This will come in handy later on when we want to visualize the workings of :numref:`sec_multihead-attention` that is used to design Transformers.
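
As a rough illustration of this 4-axis layout (an editorial sketch with assumed names and defaults, not the repository's `show_heatmaps` implementation, which the next block defines), a helper can lay the heatmaps out on a grid indexed by the first two axes:

```python
# Sketch of a heatmap helper for a (rows, cols, num_queries, num_keys) tensor.
# Illustration only; the function name and defaults here are assumptions.
import matplotlib.pyplot as plt
import numpy as np

def heatmap_grid(matrices, xlabel='Keys', ylabel='Queries', cmap='Reds'):
    num_rows, num_cols = matrices.shape[0], matrices.shape[1]
    fig, axes = plt.subplots(num_rows, num_cols, squeeze=False,
                             sharex=True, sharey=True)
    for i in range(num_rows):
        for j in range(num_cols):
            pcm = axes[i][j].imshow(matrices[i, j], cmap=cmap)
            if i == num_rows - 1:
                axes[i][j].set_xlabel(xlabel)
            if j == 0:
                axes[i][j].set_ylabel(ylabel)
    fig.colorbar(pcm, ax=axes, shrink=0.6)  # one shared color scale
    plt.show()

# A single "identity" attention map: each of 10 queries attends to one key.
heatmap_grid(np.eye(10).reshape((1, 1, 10, 10)))
```
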

```{.python .input n=17}
@@ -77,6 +77,16 @@ computes the self-attention of a tensor
with shape (batch size, number of time steps or sequence length in tokens, $d$).
The output tensor has the same shape.

```{.python .input}
%%tab pytorch
num_hiddens, num_heads = 100, 5
attention = d2l.MultiHeadAttention(num_hiddens, num_heads, 0.5)
batch_size, num_queries, valid_lens = 2, 4, d2l.tensor([3, 2])
X = d2l.ones((batch_size, num_queries, num_hiddens))
d2l.check_shape(attention(X, X, X, valid_lens),
(batch_size, num_queries, num_hiddens))
```

```{.python .input}
%%tab mxnet
num_hiddens, num_heads = 100, 5
@@ -85,7 +95,7 @@ attention.initialize()
```

```{.python .input}
%%tab pytorch, jax
%%tab jax
num_hiddens, num_heads = 100, 5
attention = d2l.MultiHeadAttention(num_hiddens, num_heads, 0.5)
```
@@ -98,7 +108,7 @@ attention = d2l.MultiHeadAttention(num_hiddens, num_hiddens, num_hiddens,
```

```{.python .input}
%%tab mxnet, pytorch
%%tab mxnet
batch_size, num_queries, valid_lens = 2, 4, d2l.tensor([3, 2])
X = d2l.ones((batch_size, num_queries, num_hiddens))
d2l.check_shape(attention(X, X, X, valid_lens),
10 changes: 6 additions & 4 deletions chapter_attention-mechanisms-and-transformers/transformer.md
@@ -1233,7 +1233,8 @@ is (number of encoder layers, number of attention heads, `num_steps` or number of queries, `num_steps` or number of key-value pairs).
%%tab pytorch, mxnet, tensorflow
_, dec_attention_weights = model.predict_step(
data.build([engs[-1]], [fras[-1]]), d2l.try_gpu(), data.num_steps, True)
enc_attention_weights = d2l.reshape(d2l.concat(model.encoder.attention_weights, 0),
enc_attention_weights = d2l.concat(model.encoder.attention_weights, 0)
enc_attention_weights = d2l.reshape(enc_attention_weights,
(num_blks, num_heads, -1, data.num_steps))
d2l.check_shape(enc_attention_weights,
(num_blks, num_heads, data.num_steps, data.num_steps))
@@ -1244,7 +1245,8 @@ d2l.check_shape(enc_attention_weights,
_, (dec_attention_weights, enc_attention_weights) = model.predict_step(
trainer.state.params, data.build([engs[-1]], [fras[-1]]),
data.num_steps, True)
enc_attention_weights = d2l.reshape(d2l.concat(enc_attention_weights, 0),
enc_attention_weights = d2l.concat(enc_attention_weights, 0)
enc_attention_weights = d2l.reshape(enc_attention_weights,
(num_blks, num_heads, -1, data.num_steps))
d2l.check_shape(enc_attention_weights,
(num_blks, num_heads, data.num_steps, data.num_steps))
@@ -1308,8 +1310,8 @@ dec_attention_weights_2d = [head[0].tolist()
for attn in step for blk in attn for head in blk]
dec_attention_weights_filled = d2l.tensor(
pd.DataFrame(dec_attention_weights_2d).fillna(0.0).values)
dec_attention_weights = d2l.reshape(dec_attention_weights_filled, (
-1, 2, num_blks, num_heads, data.num_steps))
shape = (-1, 2, num_blks, num_heads, data.num_steps)
dec_attention_weights = d2l.reshape(dec_attention_weights_filled, shape)
dec_self_attention_weights, dec_inter_attention_weights = \
dec_attention_weights.permute(1, 2, 3, 0, 4)
```
3 changes: 3 additions & 0 deletions chapter_builders-guide/init-param.md
@@ -115,6 +115,7 @@ def init_normal(module):
if type(module) == nn.Linear:
nn.init.normal_(module.weight, mean=0, std=0.01)
nn.init.zeros_(module.bias)
net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]
```
@@ -162,6 +163,7 @@ def init_constant(module):
if type(module) == nn.Linear:
nn.init.constant_(module.weight, 1)
nn.init.zeros_(module.bias)
net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]
```
@@ -213,6 +215,7 @@ print(net[1].weight.data())
def init_xavier(module):
if type(module) == nn.Linear:
nn.init.xavier_uniform_(module.weight)
def init_42(module):
if type(module) == nn.Linear:
nn.init.constant_(module.weight, 42)
32 changes: 24 additions & 8 deletions chapter_builders-guide/use-gpu.md
@@ -153,13 +153,24 @@ from jax import numpy as jnp
```

```{.python .input}
%%tab all
%%tab pytorch
def cpu(): #@save
"""Get the CPU device."""
return torch.device('cpu')
def gpu(i=0): #@save
"""Get a GPU device."""
return torch.device(f'cuda:{i}')
cpu(), gpu(), gpu(1)
```

```{.python .input}
%%tab mxnet, tensorflow, jax
def cpu(): #@save
"""Get the CPU device."""
if tab.selected('mxnet'):
return npx.cpu()
if tab.selected('pytorch'):
return torch.device('cpu')
if tab.selected('tensorflow'):
return tf.device('/CPU:0')
if tab.selected('jax'):
@@ -169,8 +180,6 @@ def gpu(i=0): #@save
"""Get a GPU device."""
if tab.selected('mxnet'):
return npx.gpu(i)
if tab.selected('pytorch'):
return torch.device(f'cuda:{i}')
if tab.selected('tensorflow'):
return tf.device(f'/GPU:{i}')
if tab.selected('jax'):
@@ -182,13 +191,20 @@ cpu(), gpu(), gpu(1)
We can (**query the number of available GPUs.**)

```{.python .input}
%%tab all
%%tab pytorch
def num_gpus(): #@save
"""Get the number of available GPUs."""
return torch.cuda.device_count()
num_gpus()
```

```{.python .input}
%%tab mxnet, tensorflow, jax
def num_gpus(): #@save
"""Get the number of available GPUs."""
if tab.selected('mxnet'):
return npx.num_gpus()
if tab.selected('pytorch'):
return torch.cuda.device_count()
if tab.selected('tensorflow'):
return len(tf.config.experimental.list_physical_devices('GPU'))
if tab.selected('jax'):
@@ -144,6 +144,7 @@ def comp_conv2d(conv2d, X):
Y = conv2d(X)
# Strip the first two dimensions: examples and channels
return Y.reshape(Y.shape[2:])
# 1 row and column is padded on either side, so a total of 2 rows or columns
# are added
conv2d = nn.LazyConv2d(1, kernel_size=3, padding=1)
21 changes: 16 additions & 5 deletions chapter_linear-classification/softmax-regression-concise.md
@@ -80,7 +80,21 @@ the dataclass.
:end_tab:

```{.python .input}
%%tab pytorch, mxnet, tensorflow
%%tab pytorch
class SoftmaxRegression(d2l.Classifier): #@save
"""The softmax regression model."""
def __init__(self, num_outputs, lr):
super().__init__()
self.save_hyperparameters()
self.net = nn.Sequential(nn.Flatten(),
nn.LazyLinear(num_outputs))
def forward(self, X):
return self.net(X)
```

```{.python .input}
%%tab mxnet, tensorflow
class SoftmaxRegression(d2l.Classifier): #@save
"""The softmax regression model."""
def __init__(self, num_outputs, lr):
@@ -89,9 +103,6 @@ class SoftmaxRegression(d2l.Classifier): #@save
if tab.selected('mxnet'):
self.net = nn.Dense(num_outputs)
self.net.initialize()
if tab.selected('pytorch'):
self.net = nn.Sequential(nn.Flatten(),
nn.LazyLinear(num_outputs))
if tab.selected('tensorflow'):
self.net = tf.keras.models.Sequential()
self.net.add(tf.keras.layers.Flatten())
@@ -207,7 +218,7 @@ def loss(self, params, X, Y, state, averaged=True):

## Training

Next we train our model. As before, we use Fashion-MNIST images, flattened to 784-dimensional feature vectors.
Next we train our model. We use Fashion-MNIST images, flattened to 784-dimensional feature vectors.
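
For orientation, here is a rough sketch of how such a training run can be set up, assuming d2l's `FashionMNIST` data wrapper and `Trainer` class together with the `SoftmaxRegression` model defined above; the hyperparameters shown are illustrative.

```python
# Sketch of the training pattern (assumed d2l API; hyperparameters illustrative).
from d2l import torch as d2l  # PyTorch flavor of the d2l helpers

data = d2l.FashionMNIST(batch_size=256)            # 28x28 grayscale images, 10 classes
model = SoftmaxRegression(num_outputs=10, lr=0.1)  # class defined earlier in this file
trainer = d2l.Trainer(max_epochs=10)
trainer.fit(model, data)
```
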

```{.python .input}
%%tab all
4 changes: 2 additions & 2 deletions chapter_linear-classification/softmax-regression-scratch.md
@@ -205,8 +205,8 @@ before passing the data through our model.
%%tab all
@d2l.add_to_class(SoftmaxRegressionScratch)
def forward(self, X):
return softmax(d2l.matmul(d2l.reshape(
X, (-1, self.W.shape[0])), self.W) + self.b)
X = d2l.reshape(X, (-1, self.W.shape[0]))
return softmax(d2l.matmul(X, self.W) + self.b)
```

## The Cross-Entropy Loss
66 changes: 58 additions & 8 deletions chapter_linear-regression/oo-design.md
@@ -182,10 +182,64 @@ using type annotations. All Flax modules are Python 3.7 dataclasses.
:end_tab:

```{.python .input}
%%tab all
%%tab pytorch
class Module(d2l.nn_Module, d2l.HyperParameters): #@save
"""The base class of models."""
if tab.selected('pytorch', 'mxnet', 'tensorflow'):
def __init__(self, plot_train_per_epoch=2, plot_valid_per_epoch=1):
super().__init__()
self.save_hyperparameters()
self.board = ProgressBoard()
def loss(self, y_hat, y):
raise NotImplementedError
def forward(self, X):
assert hasattr(self, 'net'), 'Neural network is defined'
return self.net(X)
def plot(self, key, value, train):
"""Plot a point in animation."""
assert hasattr(self, 'trainer'), 'Trainer is not inited'
self.board.xlabel = 'epoch'
if train:
x = self.trainer.train_batch_idx / \
self.trainer.num_train_batches
n = self.trainer.num_train_batches / \
self.plot_train_per_epoch
else:
x = self.trainer.epoch + 1
n = self.trainer.num_val_batches / \
self.plot_valid_per_epoch
if tab.selected('mxnet', 'tensorflow'):
self.board.draw(x, d2l.numpy(value), (
'train_' if train else 'val_') + key, every_n=int(n))
if tab.selected('pytorch'):
self.board.draw(x, d2l.numpy(d2l.to(value, d2l.cpu())),
('train_' if train else 'val_') + key,
every_n=int(n))
if tab.selected('jax'):
self.board.draw(x, d2l.to(value, d2l.cpu()),
('train_' if train else 'val_') + key,
every_n=int(n))
def training_step(self, batch):
l = self.loss(self(*batch[:-1]), batch[-1])
self.plot('loss', l, train=True)
return l
def validation_step(self, batch):
l = self.loss(self(*batch[:-1]), batch[-1])
self.plot('loss', l, train=False)
def configure_optimizers(self):
raise NotImplementedError
```

```{.python .input}
%%tab mxnet, tensorflow, jax
class Module(d2l.nn_Module, d2l.HyperParameters): #@save
"""The base class of models."""
if tab.selected('mxnet', 'tensorflow'):
def __init__(self, plot_train_per_epoch=2, plot_valid_per_epoch=1):
super().__init__()
self.save_hyperparameters()
@@ -204,7 +258,7 @@ class Module(d2l.nn_Module, d2l.HyperParameters): #@save
def loss(self, y_hat, y):
raise NotImplementedError
if tab.selected('pytorch', 'mxnet', 'tensorflow'):
if tab.selected('mxnet', 'tensorflow'):
def forward(self, X):
assert hasattr(self, 'net'), 'Neural network is defined'
return self.net(X)
@@ -242,16 +296,12 @@ class Module(d2l.nn_Module, d2l.HyperParameters): #@save
if tab.selected('mxnet', 'tensorflow'):
self.board.draw(x, d2l.numpy(value), (
'train_' if train else 'val_') + key, every_n=int(n))
if tab.selected('pytorch'):
self.board.draw(x, d2l.numpy(d2l.to(value, d2l.cpu())),
('train_' if train else 'val_') + key,
every_n=int(n))
if tab.selected('jax'):
self.board.draw(x, d2l.to(value, d2l.cpu()),
('train_' if train else 'val_') + key,
every_n=int(n))
if tab.selected('pytorch', 'mxnet', 'tensorflow'):
if tab.selected('mxnet', 'tensorflow'):
def training_step(self, batch):
l = self.loss(self(*batch[:-1]), batch[-1])
self.plot('loss', l, train=True)
