Remove accents and period from code comments
astonzhang committed Feb 3, 2023
1 parent 86bb603 commit 6d7b666
Showing 22 changed files with 71 additions and 64 deletions.
@@ -144,7 +144,8 @@ def nadaraya_watson(x_train, y_train, x_val, kernel):
dists = d2l.reshape(x_train, (-1, 1)) - d2l.reshape(x_val, (1, -1))
# Each column/row corresponds to each query/key
k = d2l.astype(kernel(dists), d2l.float32)
attention_w = k / d2l.reduce_sum(k, 0) # Normalization over keys for each query
# Normalization over keys for each query
attention_w = k / d2l.reduce_sum(k, 0)
if tab.selected('pytorch'):
y_hat = y_train@attention_w
if tab.selected('mxnet'):
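For context, the hunk above normalizes the kernel values over the keys (training inputs) for each query (validation input), so each column of attention weights sums to one. A minimal NumPy sketch of the same idea, using a Gaussian kernel and made-up toy data (both are assumptions for illustration, not part of the diff):

```python
import numpy as np

def gaussian_kernel(d):
    return np.exp(-d ** 2 / 2)

# Made-up toy data: 4 training points (keys/values), 3 validation points (queries)
x_train = np.array([0.0, 1.0, 2.0, 3.0])
y_train = np.array([0.5, 1.5, 2.5, 3.5])
x_val = np.array([0.5, 1.5, 2.5])

# dists[i, j] = x_train[i] - x_val[j]
dists = x_train.reshape(-1, 1) - x_val.reshape(1, -1)
k = gaussian_kernel(dists)
# Normalization over keys for each query: each column sums to 1
attention_w = k / k.sum(axis=0)
y_hat = y_train @ attention_w  # one kernel-weighted average per query
print(attention_w.sum(axis=0))  # [1. 1. 1.]
print(y_hat)
```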
@@ -178,7 +178,7 @@ def masked_softmax(X, valid_lens): #@save
%%tab jax
def masked_softmax(X, valid_lens): #@save
"""Perform softmax operation by masking elements on the last axis."""
# `X`: 3D tensor, `valid_lens`: 1D or 2D tensor
# X: 3D tensor, valid_lens: 1D or 2D tensor
def _sequence_mask(X, valid_len, value=0):
maxlen = X.shape[1]
mask = jnp.arange((maxlen),
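For context, `masked_softmax` hides key positions beyond each sequence's valid length before normalizing. A simplified NumPy sketch that handles only the 1D `valid_lens` case (the book's version also accepts 2D lengths); the shapes and the large negative fill value are illustrative assumptions:

```python
import numpy as np

def masked_softmax_1d(X, valid_lens):
    # X: (batch, queries, keys); valid_lens: (batch,) valid key count per example
    mask = np.arange(X.shape[-1]) < valid_lens[:, None, None]
    X = np.where(mask, X, -1e9)            # large negative -> ~0 weight after softmax
    X = X - X.max(axis=-1, keepdims=True)  # subtract the max for numerical stability
    e = np.exp(X)
    return e / e.sum(axis=-1, keepdims=True)

scores = np.random.rand(2, 2, 4)
print(masked_softmax_1d(scores, np.array([2, 3])).round(3))
# Columns beyond each valid length receive (near-)zero weight
```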
2 changes: 1 addition & 1 deletion chapter_builders-guide/init-param.md
@@ -103,7 +103,7 @@ with standard deviation 0.01, while bias parameters cleared to zero.

```{.python .input}
%%tab mxnet
# Here `force_reinit` ensures that parameters are freshly initialized even if
# Here force_reinit ensures that parameters are freshly initialized even if
# they were already initialized previously
net.initialize(init=init.Normal(sigma=0.01), force_reinit=True)
net[0].weight.data()[0]
10 changes: 5 additions & 5 deletions chapter_builders-guide/model-construction.md
@@ -537,15 +537,15 @@ So we implement a `FixedHiddenMLP` class as follows.
class FixedHiddenMLP(nn.Block):
def __init__(self):
super().__init__()
# Random weight parameters created with the `get_constant` method
# Random weight parameters created with the get_constant method
# are not updated during training (i.e., constant parameters)
self.rand_weight = self.params.get_constant(
'rand_weight', np.random.uniform(size=(20, 20)))
self.dense = nn.Dense(20, activation='relu')
def forward(self, X):
X = self.dense(X)
# Use the created constant parameters, as well as the `relu` and `dot`
# Use the created constant parameters, as well as the relu and dot
# functions
X = npx.relu(np.dot(X, self.rand_weight.data()) + 1)
# Reuse the fully connected layer. This is equivalent to sharing
@@ -585,15 +585,15 @@ class FixedHiddenMLP(tf.keras.Model):
def __init__(self):
super().__init__()
self.flatten = tf.keras.layers.Flatten()
# Random weight parameters created with `tf.constant` are not updated
# Random weight parameters created with tf.constant are not updated
# during training (i.e., constant parameters)
self.rand_weight = tf.constant(tf.random.uniform((20, 20)))
self.dense = tf.keras.layers.Dense(20, activation=tf.nn.relu)
def call(self, inputs):
X = self.flatten(inputs)
# Use the created constant parameters, as well as the `relu` and
# `matmul` functions
# Use the created constant parameters, as well as the relu and
# matmul functions
X = tf.nn.relu(tf.matmul(X, self.rand_weight) + 1)
# Reuse the fully connected layer. This is equivalent to sharing
# parameters with two fully connected layers
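The hunks above create random weights that are never updated during training. One idiomatic way to get the same effect in plain PyTorch is a registered buffer; this is a sketch of the idea, not the implementation used in the book:

```python
import torch
from torch import nn

class FixedHiddenMLP(nn.Module):
    def __init__(self):
        super().__init__()
        # A buffer is stored with the module but excluded from parameters(),
        # so the optimizer never updates it
        self.register_buffer('rand_weight', torch.rand(20, 20))
        self.linear = nn.Linear(20, 20)

    def forward(self, X):
        X = self.linear(X)
        X = torch.relu(X @ self.rand_weight + 1)
        # Reuse the fully connected layer: both calls share the same parameters
        return self.linear(X).sum()

net = FixedHiddenMLP()
print(net(torch.rand(2, 20)))
print(sum(p.numel() for p in net.parameters()))  # 420: only the Linear layer's parameters
```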
2 changes: 1 addition & 1 deletion chapter_builders-guide/use-gpu.md
@@ -304,7 +304,7 @@ X

```{.python .input}
%%tab jax
# By default jax puts arrays to GPUs or TPUs if available
# By default JAX puts arrays to GPUs or TPUs if available
X = jax.device_put(jnp.ones((2, 3)), try_gpu())
X
```
20 changes: 10 additions & 10 deletions chapter_convolutional-modern/batch-norm.md
@@ -250,7 +250,7 @@ from mxnet.gluon import nn
npx.set_np()
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
# Use `autograd` to determine whether we are in training mode
# Use autograd to determine whether we are in training mode
if not autograd.is_training():
# In prediction mode, use mean and variance obtained by moving average
X_hat = (X - moving_mean) / np.sqrt(moving_var + eps)
@@ -264,7 +264,7 @@ def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
else:
# When using a two-dimensional convolutional layer, calculate the
# mean and variance on the channel dimension (axis=1). Here we
# need to maintain the shape of `X`, so that the broadcasting
# need to maintain the shape of X, so that the broadcasting
# operation can be carried out later
mean = X.mean(axis=(0, 2, 3), keepdims=True)
var = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
@@ -284,7 +284,7 @@ import torch
from torch import nn
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
# Use `is_grad_enabled` to determine whether we are in training mode
# Use is_grad_enabled to determine whether we are in training mode
if not torch.is_grad_enabled():
# In prediction mode, use mean and variance obtained by moving average
X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
@@ -298,7 +298,7 @@ def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
else:
# When using a two-dimensional convolutional layer, calculate the
# mean and variance on the channel dimension (axis=1). Here we
# need to maintain the shape of `X`, so that the broadcasting
# need to maintain the shape of X, so that the broadcasting
# operation can be carried out later
mean = X.mean(dim=(0, 2, 3), keepdim=True)
var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
@@ -425,9 +425,9 @@ class BatchNorm(nn.Block):
```{.python .input}
%%tab pytorch
class BatchNorm(nn.Module):
# `num_features`: the number of outputs for a fully connected layer
# or the number of output channels for a convolutional layer. `num_dims`:
# 2 for a fully connected layer and 4 for a convolutional layer
# num_features: the number of outputs for a fully connected layer or the
# number of output channels for a convolutional layer. num_dims: 2 for a
# fully connected layer and 4 for a convolutional layer
def __init__(self, num_features, num_dims):
super().__init__()
if num_dims == 2:
@@ -444,12 +444,12 @@ class BatchNorm(nn.Module):
self.moving_var = torch.ones(shape)
def forward(self, X):
# If `X` is not on the main memory, copy `moving_mean` and
# `moving_var` to the device where `X` is located
# If X is not on the main memory, copy moving_mean and moving_var to
# the device where X is located
if self.moving_mean.device != X.device:
self.moving_mean = self.moving_mean.to(X.device)
self.moving_var = self.moving_var.to(X.device)
# Save the updated `moving_mean` and `moving_var`
# Save the updated moving_mean and moving_var
Y, self.moving_mean, self.moving_var = batch_norm(
X, self.gamma, self.beta, self.moving_mean,
self.moving_var, eps=1e-5, momentum=0.1)
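The hunks above switch between batch statistics (training) and moving averages (prediction). A minimal NumPy sketch of the training-mode update for the fully connected (2D) case; the exact momentum convention differs between libraries, so the one used here is an assumption for illustration:

```python
import numpy as np

def batch_norm_train(X, gamma, beta, moving_mean, moving_var, eps=1e-5, momentum=0.1):
    # Fully connected (2D) case: statistics per feature over the batch dimension
    mean = X.mean(axis=0)
    var = X.var(axis=0)
    X_hat = (X - mean) / np.sqrt(var + eps)
    # Exponential moving averages kept for prediction mode
    # (assumed convention: momentum weights the new batch statistics)
    moving_mean = (1 - momentum) * moving_mean + momentum * mean
    moving_var = (1 - momentum) * moving_var + momentum * var
    return gamma * X_hat + beta, moving_mean, moving_var

X = np.random.randn(8, 4) * 3 + 1
Y, m, v = batch_norm_train(X, np.ones(4), np.zeros(4), np.zeros(4), np.ones(4))
print(Y.mean(axis=0).round(6), Y.std(axis=0).round(3))  # roughly 0 and 1 per feature
```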
6 changes: 3 additions & 3 deletions chapter_convolutional-modern/googlenet.md
@@ -49,7 +49,7 @@ from mxnet.gluon import nn
npx.set_np()
class Inception(nn.Block):
# `c1`--`c4` are the number of output channels for each branch
# c1--c4 are the number of output channels for each branch
def __init__(self, c1, c2, c3, c4, **kwargs):
super(Inception, self).__init__(**kwargs)
# Branch 1
@@ -82,7 +82,7 @@ from torch import nn
from torch.nn import functional as F
class Inception(nn.Module):
# `c1`--`c4` are the number of output channels for each branch
# c1--c4 are the number of output channels for each branch
def __init__(self, c1, c2, c3, c4, **kwargs):
super(Inception, self).__init__(**kwargs)
# Branch 1
@@ -111,7 +111,7 @@ import tensorflow as tf
from d2l import tensorflow as d2l
class Inception(tf.keras.Model):
# `c1`--`c4` are the number of output channels for each branch
# c1--c4 are the number of output channels for each branch
def __init__(self, c1, c2, c3, c4):
super().__init__()
self.b1_1 = tf.keras.layers.Conv2D(c1, 1, activation='relu')
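In the Inception block, `c1`--`c4` fix the per-branch output channels, every branch preserves the spatial size, and the branch outputs are concatenated along the channel dimension. A plain-PyTorch sketch with explicit input channels (the versions in the diff use lazy layers, so the `in_channels` argument and the toy shapes here are assumptions):

```python
import torch
from torch import nn
from torch.nn import functional as F

class Inception(nn.Module):
    # c1--c4 are the number of output channels for each branch
    def __init__(self, in_channels, c1, c2, c3, c4):
        super().__init__()
        self.b1_1 = nn.Conv2d(in_channels, c1, kernel_size=1)
        self.b2_1 = nn.Conv2d(in_channels, c2[0], kernel_size=1)
        self.b2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=3, padding=1)
        self.b3_1 = nn.Conv2d(in_channels, c3[0], kernel_size=1)
        self.b3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=5, padding=2)
        self.b4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.b4_2 = nn.Conv2d(in_channels, c4, kernel_size=1)

    def forward(self, x):
        b1 = F.relu(self.b1_1(x))
        b2 = F.relu(self.b2_2(F.relu(self.b2_1(x))))
        b3 = F.relu(self.b3_2(F.relu(self.b3_1(x))))
        b4 = F.relu(self.b4_2(self.b4_1(x)))
        return torch.cat((b1, b2, b3, b4), dim=1)  # concatenate on channels

blk = Inception(192, 64, (96, 128), (16, 32), 32)
print(blk(torch.rand(1, 192, 28, 28)).shape)  # [1, 64 + 128 + 32 + 32, 28, 28]
```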
4 changes: 2 additions & 2 deletions chapter_convolutional-neural-networks/channels.md
@@ -154,8 +154,8 @@ to [**calculate the output of multiple channels**] as shown below.
```{.python .input}
%%tab all
def corr2d_multi_in_out(X, K):
# Iterate through the 0th dimension of `K`, and each time, perform
# cross-correlation operations with input `X`. All of the results are
# Iterate through the 0th dimension of K, and each time, perform
# cross-correlation operations with input X. All of the results are
# stacked together
return d2l.stack([corr2d_multi_in(X, k) for k in K], 0)
```
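For context, `corr2d_multi_in_out` produces one output channel per kernel stacked in the 0th dimension of `K`. A self-contained NumPy sketch with explicit loops (the `corr2d` and `corr2d_multi_in` helpers are re-implemented here for illustration):

```python
import numpy as np

def corr2d(X, K):
    # Single-channel 2D cross-correlation
    h, w = K.shape
    Y = np.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            Y[i, j] = (X[i:i + h, j:j + w] * K).sum()
    return Y

def corr2d_multi_in(X, K):
    # Sum the single-channel results over the input channels
    return sum(corr2d(x, k) for x, k in zip(X, K))

def corr2d_multi_in_out(X, K):
    # One output channel per kernel in the 0th dimension of K
    return np.stack([corr2d_multi_in(X, k) for k in K], 0)

X = np.random.rand(2, 3, 3)     # 2 input channels
K = np.random.rand(4, 2, 2, 2)  # 4 output channels, 2 input channels, 2x2 kernels
print(corr2d_multi_in_out(X, K).shape)  # (4, 2, 2)
```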
16 changes: 9 additions & 7 deletions chapter_convolutional-neural-networks/padding-and-strides.md
@@ -115,7 +115,7 @@ npx.set_np()
# We define a helper function to calculate convolutions. It initializes
# the convolutional layer weights and performs corresponding dimensionality
# elevations and reductions on the input and output.
# elevations and reductions on the input and output
def comp_conv2d(conv2d, X):
conv2d.initialize()
# (1, 1) indicates that batch size and the number of channels are both 1
@@ -135,16 +135,17 @@ comp_conv2d(conv2d, X).shape
import torch
from torch import nn
# We define a helper function to calculate convolutions. It initializes
# the convolutional layer weights and performs corresponding dimensionality
# We define a helper function to calculate convolutions. It initializes the
# convolutional layer weights and performs corresponding dimensionality
# elevations and reductions on the input and output
def comp_conv2d(conv2d, X):
# (1, 1) indicates that batch size and the number of channels are both 1
X = X.reshape((1, 1) + X.shape)
Y = conv2d(X)
# Strip the first two dimensions: examples and channels
return Y.reshape(Y.shape[2:])
# 1 row and column is padded on either side, so a total of 2 rows or columns are added
# 1 row and column is padded on either side, so a total of 2 rows or columns
# are added
conv2d = nn.LazyConv2d(1, kernel_size=3, padding=1)
X = torch.rand(size=(8, 8))
comp_conv2d(conv2d, X).shape
@@ -163,7 +164,8 @@ def comp_conv2d(conv2d, X):
Y = conv2d(X)
# Strip the first two dimensions: examples and channels
return tf.reshape(Y, Y.shape[1:3])
# 1 row and column is padded on either side, so a total of 2 rows or columns are added
# 1 row and column is padded on either side, so a total of 2 rows or columns
# are added
conv2d = tf.keras.layers.Conv2D(1, kernel_size=3, padding='same')
X = tf.random.uniform(shape=(8, 8))
comp_conv2d(conv2d, X).shape
@@ -206,8 +208,8 @@ comp_conv2d(conv2d, X).shape

```{.python .input}
%%tab pytorch
# We use a convolution kernel with height 5 and width 3. The padding on
# either side of the height and width are 2 and 1, respectively
# We use a convolution kernel with height 5 and width 3. The padding on either
# side of the height and width are 2 and 1, respectively
conv2d = nn.LazyConv2d(1, kernel_size=(5, 3), padding=(2, 1))
comp_conv2d(conv2d, X).shape
```
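The snippets above check output shapes empirically; they can also be checked against the usual closed-form expression, roughly floor((n - k + 2p + s) / s) per dimension, where p counts the padding added on each side. A PyTorch sketch, with the toy kernel, padding, and stride values chosen here as assumptions:

```python
import torch
from torch import nn

def comp_conv2d(conv2d, X):
    # Add batch and channel dimensions, run the layer, then strip them again
    X = X.reshape((1, 1) + X.shape)
    Y = conv2d(X)
    return Y.reshape(Y.shape[2:])

def out_dim(n, k, p, s):
    # Output size along one dimension; p is the padding on each side
    return (n - k + 2 * p + s) // s

X = torch.rand(8, 8)
conv2d = nn.Conv2d(1, 1, kernel_size=(5, 3), padding=(2, 1))
print(comp_conv2d(conv2d, X).shape, (out_dim(8, 5, 2, 1), out_dim(8, 3, 1, 1)))  # both (8, 8)

conv2d = nn.Conv2d(1, 1, kernel_size=(3, 5), padding=(0, 1), stride=(3, 4))
print(comp_conv2d(conv2d, X).shape, (out_dim(8, 3, 0, 3), out_dim(8, 5, 1, 4)))  # both (2, 2)
```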
3 changes: 2 additions & 1 deletion chapter_convolutional-neural-networks/pooling.md
@@ -330,7 +330,8 @@ X

```{.python .input}
%%tab tensorflow, jax
X = d2l.concat([X, X + 1], 3) # Concatenate along `dim=3` due to channels-last syntax
# Concatenate along `dim=3` due to channels-last syntax
X = d2l.concat([X, X + 1], 3)
X
```

5 changes: 3 additions & 2 deletions chapter_linear-classification/softmax-regression-concise.md
@@ -109,7 +109,7 @@ class SoftmaxRegression(d2l.Classifier): #@save
@nn.compact
def __call__(self, X):
X = X.reshape((X.shape[0], -1)) # flatten
X = X.reshape((X.shape[0], -1)) # Flatten
X = nn.Dense(self.num_outputs)(X)
return X
```
@@ -194,8 +194,9 @@ def loss(self, Y_hat, Y, averaged=True):
@d2l.add_to_class(d2l.Classifier) #@save
@partial(jax.jit, static_argnums=(0, 5))
def loss(self, params, X, Y, state, averaged=True):
# To be used later (e.g., for batch norm)
Y_hat = state.apply_fn({'params': params}, *X,
mutable=False, rngs=None) # To be used later (e.g., for batch norm)
mutable=False, rngs=None)
Y_hat = d2l.reshape(Y_hat, (-1, Y_hat.shape[-1]))
Y = d2l.reshape(Y, (-1,))
fn = optax.softmax_cross_entropy_with_integer_labels
2 changes: 1 addition & 1 deletion chapter_linear-regression/weight-decay.md
@@ -450,7 +450,7 @@ class WeightDecay(d2l.LinearRegression):
wd: int = 0
def configure_optimizers(self):
# Weight Decay is not available directly within `optax.sgd`, but
# Weight Decay is not available directly within optax.sgd, but
# optax allows chaining several transformations together
return optax.chain(optax.additive_weight_decay(self.wd),
optax.sgd(self.lr))
4 changes: 2 additions & 2 deletions chapter_multilayer-perceptrons/kaggle-house-price.md
@@ -246,12 +246,12 @@ def preprocess(self):
(self.raw_train.drop(columns=['Id', label]),
self.raw_val.drop(columns=['Id'])))
# Standardize numerical columns
numeric_features = features.dtypes[features.dtypes != 'object'].index
numeric_features = features.dtypes[features.dtypes!='object'].index
features[numeric_features] = features[numeric_features].apply(
lambda x: (x - x.mean()) / (x.std()))
# Replace NAN numerical features by 0
features[numeric_features] = features[numeric_features].fillna(0)
# Replace discrete features by one-hot encoding.
# Replace discrete features by one-hot encoding
features = pd.get_dummies(features, dummy_na=True)
# Save preprocessed features
self.train = features[:self.raw_train.shape[0]].copy()
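The preprocessing hunk standardizes the numeric columns, imputes missing numeric values with 0 (the post-standardization mean), and one-hot encodes the discrete columns. A pandas sketch on a tiny made-up frame (the column names and values are assumptions, not the Kaggle data):

```python
import pandas as pd

# Hypothetical toy frame standing in for the Kaggle features
features = pd.DataFrame({'LotArea': [8450.0, 9600.0, None, 11250.0],
                         'MSZoning': ['RL', 'RM', None, 'RL']})

# Standardize numeric columns
numeric_features = features.dtypes[features.dtypes != 'object'].index
features[numeric_features] = features[numeric_features].apply(
    lambda x: (x - x.mean()) / x.std())
# After standardization the mean is 0, so filling NaN with 0 imputes the mean
features[numeric_features] = features[numeric_features].fillna(0)
# One-hot encode discrete features; dummy_na=True keeps a column for missing values
features = pd.get_dummies(features, dummy_na=True)
print(features)
```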
15 changes: 8 additions & 7 deletions chapter_preliminaries/autograd.md
@@ -128,7 +128,8 @@ x = tf.Variable(x)

```{.python .input n=10}
%%tab mxnet
# Our code is inside an `autograd.record` scope to build the computational graph
# Our code is inside an `autograd.record` scope to build the computational
# graph
with autograd.record():
y = 2 * np.dot(x, x)
y
@@ -203,7 +204,7 @@ x_grad
```{.python .input}
%%tab jax
from jax import grad
# the `grad` transform returns a Python function that
# The `grad` transform returns a Python function that
# computes the gradient of the original function
x_grad = grad(y)(x)
x_grad
@@ -376,13 +377,13 @@ x.grad
%%tab tensorflow
with tf.GradientTape() as t:
y = x * x
t.gradient(y, x) # Same as `y = tf.reduce_sum(x * x)`
t.gradient(y, x) # Same as y = tf.reduce_sum(x * x)
```

```{.python .input}
%%tab jax
y = lambda x: x * x
# `grad` is only defined for scalar output functions
# grad is only defined for scalar output functions
grad(lambda x: y(x).sum())(x)
```

@@ -434,8 +435,8 @@ x.grad == u

```{.python .input}
%%tab tensorflow
# Set `persistent=True` to preserve the compute graph.
# This lets us run `t.gradient` more than once
# Set persistent=True to preserve the compute graph.
# This lets us run t.gradient more than once
with tf.GradientTape(persistent=True) as t:
y = x * x
u = tf.stop_gradient(y)
@@ -450,7 +451,7 @@ x_grad == u
import jax
y = lambda x: x * x
# `jax.lax` primitives are Python wrappers around XLA operations
# jax.lax primitives are Python wrappers around XLA operations
u = jax.lax.stop_gradient(y(x))
z = lambda x: u * x
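The TensorFlow and JAX hunks above detach an intermediate value so that downstream gradients treat it as a constant. The PyTorch counterpart (not part of these hunks) is `detach`; a minimal sketch:

```python
import torch

x = torch.arange(4.0, requires_grad=True)
y = x * x
u = y.detach()      # u is treated as a constant with respect to x
z = u * x
z.sum().backward()
print(x.grad == u)  # tensor([True, True, True, True])
```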
2 changes: 1 addition & 1 deletion chapter_preliminaries/calculus.md
@@ -244,7 +244,7 @@ def plot(X, Y=None, xlabel=None, ylabel=None, legend=[], xlim=None,
fmts=('-', 'm--', 'g-.', 'r:'), figsize=(3.5, 2.5), axes=None):
"""Plot data points."""
def has_one_axis(X): # True if `X` (tensor or list) has 1 axis
def has_one_axis(X): # True if X (tensor or list) has 1 axis
return (hasattr(X, "ndim") and X.ndim == 1 or isinstance(X, list)
and not hasattr(X[0], "__len__"))
10 changes: 5 additions & 5 deletions chapter_preliminaries/linear-algebra.md
@@ -383,21 +383,21 @@ same shape as their operands.
```{.python .input}
%%tab mxnet
A = np.arange(6).reshape(2, 3)
B = A.copy() # Assign a copy of `A` to `B` by allocating new memory
B = A.copy() # Assign a copy of A to B by allocating new memory
A, A + B
```

```{.python .input}
%%tab pytorch
A = torch.arange(6, dtype=torch.float32).reshape(2, 3)
B = A.clone() # Assign a copy of `A` to `B` by allocating new memory
B = A.clone() # Assign a copy of A to B by allocating new memory
A, A + B
```

```{.python .input}
%%tab tensorflow
A = tf.reshape(tf.range(6, dtype=tf.float32), (2, 3))
B = A # No cloning of `A` to `B` by allocating new memory
B = A # No cloning of A to B by allocating new memory
A, A + B
```

@@ -548,12 +548,12 @@ is equivalent to summing up all the elements of the matrix.

```{.python .input}
%%tab mxnet, pytorch, jax
A.sum(axis=[0, 1]) == A.sum() # Same as `A.sum()`
A.sum(axis=[0, 1]) == A.sum() # Same as A.sum()
```

```{.python .input}
%%tab tensorflow
tf.reduce_sum(A, axis=[0, 1]), tf.reduce_sum(A) # Same as `tf.reduce_sum(A)`
tf.reduce_sum(A, axis=[0, 1]), tf.reduce_sum(A) # Same as tf.reduce_sum(A)
```

[**A related quantity is the *mean*, also called the *average*.**]
2 changes: 1 addition & 1 deletion chapter_preliminaries/ndarray.md
@@ -406,7 +406,7 @@ X_var

```{.python .input}
%%tab jax
# JAX arrays are immutable. `jax.numpy.ndarray.at` index
# JAX arrays are immutable. jax.numpy.ndarray.at index
# update operators create a new array with the corresponding
# modifications made
X_new_1 = X.at[1, 2].set(17)
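As the comment in this hunk notes, JAX arrays are immutable, so `.at[...]` index updates return a new array instead of modifying in place. A minimal sketch with a made-up array:

```python
import jax.numpy as jnp

X = jnp.arange(12.0).reshape(3, 4)
X_new_1 = X.at[1, 2].set(17.0)        # returns a modified copy
X_new_2 = X_new_1.at[0, :].set(12.0)  # slices can be updated the same way
print(X[1, 2], X_new_1[1, 2])         # 6.0 17.0 -- the original is unchanged
print(X_new_2[0])                     # [12. 12. 12. 12.]
```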