
Commit

Remove generalized code for swin transformer
astonzhang committed Feb 9, 2023
1 parent c302295 commit 2fbc422
Showing 18 changed files with 48 additions and 164 deletions.
2 changes: 1 addition & 1 deletion chapter_appendix-tools-for-deep-learning/jupyter.md
@@ -106,7 +106,7 @@ jupyter notebook --generate-config
```


-Then, add the following line to the end of the Jupyter Notebook configuration file (for Linux/macOS, usually in the path `~/.jupyter/jupyter_notebook_config.py`):
+Then, add the following line to the end of the Jupyter Notebook configuration file (for Linux or macOS, usually in the path `~/.jupyter/jupyter_notebook_config.py`):

```
c.NotebookApp.contents_manager_class = 'notedown.NotedownContentsManager'
@@ -93,7 +93,7 @@ Hello world <blank> <blank>
```


-Since we do not want blanks in our attention model we simply need to limit the sum $\sum_{i=1}^n \alpha(\mathbf{q}, \mathbf{k}_i) \mathbf{v}_i$ to $\sum_{i=1}^l \alpha(\mathbf{q}, \mathbf{k}_i) \mathbf{v}_i$ for however long $l \leq n$ the actual sentence is. Since it is such a common problem, it has a name: the *masked softmax operation*.
+Since we do not want blanks in our attention model we simply need to limit $\sum_{i=1}^n \alpha(\mathbf{q}, \mathbf{k}_i) \mathbf{v}_i$ to $\sum_{i=1}^l \alpha(\mathbf{q}, \mathbf{k}_i) \mathbf{v}_i$ for however long $l \leq n$ the actual sentence is. Since it is such a common problem, it has a name: the *masked softmax operation*.

Let's implement it. Actually, the implementation cheats ever so slightly by setting the values to zero $\mathbf{v}_i = 0$ for $i > l$. Moreover, it sets the attention weights to a large negative number, such as $-10^{6}$ in order to make their contribution to gradients and values vanish in practice. This is done since linear algebra kernels and operators are heavily optimized for GPUs and it is faster to be slightly wasteful in computation rather than to have code with conditional (if then else) statements.
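To make the trick concrete, here is a minimal standalone sketch of such a masked softmax (PyTorch; it is not the book's exact `masked_softmax`, and it assumes a 1-D `valid_lens` of per-sequence lengths and the $-10^6$ fill value mentioned above):

```python
import torch
from torch import nn

def masked_softmax_sketch(scores, valid_lens=None):
    """Softmax over the last axis, suppressing positions beyond each valid length."""
    # scores: (batch_size, no. of queries, no. of key-value pairs)
    # valid_lens: (batch_size,), the true length l of each sequence
    if valid_lens is None:
        return nn.functional.softmax(scores, dim=-1)
    num_kv_pairs = scores.shape[-1]
    # Positions i > l get a large negative score, so softmax maps them to ~0
    keep = torch.arange(num_kv_pairs)[None, None, :] < valid_lens[:, None, None]
    scores = scores.masked_fill(~keep, -1e6)
    return nn.functional.softmax(scores, dim=-1)

weights = masked_softmax_sketch(torch.rand(2, 1, 4), torch.tensor([2, 3]))
print(weights)  # trailing entries of each row are (numerically) zero
```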

@@ -319,31 +319,18 @@ we use dropout for model regularization.
%%tab mxnet
class DotProductAttention(nn.Block):  #@save
    """Scaled dot product attention."""
-    def __init__(self, dropout, num_heads=None):
+    def __init__(self, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
-        self.num_heads = num_heads  # To be covered later

    # Shape of queries: (batch_size, no. of queries, d)
    # Shape of keys: (batch_size, no. of key-value pairs, d)
    # Shape of values: (batch_size, no. of key-value pairs, value dimension)
    # Shape of valid_lens: (batch_size,) or (batch_size, no. of queries)
-    def forward(self, queries, keys, values, valid_lens=None,
-                window_mask=None):
+    def forward(self, queries, keys, values, valid_lens=None):
        d = queries.shape[-1]
        # Set transpose_b=True to swap the last two dimensions of keys
        scores = npx.batch_dot(queries, keys, transpose_b=True) / math.sqrt(d)
-        if window_mask is not None:  # To be covered later
-            num_windows = window_mask.shape[0]
-            n, num_queries, num_kv_pairs = scores.shape
-            # Shape of window_mask: (num_windows, no. of queries,
-            # no. of key-value pairs)
-            scores = d2l.reshape(
-                scores, (n//(num_windows*self.num_heads), num_windows,
-                         self.num_heads, num_queries, num_kv_pairs
-                )) + d2l.expand_dims(
-                d2l.expand_dims(window_mask, 1), 0)
-            scores = d2l.reshape(scores, (n, num_queries, num_kv_pairs))
        self.attention_weights = masked_softmax(scores, valid_lens)
        return npx.batch_dot(self.dropout(self.attention_weights), values)
```
@@ -352,31 +339,18 @@ class DotProductAttention(nn.Block): #@save
%%tab pytorch
class DotProductAttention(nn.Module):  #@save
    """Scaled dot product attention."""
-    def __init__(self, dropout, num_heads=None):
+    def __init__(self, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
-        self.num_heads = num_heads  # To be covered later

    # Shape of queries: (batch_size, no. of queries, d)
    # Shape of keys: (batch_size, no. of key-value pairs, d)
    # Shape of values: (batch_size, no. of key-value pairs, value dimension)
    # Shape of valid_lens: (batch_size,) or (batch_size, no. of queries)
-    def forward(self, queries, keys, values, valid_lens=None,
-                window_mask=None):
+    def forward(self, queries, keys, values, valid_lens=None):
        d = queries.shape[-1]
        # Swap the last two dimensions of keys with keys.transpose(1, 2)
        scores = torch.bmm(queries, keys.transpose(1, 2)) / math.sqrt(d)
-        if window_mask is not None:  # To be covered later
-            num_windows = window_mask.shape[0]
-            n, num_queries, num_kv_pairs = scores.shape
-            # Shape of window_mask: (num_windows, no. of queries,
-            # no. of key-value pairs)
-            scores = d2l.reshape(
-                scores, (n//(num_windows*self.num_heads), num_windows,
-                         self.num_heads, num_queries, num_kv_pairs
-                )) + d2l.expand_dims(
-                d2l.expand_dims(window_mask, 1), 0)
-            scores = d2l.reshape(scores, (n, num_queries, num_kv_pairs))
        self.attention_weights = masked_softmax(scores, valid_lens)
        return torch.bmm(self.dropout(self.attention_weights), values)
```
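For a quick sanity check of the shape comments above, the core computation of the simplified `forward` can be run standalone (PyTorch; no masking or dropout, sizes purely illustrative):

```python
import math
import torch

# Hypothetical sizes: batch of 2, 4 queries, 6 key-value pairs, d = 8,
# value dimension 10 (illustrative numbers only).
queries = torch.randn(2, 4, 8)
keys = torch.randn(2, 6, 8)
values = torch.randn(2, 6, 10)

# Scaled dot product attention without masking or dropout
scores = torch.bmm(queries, keys.transpose(1, 2)) / math.sqrt(queries.shape[-1])
attention_weights = torch.softmax(scores, dim=-1)
output = torch.bmm(attention_weights, values)

print(scores.shape)  # torch.Size([2, 4, 6])
print(output.shape)  # torch.Size([2, 4, 10])
```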
@@ -385,31 +359,18 @@ class DotProductAttention(nn.Module): #@save
%%tab tensorflow
class DotProductAttention(tf.keras.layers.Layer):  #@save
    """Scaled dot product attention."""
-    def __init__(self, dropout, num_heads=None):
+    def __init__(self, dropout):
        super().__init__()
        self.dropout = tf.keras.layers.Dropout(dropout)
-        self.num_heads = num_heads  # To be covered later

    # Shape of queries: (batch_size, no. of queries, d)
    # Shape of keys: (batch_size, no. of key-value pairs, d)
    # Shape of values: (batch_size, no. of key-value pairs, value dimension)
    # Shape of valid_lens: (batch_size,) or (batch_size, no. of queries)
-    def call(self, queries, keys, values, valid_lens=None, window_mask=None,
-             **kwargs):
+    def call(self, queries, keys, values, valid_lens=None, **kwargs):
        d = queries.shape[-1]
        scores = tf.matmul(queries, keys, transpose_b=True)/tf.math.sqrt(
            tf.cast(d, dtype=tf.float32))
-        if window_mask is not None:  # To be covered later
-            num_windows = window_mask.shape[0]
-            n, num_queries, num_kv_pairs = scores.shape
-            # Shape of window_mask: (num_windows, no. of queries,
-            # no. of key-value pairs)
-            scores = d2l.reshape(
-                scores, (n//(num_windows*self.num_heads), num_windows,
-                         self.num_heads, num_queries, num_kv_pairs
-                )) + d2l.expand_dims(
-                d2l.expand_dims(window_mask, 1), 0)
-            scores = d2l.reshape(scores, (n, num_queries, num_kv_pairs))
        self.attention_weights = masked_softmax(scores, valid_lens)
        return tf.matmul(self.dropout(self.attention_weights, **kwargs), values)
```
@@ -419,29 +380,17 @@ class DotProductAttention(tf.keras.layers.Layer): #@save
class DotProductAttention(nn.Module):  #@save
    """Scaled dot product attention."""
    dropout: float
-    num_heads: None = None  # To be covered later

    # Shape of queries: (batch_size, no. of queries, d)
    # Shape of keys: (batch_size, no. of key-value pairs, d)
    # Shape of values: (batch_size, no. of key-value pairs, value dimension)
    # Shape of valid_lens: (batch_size,) or (batch_size, no. of queries)
    @nn.compact
    def __call__(self, queries, keys, values, valid_lens=None,
-                 window_mask=None, training=False):
+                 training=False):
        d = queries.shape[-1]
        # Swap the last two dimensions of keys with keys.swapaxes(1, 2)
        scores = queries@(keys.swapaxes(1, 2)) / math.sqrt(d)
-        if window_mask is not None:  # To be covered later
-            num_windows = window_mask.shape[0]
-            n, num_queries, num_kv_pairs = scores.shape
-            # Shape of window_mask: (num_windows, no. of queries,
-            # no. of key-value pairs)
-            scores = d2l.reshape(
-                scores, (n//(num_windows*self.num_heads), num_windows,
-                         self.num_heads, num_queries, num_kv_pairs
-                )) + d2l.expand_dims(
-                d2l.expand_dims(window_mask, 1), 0)
-            scores = d2l.reshape(scores, (n, num_queries, num_kv_pairs))
        attention_weights = masked_softmax(scores, valid_lens)
        dropout_layer = nn.Dropout(self.dropout, deterministic=not training)
        return dropout_layer(attention_weights)@values, attention_weights
@@ -466,6 +466,7 @@ d2l.show_heatmaps(attention_weights[:, :, :, :len(engs[-1].split()) + 1],
When predicting a token, if not all the input tokens are relevant, the RNN encoder-decoder with the Bahdanau attention mechanism selectively aggregates different parts of the input sequence. This is achieved by treating the state (context variable) as an output of additive attention pooling.
In the RNN encoder-decoder, the Bahdanau attention mechanism treats the decoder hidden state at the previous time step as the query, and the encoder hidden states at all the time steps as both the keys and values.
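As a reminder, the "additive attention pooling" mentioned here scores a query–key pair as $a(\mathbf{q}, \mathbf{k}) = \mathbf{w}_v^\top \tanh(\mathbf{W}_q \mathbf{q} + \mathbf{W}_k \mathbf{k})$. A minimal PyTorch sketch of just that scoring step (layer sizes are illustrative; this is not the book's full `AdditiveAttention` class):

```python
import torch
from torch import nn

class AdditiveAttentionScore(nn.Module):
    """Score a(q, k) = w_v^T tanh(W_q q + W_k k), as in Bahdanau attention."""
    def __init__(self, query_size, key_size, num_hiddens):
        super().__init__()
        self.W_q = nn.Linear(query_size, num_hiddens, bias=False)
        self.W_k = nn.Linear(key_size, num_hiddens, bias=False)
        self.w_v = nn.Linear(num_hiddens, 1, bias=False)

    def forward(self, queries, keys):
        # queries: (batch, no. of queries, query_size), e.g. the previous decoder state
        # keys:    (batch, no. of key-value pairs, key_size), e.g. encoder states
        features = self.W_q(queries).unsqueeze(2) + self.W_k(keys).unsqueeze(1)
        return self.w_v(torch.tanh(features)).squeeze(-1)  # (batch, queries, kv pairs)

scores = AdditiveAttentionScore(16, 32, 8)(torch.randn(2, 1, 16), torch.randn(2, 5, 32))
print(scores.shape)  # torch.Size([2, 1, 5])
```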


## Exercises

1. Replace GRU with LSTM in the experiment.
@@ -136,13 +136,13 @@ class MultiHeadAttention(d2l.Module): #@save
                 **kwargs):
        super().__init__()
        self.num_heads = num_heads
-        self.attention = d2l.DotProductAttention(dropout, num_heads)
+        self.attention = d2l.DotProductAttention(dropout)
        self.W_q = nn.Dense(num_hiddens, use_bias=use_bias, flatten=False)
        self.W_k = nn.Dense(num_hiddens, use_bias=use_bias, flatten=False)
        self.W_v = nn.Dense(num_hiddens, use_bias=use_bias, flatten=False)
        self.W_o = nn.Dense(num_hiddens, use_bias=use_bias, flatten=False)

-    def forward(self, queries, keys, values, valid_lens, window_mask=None):
+    def forward(self, queries, keys, values, valid_lens):
        # Shape of queries, keys, or values:
        # (batch_size, no. of queries or key-value pairs, num_hiddens)
        # Shape of valid_lens: (batch_size,) or (batch_size, no. of queries)
@@ -160,8 +160,7 @@ class MultiHeadAttention(d2l.Module): #@save
        # Shape of output: (batch_size * num_heads, no. of queries,
        # num_hiddens / num_heads)
-        output = self.attention(queries, keys, values, valid_lens,
-                                window_mask)
+        output = self.attention(queries, keys, values, valid_lens)
        # Shape of output_concat: (batch_size, no. of queries, num_hiddens)
        output_concat = self.transpose_output(output)
@@ -175,13 +174,13 @@ class MultiHeadAttention(d2l.Module): #@save
    def __init__(self, num_hiddens, num_heads, dropout, bias=False, **kwargs):
        super().__init__()
        self.num_heads = num_heads
-        self.attention = d2l.DotProductAttention(dropout, num_heads)
+        self.attention = d2l.DotProductAttention(dropout)
        self.W_q = nn.LazyLinear(num_hiddens, bias=bias)
        self.W_k = nn.LazyLinear(num_hiddens, bias=bias)
        self.W_v = nn.LazyLinear(num_hiddens, bias=bias)
        self.W_o = nn.LazyLinear(num_hiddens, bias=bias)

-    def forward(self, queries, keys, values, valid_lens, window_mask=None):
+    def forward(self, queries, keys, values, valid_lens):
        # Shape of queries, keys, or values:
        # (batch_size, no. of queries or key-value pairs, num_hiddens)
        # Shape of valid_lens: (batch_size,) or (batch_size, no. of queries)
@@ -200,8 +199,7 @@ class MultiHeadAttention(d2l.Module): #@save
        # Shape of output: (batch_size * num_heads, no. of queries,
        # num_hiddens / num_heads)
-        output = self.attention(queries, keys, values, valid_lens,
-                                window_mask)
+        output = self.attention(queries, keys, values, valid_lens)
        # Shape of output_concat: (batch_size, no. of queries, num_hiddens)
        output_concat = self.transpose_output(output)
        return self.W_o(output_concat)
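The shape comments above assume the heads are folded into the batch dimension before the attention call and unfolded afterwards. A standalone sketch of that bookkeeping (PyTorch; `split_heads`/`merge_heads` are illustrative stand-ins for what the book implements as `transpose_qkv`/`transpose_output`):

```python
import torch

def split_heads(X, num_heads):
    """(batch, seq, num_hiddens) -> (batch * num_heads, seq, num_hiddens / num_heads)."""
    batch, seq, num_hiddens = X.shape
    X = X.reshape(batch, seq, num_heads, num_hiddens // num_heads)
    return X.permute(0, 2, 1, 3).reshape(batch * num_heads, seq, -1)

def merge_heads(X, num_heads):
    """Inverse of split_heads: concatenate the heads back along the feature axis."""
    batch_heads, seq, head_dim = X.shape
    X = X.reshape(batch_heads // num_heads, num_heads, seq, head_dim)
    return X.permute(0, 2, 1, 3).reshape(batch_heads // num_heads, seq, -1)

X = torch.randn(2, 4, 100)           # batch_size=2, 4 queries, num_hiddens=100
Y = split_heads(X, num_heads=5)      # torch.Size([10, 4, 20])
Z = merge_heads(Y, num_heads=5)      # torch.Size([2, 4, 100])
print(Y.shape, Z.shape, torch.allclose(X, Z))
```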
@@ -215,14 +213,13 @@ class MultiHeadAttention(d2l.Module): #@save
                 num_heads, dropout, bias=False, **kwargs):
        super().__init__()
        self.num_heads = num_heads
-        self.attention = d2l.DotProductAttention(dropout, num_heads)
+        self.attention = d2l.DotProductAttention(dropout)
        self.W_q = tf.keras.layers.Dense(num_hiddens, use_bias=bias)
        self.W_k = tf.keras.layers.Dense(num_hiddens, use_bias=bias)
        self.W_v = tf.keras.layers.Dense(num_hiddens, use_bias=bias)
        self.W_o = tf.keras.layers.Dense(num_hiddens, use_bias=bias)

-    def call(self, queries, keys, values, valid_lens, window_mask=None,
-             **kwargs):
+    def call(self, queries, keys, values, valid_lens, **kwargs):
        # Shape of queries, keys, or values:
        # (batch_size, no. of queries or key-value pairs, num_hiddens)
        # Shape of valid_lens: (batch_size,) or (batch_size, no. of queries)
@@ -240,8 +237,7 @@ class MultiHeadAttention(d2l.Module): #@save
        # Shape of output: (batch_size * num_heads, no. of queries,
        # num_hiddens / num_heads)
-        output = self.attention(queries, keys, values, valid_lens,
-                                window_mask, **kwargs)
+        output = self.attention(queries, keys, values, valid_lens, **kwargs)
        # Shape of output_concat: (batch_size, no. of queries, num_hiddens)
        output_concat = self.transpose_output(output)
@@ -257,15 +253,14 @@ class MultiHeadAttention(nn.Module): #@save
    bias: bool = False

    def setup(self):
-        self.attention = d2l.DotProductAttention(self.dropout, self.num_heads)
+        self.attention = d2l.DotProductAttention(self.dropout)
        self.W_q = nn.Dense(self.num_hiddens, use_bias=self.bias)
        self.W_k = nn.Dense(self.num_hiddens, use_bias=self.bias)
        self.W_v = nn.Dense(self.num_hiddens, use_bias=self.bias)
        self.W_o = nn.Dense(self.num_hiddens, use_bias=self.bias)

    @nn.compact
-    def __call__(self, queries, keys, values, valid_lens,
-                 window_mask=None, training=False):
+    def __call__(self, queries, keys, values, valid_lens, training=False):
        # Shape of queries, keys, or values:
        # (batch_size, no. of queries or key-value pairs, num_hiddens)
        # Shape of valid_lens: (batch_size,) or (batch_size, no. of queries)
@@ -283,9 +278,8 @@ class MultiHeadAttention(nn.Module): #@save
        # Shape of output: (batch_size * num_heads, no. of queries,
        # num_hiddens / num_heads)
-        output, attention_weights = self.attention(queries, keys, values,
-                                                    valid_lens, window_mask,
-                                                    training=training)
+        output, attention_weights = self.attention(
+            queries, keys, values, valid_lens, training=training)
        # Shape of output_concat: (batch_size, no. of queries, num_hiddens)
        output_concat = self.transpose_output(output)
        return self.W_o(output_concat), attention_weights
@@ -492,7 +492,6 @@ To use the sequence order information,
we can inject absolute or relative positional information
by adding positional encoding to the input representations.
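One common absolute encoding is the sinusoidal one, $P_{i,2j} = \sin\big(i/10000^{2j/d}\big)$ and $P_{i,2j+1} = \cos\big(i/10000^{2j/d}\big)$. A short PyTorch sketch (sizes are illustrative; the book's `PositionalEncoding` class may differ in details such as dropout):

```python
import torch

def sinusoidal_positional_encoding(max_len, num_hiddens):
    """P[i, 2j] = sin(i / 10000^(2j/d)), P[i, 2j+1] = cos(i / 10000^(2j/d))."""
    # Assumes num_hiddens is even
    position = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)
    div = torch.pow(10000, torch.arange(0, num_hiddens, 2, dtype=torch.float32)
                    / num_hiddens)
    P = torch.zeros(max_len, num_hiddens)
    P[:, 0::2] = torch.sin(position / div)
    P[:, 1::2] = torch.cos(position / div)
    return P

X = torch.randn(2, 60, 32)                      # (batch, seq_len, num_hiddens)
X = X + sinusoidal_positional_encoding(60, 32)  # broadcast over the batch axis
print(X.shape)                                  # torch.Size([2, 60, 32])
```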


## Exercises

1. Suppose that we design a deep architecture to represent a sequence by stacking self-attention layers with positional encoding. What could be issues?
@@ -1389,6 +1389,7 @@ are important for training a very deep model.
The positionwise feed-forward network in the Transformer model
transforms the representation at all the sequence positions using the same MLP.
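That is, the same two dense layers act on the feature (last) axis at every position. A minimal sketch in that spirit (PyTorch; the hidden and output sizes are illustrative, not the book's defaults):

```python
import torch
from torch import nn

class PositionWiseFFNSketch(nn.Module):
    """Apply the same two-layer MLP independently at every sequence position."""
    def __init__(self, ffn_num_hiddens, ffn_num_outputs):
        super().__init__()
        self.dense1 = nn.LazyLinear(ffn_num_hiddens)
        self.relu = nn.ReLU()
        self.dense2 = nn.LazyLinear(ffn_num_outputs)

    def forward(self, X):
        # X: (batch_size, seq_len, feature_dim); linear layers act on the last axis,
        # so every position is transformed by the same weights
        return self.dense2(self.relu(self.dense1(X)))

ffn = PositionWiseFFNSketch(16, 8)
print(ffn(torch.randn(2, 5, 4)).shape)  # torch.Size([2, 5, 8])
```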


## Exercises

1. Train a deeper Transformer in the experiments. How does it affect the training speed and the translation performance?
@@ -397,7 +397,6 @@ and added back convolution-like priors,
extending the applicability of Transformers to a range of computer vision tasks
beyond image classification with state-of-the-art results :cite:`liu2021swin`.


## Exercises

1. How does the value of `img_size` affect training time?
@@ -1,6 +1,7 @@
# Fine-Tuning BERT for Sequence-Level and Token-Level Applications
:label:`sec_finetuning-bert`


In the previous sections of this chapter,
we have designed different models for natural language processing applications,
such as based on RNNs, CNNs, attention, and MLPs.
@@ -1,7 +1,6 @@
# Natural Language Inference: Fine-Tuning BERT
:label:`sec_natural-language-inference-bert`


In earlier sections of this chapter,
we have designed an attention-based architecture
(in :numref:`sec_natural-language-inference-attention`)
@@ -418,7 +418,6 @@ len(vocab)
* We can arbitrarily access the pretraining (masked language modeling and next sentence prediction) examples generated from a pair of sentences from the WikiText-2 corpus.



## Exercises

1. For simplicity, the period is used as the only delimiter for splitting sentences. Try other sentence splitting techniques, such as the spaCy and NLTK. Take NLTK as an example. You need to install NLTK first: `pip install nltk`. In the code, first `import nltk`. Then, download the Punkt sentence tokenizer: `nltk.download('punkt')`. To split sentences such as `sentences = 'This is great ! Why not ?'`, invoking `nltk.tokenize.sent_tokenize(sentences)` will return a list of two sentence strings: `['This is great !', 'Why not ?']`.
@@ -295,7 +295,6 @@ encoded_pair.shape, encoded_pair_cls.shape, encoded_pair_crane[0][:3]
In :numref:`chap_nlp_app`, we will fine-tune a pretrained BERT model
for downstream natural language processing applications.


## Summary

* The original BERT has two versions, where the base model has 110 million parameters and the large model has 340 million parameters.
1 change: 0 additions & 1 deletion chapter_natural-language-processing-pretraining/bert.md
@@ -603,7 +603,6 @@ class BERTModel(nn.Module):
* Pretraining BERT is composed of two tasks: masked language modeling and next sentence prediction. The former is able to encode bidirectional context for representing words, while the latter explicitly models the logical relationship between text pairs.



## Exercises

1. All other things being equal, will a masked language model require more or fewer pretraining steps to converge than a left-to-right language model? Why?
3 changes: 2 additions & 1 deletion chapter_recurrent-neural-networks/language-model.md
@@ -35,7 +35,8 @@ $$P(x_1, x_2, \ldots, x_T) = \prod_{t=1}^T P(x_t \mid x_1, \ldots, x_{t-1}).$$
For example,
the probability of a text sequence containing four words would be given as:

-$$P(\text{deep}, \text{learning}, \text{is}, \text{fun}) = P(\text{deep}) P(\text{learning} \mid \text{deep}) P(\text{is} \mid \text{deep}, \text{learning}) P(\text{fun} \mid \text{deep}, \text{learning}, \text{is}).$$
+$$\begin{aligned}&P(\text{deep}, \text{learning}, \text{is}, \text{fun}) \\
+=&P(\text{deep}) P(\text{learning} \mid \text{deep}) P(\text{is} \mid \text{deep}, \text{learning}) P(\text{fun} \mid \text{deep}, \text{learning}, \text{is}).\end{aligned}$$
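For instance, plugging in made-up conditional probabilities shows how the factorization is evaluated in practice (numbers purely illustrative):

```python
# P(deep), P(learning | deep), P(is | deep, learning), P(fun | deep, learning, is)
conditionals = [0.01, 0.2, 0.3, 0.1]

joint = 1.0
for p in conditionals:
    joint *= p
print(joint)  # ~6e-05 = P(deep, learning, is, fun) under these made-up values
```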

### Markov Models and $n$-grams

2 changes: 1 addition & 1 deletion chapter_recurrent-neural-networks/sequence.md
@@ -161,7 +161,7 @@ $P(x_t \mid x_{t-1}, \ldots, x_1)$
or some statistic(s) of this distribution.

A few strategies recur frequently.
-First, we might believe that although long sequences
+First of all, we might believe that although long sequences
$x_{t-1}, \ldots, x_1$ are available,
it may not be necessary
to look back so far in the history
