diff --git a/README.md b/README.md
index fa7ecd2..e5f38bd 100644
--- a/README.md
+++ b/README.md
@@ -234,16 +234,16 @@ class EncoderLayer(nn.Module):
         x = self.attention(q=x, k=x, v=x, mask=src_mask)
 
         # 2. add and norm
-        x = self.norm1(x + _x)
         x = self.dropout1(x)
+        x = self.norm1(x + _x)
 
         # 3. positionwise feed forward network
         _x = x
         x = self.ffn(x)
 
         # 4. add and norm
-        x = self.norm2(x + _x)
         x = self.dropout2(x)
+        x = self.norm2(x + _x)
         return x
 ```
 
@@ -298,8 +298,8 @@ class DecoderLayer(nn.Module):
         x = self.self_attention(q=dec, k=dec, v=dec, mask=trg_mask)
 
         # 2. add and norm
-        x = self.norm1(x + _x)
         x = self.dropout1(x)
+        x = self.norm1(x + _x)
 
         if enc is not None:
             # 3. compute encoder - decoder attention
@@ -307,16 +307,16 @@ class DecoderLayer(nn.Module):
             x = self.enc_dec_attention(q=x, k=enc, v=enc, mask=src_mask)
 
             # 4. add and norm
-            x = self.norm2(x + _x)
             x = self.dropout2(x)
+            x = self.norm2(x + _x)
 
         # 5. positionwise feed forward network
         _x = x
         x = self.ffn(x)
 
         # 6. add and norm
-        x = self.norm3(x + _x)
         x = self.dropout3(x)
+        x = self.norm3(x + _x)
         return x
 ```
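
Every hunk above makes the same fix: dropout is applied to the sub-layer output *before* the residual addition and LayerNorm, matching the order in "Attention Is All You Need" (dropout on each sub-layer output, then add & norm). The pre-fix order normalized first and dropped out afterward, so the tensor entering the next sub-layer was no longer the normalized residual sum. A minimal sketch of the corrected post-norm sub-block; the names `PostNormResidual`, `d_model`, and `drop_prob` are illustrative and not part of the repo's code:

```python
import torch
from torch import nn


class PostNormResidual(nn.Module):
    """Hypothetical helper showing the corrected 'add and norm' order:
    dropout on the sub-layer output, then residual add, then LayerNorm."""

    def __init__(self, d_model: int, drop_prob: float = 0.1):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
        x = self.dropout(x)              # 1. regularize the sub-layer output
        return self.norm(x + residual)   # 2. add residual, then normalize


# Usage mirroring the patched EncoderLayer steps 1-2 (hypothetical names):
#   _x = x
#   x = self.attention(q=x, k=x, v=x, mask=src_mask)
#   x = sublayer(x, residual=_x)   # dropout -> add -> norm
```

Factoring the sub-block out this way would make the ordering a single point of truth instead of repeating the dropout/norm pair five times across `EncoderLayer` and `DecoderLayer`, which is how the original bug slipped in.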