From 58653b8c39205736610242a093b7be9fd9856ae5 Mon Sep 17 00:00:00 2001
From: Sidd Karamcheti <skaramcheti@cs.stanford.edu>
Date: Sun, 10 Apr 2022 19:12:59 -0400
Subject: [PATCH 1/3] Bump README

---
 README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 75fcf89..26bcae5 100644
--- a/README.md
+++ b/README.md
@@ -37,9 +37,13 @@ Gets "best" 97.76% accuracy in 10 epochs @ 40s/epoch on a TitanRTX.
 ```
 # Following @frederick0329's/@albertgu's results: https://github.com/srush/annotated-s4/pull/43#issuecomment-1065444261
 python -m s4.train --dataset cifar-classification --model s4 --epoch 100 --bsz 64 --n_layers 6 --p_dropout 0.25 --lr 5e-3 --d_model 512
+
+# DSS Model
+python -m s4.train --dataset cifar-classification --model dss --epoch 100 --bsz 64 --n_layers 6 --p_dropout 0.25 --lr 5e-3 --d_model 512
 ```
 
-Gets "best" 85.81% accuracy after 100 epochs @ 3m8s/epoch on a TitanRTX
+- S4 gets "best" 87.05% accuracy after 100 epochs @ 3m8s/epoch on a TitanRTX.
+- DSS gets "best" 88.90% accuracy after 100 epochs @ 3m11s/epoch on a TitanRTX.
 
 ---
 

From 740fc01220ec337d93a895184a3e71885e86abcd Mon Sep 17 00:00:00 2001
From: Sidd Karamcheti <skaramcheti@cs.stanford.edu>
Date: Mon, 11 Apr 2022 14:58:59 -0400
Subject: [PATCH 2/3] Add outline for DSS post

---
 s4/dss.py | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 s4/s4.py  |   2 +-
 2 files changed, 117 insertions(+), 8 deletions(-)

diff --git a/s4/dss.py b/s4/dss.py
index fe43061..44ec7dd 100644
--- a/s4/dss.py
+++ b/s4/dss.py
@@ -1,14 +1,124 @@
-import s4.s4 as s4
-
+# <center><h1> The Diagonal State Space Model </h1></center>
+#
+#
+# <center>
+# <p><a href="https://arxiv.org/abs/2203.14343">Diagonal State Spaces are as Effective as Structured State Spaces</a></p>
+# </center>
+#
+# <center>
+# <p> Ankit Gupta</p></center>
+#
+# ---
+#
+# *Note: This page is meant as a standalone complement to Section 2 [TODO Link] of the original
+# blog post.*
+#
+# The months following the release of the S4 paper by Gu et al. were characterized by a wave of excitement around the
+# new model, its ability to handle extremely long sequences, and, more generally, what such a departure from
+# Transformer-based architectures could mean. The original authors came out with a
+# [follow-up paper applying S4 to audio generation](https://arxiv.org/abs/2202.09729), and weeks later, a completely
+# [different group applied S4 to long-range movie clip classification](https://arxiv.org/abs/2204.01692).
+#
+# Yet, it remains hard to parse aspects of the implementation, especially the derivation of the diagonal plus low-rank
+# constraint on $\boldsymbol{A}$. Not only is this math fairly complex, but in code it required the use of custom CUDA
+# kernels -- further obfuscating the implementation (which is why this blog uses Jax to efficiently compile the
+# relevant operations).
+#
+# However, at the end of March 2022, an alternative construction for state space models was proposed in [Diagonal
+# State Spaces are as Effective as Structured State Spaces](https://arxiv.org/abs/2203.14343). This short paper derives
+# a learnable state space model that 1) is simple, 2) requires no custom kernels, and 3) can be efficiently
+# implemented in Jax or PyTorch in just a dozen lines. The rest of this post steps through this alternative
+# derivation, serving as **a complete standalone complement to Section 2** of the original Annotated S4 post.
+#
+# We'll still be using Jax with the Flax NN Library for consistency with the original post, though this Diagonal State
+# Space (DSS) variant can be easily implemented in PyTorch with some minor changes.
+
+# import s4.s4 as s4  TODO -- For some reason breaks streamlit...
+import s4
 from functools import partial
 import jax
 import jax.numpy as np
 from flax import linen as nn
 from jax.nn.initializers import lecun_normal
-from jax.numpy.linalg import eig
 
 rng = jax.random.PRNGKey(1)
 
+# ## Table of Contents
+# <nav id="TOC">
+# <ul>
+#   <li>Step 1. The Problem with the SSM Convolutional Kernel
+#       <ul>
+#           <li>Rethinking Discretization</li>
+#           <li>Rewriting the SSM Kernel</li>
+#           <li>Diagonalization & Efficient Matrix Powers</li>
+#       </ul>
+#   </li>
+#   <li>Step 2. Deriving the Diagonal State Space Model
+#       <ul>
+#           <li>Proving Proposition 1 from the DSS Paper</li>
+#           <li>Secret Sauce 1: Handling the Complex Softmax</li>
+#           <li>Secret Sauce 2: Initializing with the HiPPO Matrix</li>
+#       </ul>
+#   </li>
+#   <li>Step 3. Putting the DSS Layer Together
+#       <ul>
+#           <li>The DSS Block</li>
+#           <li>Limitations</li>
+#       </ul>
+#   </li>
+# </ul></nav>
+
+
+# ## Step 1. The Problem with the SSM Convolutional Kernel
+#
+# We're going to start by taking a step back – back to the original State Space model formulation itself.
+#
+# ### Rethinking Discretization
+# - Sketch SSM as an ODE
+# - Motivate need for discretization... how do we discretize? Bilinear method is what S4 uses, but you can also just
+# *solve the ODE directly* (yields $\bar{\boldsymbol{A}} = e^{\boldsymbol{A}\Delta}$).
+#
+# ### Rewriting the SSM Kernel
+# - Pull in equation from Part 1 for the kernel.
+# - Note repeated multiplication by A (matrix power)
+# - Time complexity of matrix power sucks!
+# - Unless... *diagonalization*
+#
+# ### Diagonalization & Efficient Matrix Powers
+# - If we can find a way to write $\bar{\boldsymbol{A}}$ as a diagonal matrix, the matrix power defining the kernel
+# becomes *trivial*.
+# - How?
+
+
+# ## Step 2. Deriving the Diagonal State Space Model
+# Given the benefits of diagonalization, how do we construct a diagonal $\bar{\boldsymbol{A}}$ that leads to efficient
+# computation of the SSM kernel $\bar{\boldsymbol{K}}$?
+#
+# ### Proving Proposition 1 from the DSS Paper
+# - Step through original proposition
+# - Step through proof in Appendix (simplified)
+#
+# ### Secret Sauce 1: Handling the Complex Softmax
+# - Part of the reason this initialization works is that we're initializing our diagonal matrix $\Lambda$ in complex
+# space.
+# - This means that our typical softmax stops behaving well... so we need to fix it!
+#
+# ### Secret Sauce 2: Initializing with the HiPPO Matrix
+# - Stability is still tricky
+# - HiPPO theory is still necessary (at the beginning) for initializing our weights.
+
+
+# ## Step 3. Putting the DSS Layer Together
+# Mostly just define the `DSSLayer` module and the `DSSLayerInit` function, as well as the final test.
+#
+# ### Limitations
+# - RNN Autoregressive Usage (still being worked out)
+# - Still not as performant as S4 in certain settings (expressivity)
+# - Still tied to HiPPO theory
+
+
+
+## TODO -- Need to weave these parts through the sections above...
 
 def complex_softmax(x, eps=1e-7):
     def reciprocal(x):
@@ -31,7 +141,7 @@ def dss_ssm(W, Lambda, L, step):
                  1 / (l * (np.exp(l * np.arange(L) * step)).sum()))
     Bbar = b(Lambda).reshape(N, 1)
     Cbar = W.reshape(1, N)
-    return (Abar, Bbar, Cbar)
+    return Abar, Bbar, Cbar
 
 
 class DSSLayer(nn.Module):
@@ -64,7 +174,7 @@ def init_discrete():
             self.x_k_1 = self.variable(
                 "cache", "cache_x_k", np.zeros, (self.N,), np.complex64
             )
-            
+
     def __call__(self, u):
         if not self.decode:
             return s4.non_circular_convolution(u, self.K) + self.D * u
@@ -85,14 +195,13 @@ def DSSLayerInit(N):
 
 
 def test_conversion(N=8, L=16):
-    "Maybe this a general test?"
+    """Maybe this is a general test?"""
     step = 1.0 / L
     W = lecun_normal()(rng, (1, N, 2))
     W = W[..., 0] + 1j * W[..., 1]
     _, Lambda, _, _, _ = s4.make_NPLR_HiPPO(2 * N)
     Lambda = Lambda[np.nonzero(Lambda.imag > 0, size=N)]
 
-    
     K = dss_kernel(W, Lambda, L, step)
     ssm = dss_ssm(W, Lambda, L, step)
 
diff --git a/s4/s4.py b/s4/s4.py
index 8bd0604..8adcf53 100644
--- a/s4/s4.py
+++ b/s4/s4.py
@@ -546,7 +546,7 @@ def init(key, shape):
     return init
 
 
-# For the SMM layer most of the work is to build the filter.
+# For the SSM layer most of the work is to build the filter.
 # The actual call to the network is just the (huge) convolution we specified above.
 #
 # Note for Torch users: `setup` in Flax is called each time the parameters are updated.

From 8c9a6ed8204fe565779c9e9ea10bdfd9231e2a70 Mon Sep 17 00:00:00 2001
From: Sidd Karamcheti <skaramcheti@cs.stanford.edu>
Date: Mon, 2 May 2022 17:39:24 -0700
Subject: [PATCH 3/3] Add DSS post [WIP]

---
 Makefile  |  10 +-
 s4.ipynb  | 346 +++++++++++++++++++++++----------------------
 s4/dss.py | 413 ++++++++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 521 insertions(+), 248 deletions(-)

diff --git a/Makefile b/Makefile
index 515b0fc..1ff05da 100644
--- a/Makefile
+++ b/Makefile
@@ -25,15 +25,19 @@ autoformat:
 	black s4/s4.py s4/data.py s4/train.py s4/sample.py
 	flake8 --show-source s4/s4.py s4/data.py s4/train.py s4/sample.py
 
-notebook: s4/s4.py
+notebook: s4/s4.py s4/dss.py
 	jupytext --to notebook s4/s4.py -o s4.ipynb
+	jupytext --to notebook s4/dss.py -o dss.ipynb
 
-html: s4/s4.py
+html: s4/s4.py s4/dss.py
 	jupytext --to notebook s4/s4.py -o s4.ipynb
 	jupyter nbconvert --to html s4.ipynb
+	jupytext --to notebook s4/dss.py -o dss.ipynb
+	jupyter nbconvert --to html dss.ipynb
 
-s4/s4.md: s4/s4.py
+s4/s4.md: s4/s4.py s4/dss.py
 	jupytext --to markdown s4/s4.py
+	jupytext --to markdown s4/dss.py
 
 blog: s4/s4.md
 	pandoc docs/header-includes.yaml s4/s4.md  --katex=/usr/local/lib/node_modules/katex/dist/ --output=docs/index.html --to=html5 --css=docs/github.min.css --css=docs/tufte.css --no-highlight --self-contained --metadata pagetitle="The Annotated S4"
diff --git a/s4.ipynb b/s4.ipynb
index 68586aa..335857d 100644
--- a/s4.ipynb
+++ b/s4.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "3ac06c5e",
+   "id": "6228a40d",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -22,7 +22,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "9baee6b0",
+   "id": "31a052cb",
    "metadata": {},
    "source": [
     "*Blog Post and [Library](https://github.com/srush/annotated-s4/) by [Sasha Rush](http://rush-nlp.com/) and [Sidd Karamcheti](https://www.siddkaramcheti.com/)*, v2"
@@ -30,7 +30,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "925fd4d2",
+   "id": "83a16043",
    "metadata": {},
    "source": [
     "\n",
@@ -45,7 +45,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "12066a9d",
+   "id": "ac46bab2",
    "metadata": {},
    "source": [
     "<img src=\"images/table.png\" width=\"100%\"/>"
@@ -53,7 +53,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "88da8a70",
+   "id": "e48404e8",
    "metadata": {},
    "source": [
     "The paper is also a refreshing departure from Transformers, taking a\n",
@@ -84,7 +84,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "f170d1e3",
+   "id": "20ae99da",
    "metadata": {},
    "source": [
     "## Table of Contents"
@@ -92,7 +92,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "281b75b8",
+   "id": "c975e7d1",
    "metadata": {},
    "source": [
     "<nav id=\"TOC\">\n",
@@ -126,7 +126,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "fd1aa624",
+   "id": "9d16cc80",
    "metadata": {},
    "source": [
     "Note that this project uses [JAX](https://github.com/google/jax/)\n",
@@ -145,7 +145,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "796f0ace",
+   "id": "e2276130",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -163,7 +163,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "d38b0c41",
+   "id": "8f3eda07",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -174,7 +174,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "71f77754",
+   "id": "38e7daf4",
    "metadata": {},
    "source": [
     "## Part 1: State Space Models"
@@ -182,7 +182,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "2c58cb63",
+   "id": "90c46d6b",
    "metadata": {},
    "source": [
     "Let's get started! Our goal is the efficient\n",
@@ -195,7 +195,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "1ff326b6",
+   "id": "8aed7501",
    "metadata": {},
    "source": [
     "> The [state space model](https://en.wikipedia.org/wiki/State-space_representation) is defined by this simple equation.\n",
@@ -222,7 +222,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "d7b8b3e9",
+   "id": "5bce223e",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -234,7 +234,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "a3b2ac1c",
+   "id": "eb421900",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -250,7 +250,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "a0c7c428",
+   "id": "6c58b9e4",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -279,7 +279,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2379071e",
+   "id": "fe2a0cb7",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -295,7 +295,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "fd08eb1a",
+   "id": "f3b98baa",
    "metadata": {},
    "source": [
     "> This equation is now a *sequence-to-sequence* map $u_k \\mapsto y_k$ instead of function-to-function.\n",
@@ -311,7 +311,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "4ce8ee3a",
+   "id": "babd5006",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -325,7 +325,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "53322caa",
+   "id": "a2b5d8da",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -342,7 +342,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "9159a76a",
+   "id": "b890acba",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -354,7 +354,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "03f627d1",
+   "id": "dc1f05f2",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -371,7 +371,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "c2f451b5",
+   "id": "026ab57a",
    "metadata": {},
    "source": [
     "### Tangent: A Mechanics Example"
@@ -379,7 +379,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "d19bf61b",
+   "id": "309aa6b7",
    "metadata": {},
    "source": [
     " To gain some more intuition and test our SSM implementation, we pause\n",
@@ -392,7 +392,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "2d6908bf",
+   "id": "d0d710c5",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -405,7 +405,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "dfc26927",
+   "id": "a8dfe3a2",
    "metadata": {},
    "source": [
     "Rewriting this in matrix form yields an SSM in the following form:"
@@ -413,7 +413,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "f8809567",
+   "id": "5f03a622",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -429,7 +429,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "678002ca",
+   "id": "738cbb24",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -444,7 +444,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "b2ecbc3e",
+   "id": "fbefe65a",
    "metadata": {},
    "source": [
     " Looking at the $\\boldsymbol{C}$, we should be able to convince ourselves that the\n",
@@ -455,7 +455,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "820f39cb",
+   "id": "bc2fdc95",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -466,7 +466,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "a3668e3a",
+   "id": "1e6b1538",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -480,7 +480,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "f840a422",
+   "id": "60aec1b8",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -491,7 +491,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f414e465",
+   "id": "9381b4e9",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -541,7 +541,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2abba1de",
+   "id": "43d4d671",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -551,7 +551,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "92932da2",
+   "id": "a01712d7",
    "metadata": {},
    "source": [
     "<img src=\"line.gif\" width=\"100%\">"
@@ -559,7 +559,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "ffd57785",
+   "id": "b751f097",
    "metadata": {},
    "source": [
     "Neat! And that it was just 1 SSM, with 2 hidden states over 100 steps.\n",
@@ -569,7 +569,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "a6d271f7",
+   "id": "3d6dfab4",
    "metadata": {},
    "source": [
     "### Training SSMs: The Convolutional Representation"
@@ -577,7 +577,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "136e3611",
+   "id": "7ca79239",
    "metadata": {},
    "source": [
     "The punchline of this section is that we can turn the \"RNN\" above into a \"CNN\"\n",
@@ -586,7 +586,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "3029d7e9",
+   "id": "4459c236",
    "metadata": {},
    "source": [
     "> The recurrent SSM is not practical for training on modern hardware\n",
@@ -630,7 +630,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "855b4bc0",
+   "id": "0548f0a8",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -641,7 +641,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "9134faae",
+   "id": "00a046ea",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -655,7 +655,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "5996a8b0",
+   "id": "5fde88df",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -667,7 +667,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "b6e68c6b",
+   "id": "d218a7e0",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -680,7 +680,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "df74381f",
+   "id": "26bef27e",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -699,7 +699,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "fddf4a38",
+   "id": "268e085c",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -710,7 +710,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "be6836db",
+   "id": "a55229c4",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -733,7 +733,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "202fafa9",
+   "id": "ce2e04bc",
    "metadata": {},
    "source": [
     "At this point we have all of the machinery used for SSM training. The next\n",
@@ -742,7 +742,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "2e851844",
+   "id": "b32adca7",
    "metadata": {},
    "source": [
     "### Addressing Long-Range Dependencies with HiPPO"
@@ -750,7 +750,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "bceead60",
+   "id": "4dbd8a47",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -786,7 +786,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "cf1b9a5c",
+   "id": "33a204d3",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -802,7 +802,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "9e43e0e9",
+   "id": "1b164fac",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -824,7 +824,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "a39ea3f7",
+   "id": "09cc1b1a",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -840,7 +840,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "4d0fd041",
+   "id": "21144c6a",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -894,7 +894,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "5c933dad",
+   "id": "13362641",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -904,7 +904,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "11a3c622",
+   "id": "5ee4bb46",
    "metadata": {},
    "source": [
     "The red line represents that curve we are approximating,\n",
@@ -916,7 +916,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "8011a1ce",
+   "id": "bf97dc34",
    "metadata": {},
    "source": [
     "<img src=\"images/leg.png\" width=\"100%\">"
@@ -924,7 +924,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "53615603",
+   "id": "4a449e45",
    "metadata": {},
    "source": [
     "### An SSM Neural Network."
@@ -932,7 +932,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "a22eee2b",
+   "id": "1d9b3453",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -948,7 +948,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "ad9211d3",
+   "id": "0ef40970",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -965,12 +965,12 @@
   },
   {
    "cell_type": "markdown",
-   "id": "6086baf3",
+   "id": "124154dc",
    "metadata": {
     "lines_to_next_cell": 2
    },
    "source": [
-    "For the SMM layer most of the work is to build the filter.\n",
+    "For the SSM layer most of the work is to build the filter.\n",
     "The actual call to the network is just the (huge) convolution we specified above.\n",
     "\n",
     "Note for Torch users: `setup` in Flax is called each time the parameters are updated.\n",
@@ -985,7 +985,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "05ea0fba",
+   "id": "0a756f1c",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1027,7 +1027,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "44f9fb15",
+   "id": "2c39cc29",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1041,7 +1041,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "3f0a3bf0",
+   "id": "0e042c7e",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1059,7 +1059,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "0cc16c28",
+   "id": "5ab89c58",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1070,7 +1070,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "0e51b570",
+   "id": "e2f950cf",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1082,7 +1082,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "25a21296",
+   "id": "5cdcdbe6",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1095,7 +1095,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "d9b088e0",
+   "id": "e96c2a6e",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1127,7 +1127,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "71d7363d",
+   "id": "bf79e090",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1140,7 +1140,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "d84ae631",
+   "id": "01ed1bea",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1184,7 +1184,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "82a8ef1b",
+   "id": "c855e827",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1197,7 +1197,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "e222f454",
+   "id": "ad31b522",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1214,7 +1214,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "c093e8b2",
+   "id": "f2674b3d",
    "metadata": {},
    "source": [
     "Overall, this defines a sequence-to-sequence map of shape (batch size, sequence length, hidden dimension),\n",
@@ -1223,7 +1223,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "86051b63",
+   "id": "ccf84679",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1236,7 +1236,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "8427a294",
+   "id": "ac72cae0",
    "metadata": {},
    "source": [
     "## Part 2: Implementing S4"
@@ -1244,7 +1244,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "daab2818",
+   "id": "616bfbff",
    "metadata": {},
    "source": [
     "Warning: this section has a lot of math. Roughly it boils down to finding a\n",
@@ -1255,7 +1255,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "22e0ac35",
+   "id": "823d0925",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1265,7 +1265,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "cfa7af40",
+   "id": "8caf0961",
    "metadata": {},
    "source": [
     "> The fundamental bottleneck in computing the discrete-time SSM\n",
@@ -1278,7 +1278,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "41eaa4dd",
+   "id": "50e93db5",
    "metadata": {},
    "source": [
     "Specifically, recall this function here:"
@@ -1286,7 +1286,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "0cbfccee",
+   "id": "9069c8e2",
    "metadata": {},
    "source": [
     "```python\n",
@@ -1299,7 +1299,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "253594e0",
+   "id": "bf3978ec",
    "metadata": {},
    "source": [
     "The contribution of S4 is a stable method for speeding up this particular operation.\n",
@@ -1310,7 +1310,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "fbb623bf",
+   "id": "1487be39",
    "metadata": {},
    "source": [
     "**DPLR:** SSM is  $(\\boldsymbol{\\Lambda} - \\boldsymbol{p}\\boldsymbol{q}^*, \\boldsymbol{B}, \\boldsymbol{C})$ for some diagonal $\\boldsymbol{\\Lambda}$ and vectors $\\boldsymbol{p}, \\boldsymbol{q}, \\boldsymbol{B}, \\boldsymbol{C} \\in \\mathbb{C}^{N \\times 1}$.\n",
@@ -1321,7 +1321,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "c42f407a",
+   "id": "42b10f48",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1335,7 +1335,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "b6da1976",
+   "id": "592cf2a3",
    "metadata": {},
    "source": [
     "### Step 1. SSM Generating Functions"
@@ -1343,7 +1343,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "b8ee4ab6",
+   "id": "c0bf3dc7",
    "metadata": {},
    "source": [
     "The main step will be switching from computing the sequence to computing its generating function.\n",
@@ -1352,7 +1352,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "40115e6a",
+   "id": "da464939",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1370,7 +1370,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "8f5e6afe",
+   "id": "662a0e74",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1387,7 +1387,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "2c640217",
+   "id": "7ae03f39",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1402,7 +1402,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "0d38271b",
+   "id": "fcaa6300",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1420,7 +1420,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "f62f9ead",
+   "id": "000f6a74",
    "metadata": {},
    "source": [
     "More importantly, in the generating function we can replace the matrix power with an inverse!\n",
@@ -1431,7 +1431,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "08d06c93",
+   "id": "7758ccf0",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1443,7 +1443,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "0f83b269",
+   "id": "59ddbd36",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1458,7 +1458,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "79a89999",
+   "id": "27084135",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1469,7 +1469,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "da209e6b",
+   "id": "8940a52a",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1486,7 +1486,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "11ed11dd",
+   "id": "b037bd83",
    "metadata": {},
    "source": [
     " In summary, Step 1 allows us to replace the matrix power with an\n",
@@ -1497,7 +1497,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "08cf8e3b",
+   "id": "8844cee3",
    "metadata": {},
    "source": [
     "### Step 2: Diagonal Case"
@@ -1505,7 +1505,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "a0f066f9",
+   "id": "ed0a2f16",
    "metadata": {},
    "source": [
     "The next step to assume special *structure* on the matrix\n",
@@ -1516,7 +1516,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "205bf2b3",
+   "id": "c73b75fc",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1532,7 +1532,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "646f4c33",
+   "id": "93b5a679",
    "metadata": {},
    "source": [
     "Now imagine $A=\\boldsymbol{\\Lambda}$ for a diagonal $\\boldsymbol{\\Lambda}$. Substituting in the discretization\n",
@@ -1541,7 +1541,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "19371ffd",
+   "id": "0279d1ac",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1554,7 +1554,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "52040573",
+   "id": "f6df16d9",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1568,7 +1568,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "3a7ec302",
+   "id": "6f9ca0e7",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1581,7 +1581,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "535fa008",
+   "id": "433e5600",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1595,7 +1595,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "bf9838b0",
+   "id": "62d20c69",
    "metadata": {},
    "source": [
     "### Step 3: Diagonal Plus Low-Rank"
@@ -1603,7 +1603,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "e6102ef2",
+   "id": "1d0e7b2b",
    "metadata": {},
    "source": [
     "The final step is to relax the diagonal assumption. In addition to\n",
@@ -1613,60 +1613,61 @@
   },
   {
    "cell_type": "markdown",
-   "id": "10f4905b",
+   "id": "f3e2892c",
    "metadata": {},
    "source": [
     "$$\n",
-    "\\boldsymbol{A} = \\boldsymbol{\\Lambda} + \\boldsymbol{p}  \\boldsymbol{q}^*\n",
+    "\\boldsymbol{A} = \\boldsymbol{\\Lambda} - \\boldsymbol{p}  \\boldsymbol{q}^*\n",
     "$$"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "eab70027",
+   "id": "2229ff7c",
    "metadata": {},
    "source": [
     "The [Woodbury identity](https://en.wikipedia.org/wiki/Woodbury_matrix_identity)\n",
     "tells us that the inverse of a diagonal plus rank-1 term is equal to the\n",
-    "inverse of the diagonal plus a rank-1 term. Or in math:"
+    "inverse of the diagonal plus a rank-1 term. We write it out here\n",
+    "adding the low-rank term."
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "de77a8f8",
+   "id": "c6268cab",
    "metadata": {},
    "source": [
     "$$ \\begin{aligned}\n",
-    "(\\boldsymbol{\\Lambda} + \\boldsymbol{p}  \\boldsymbol{q}^*)^{-1} &= \\boldsymbol{\\Lambda}^{-1} - \\boldsymbol{\\Lambda}^{-1} \\boldsymbol{p} (1 + \\boldsymbol{q}^* \\boldsymbol{p})^{-1} \\boldsymbol{q}^* \\boldsymbol{\\Lambda}^{-1}\n",
+    "(\\boldsymbol{\\Lambda} + \\boldsymbol{p}  \\boldsymbol{q}^*)^{-1} &= \\boldsymbol{\\Lambda}^{-1} - \\boldsymbol{\\Lambda}^{-1} \\boldsymbol{p} (1 + \\boldsymbol{q}^* \\boldsymbol{\\Lambda}^{-1} \\boldsymbol{p})^{-1} \\boldsymbol{q}^* \\boldsymbol{\\Lambda}^{-1}\n",
     " \\end{aligned}\n",
     "$$"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "4ff3b9d5",
+   "id": "9e4e5c01",
    "metadata": {},
    "source": [
-    " There is a bunch of algebra not shown. But it mostly consists of substituting this component in for A,\n",
+    " There is a bunch of algebra in the appendix. It mostly consists of substituting this component in for A,\n",
     " applying the Woodbury identity and distributing terms. We end up with 4 terms that\n",
     " all look like Step 2 above:"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "9891ab1d",
+   "id": "9824e302",
    "metadata": {
     "lines_to_next_cell": 2
    },
    "source": [
     "$$ \\begin{aligned}\n",
-    "\\boldsymbol{\\hat{K}}_{DPLR}(z) & = c(z) [k_{z, \\Lambda}(\\boldsymbol{\\tilde{C}}, \\boldsymbol{\\boldsymbol{B}}) - k_{z, \\Lambda}(\\boldsymbol{\\tilde{C}}, \\boldsymbol{\\boldsymbol{p}}) (1 - k_{z, \\Lambda}(\\boldsymbol{q^*}, \\boldsymbol{\\boldsymbol{p}}) )^{-1} k_{z, \\Lambda}(\\boldsymbol{q^*}, \\boldsymbol{\\boldsymbol{B}}) ]\n",
+    "\\boldsymbol{\\hat{K}}_{DPLR}(z) & = c(z) [k_{z, \\Lambda}(\\boldsymbol{\\tilde{C}}, \\boldsymbol{\\boldsymbol{B}}) - k_{z, \\Lambda}(\\boldsymbol{\\tilde{C}}, \\boldsymbol{\\boldsymbol{p}}) (1 + k_{z, \\Lambda}(\\boldsymbol{q^*}, \\boldsymbol{\\boldsymbol{p}}) )^{-1} k_{z, \\Lambda}(\\boldsymbol{q^*}, \\boldsymbol{\\boldsymbol{B}}) ]\n",
     " \\end{aligned}$$"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "2ce1caae",
+   "id": "67425090",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1677,7 +1678,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "ca3690d4",
+   "id": "23300d0a",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1709,7 +1710,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "0d790ad9",
+   "id": "c2b4c61d",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1721,7 +1722,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "3c7f8288",
+   "id": "da7998df",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1739,7 +1740,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "a7f4c2bc",
+   "id": "389cd7f2",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1750,7 +1751,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "16de489a",
+   "id": "3151cba3",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1780,7 +1781,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "4d7ff545",
+   "id": "e9567bac",
    "metadata": {},
    "source": [
     "### Diagonal Plus Low-Rank RNN."
@@ -1788,7 +1789,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "e51d7b35",
+   "id": "c94d59bd",
    "metadata": {},
    "source": [
     "A secondary benefit of the DPLR factorization is that it allows\n",
@@ -1799,7 +1800,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "255ce4d0",
+   "id": "abda597e",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1857,7 +1858,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1f6f4056",
+   "id": "c6eefbc7",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1888,7 +1889,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "db8a341d",
+   "id": "0305b9d9",
    "metadata": {},
    "source": [
     "### Turning HiPPO to DPLR"
@@ -1896,7 +1897,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "2623c4ba",
+   "id": "9cb6f872",
    "metadata": {},
    "source": [
     "This approach applies to DPLR matrices, but remember we would like it to also apply to the HiPPO matrix.\n",
@@ -1907,7 +1908,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "71d500fb",
+   "id": "b52dda16",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1921,7 +1922,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "993ac31d",
+   "id": "e863f586",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1935,7 +1936,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "aaf4d7e2",
+   "id": "6bda1bad",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1958,7 +1959,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "3ffb26e4",
+   "id": "9fbbaa07",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1969,7 +1970,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "a900d8dd",
+   "id": "37c1d822",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1988,7 +1989,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "9bdfd5cc",
+   "id": "3df2750a",
    "metadata": {},
    "source": [
     "### Final Check"
@@ -1996,7 +1997,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "a45ba651",
+   "id": "bf5d6ace",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2007,7 +2008,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "ca9ebe9e",
+   "id": "85bf95a6",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2047,7 +2048,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "56971e55",
+   "id": "53bef8a6",
    "metadata": {},
    "source": [
     "## Part 3: S4 in Practice"
@@ -2055,7 +2056,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "6dd06779",
+   "id": "cc725bec",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2066,7 +2067,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "c84e2a3b",
+   "id": "c9c8f37a",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2079,7 +2080,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "6d83bd58",
+   "id": "20b2d76c",
    "metadata": {},
    "source": [
     "### S4 CNN / RNN Layer"
@@ -2087,7 +2088,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "34d959c1",
+   "id": "ba56d691",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2103,7 +2104,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "3c1d296e",
+   "id": "91e1a1e3",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2179,7 +2180,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "84cd2af9",
+   "id": "298e0533",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2188,7 +2189,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "aca8c8a9",
+   "id": "c06bdacc",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2199,7 +2200,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "917a73b7",
+   "id": "8c22d156",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2216,7 +2217,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "b448af7a",
+   "id": "9633eeab",
    "metadata": {},
    "source": [
     "### Sampling and Caching"
@@ -2224,7 +2225,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "beb8fd1f",
+   "id": "2c2496e3",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2237,7 +2238,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "9c74a295",
+   "id": "cf072dc6",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2266,7 +2267,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "f585195e",
+   "id": "ee5a5446",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2279,7 +2280,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "29e99e40",
+   "id": "441026bc",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2305,7 +2306,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "a1bcea89",
+   "id": "c6f78f28",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2316,7 +2317,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "a16bbd96",
+   "id": "94f60daa",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2332,7 +2333,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "8cfd09fc",
+   "id": "69c04583",
    "metadata": {},
    "source": [
     "### Experiments: MNIST"
@@ -2340,7 +2341,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "b45143b2",
+   "id": "b05c25aa",
    "metadata": {},
    "source": [
     "Now that we have the model, we can try it out on some MNIST experiments.\n",
@@ -2350,7 +2351,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "0f98f93e",
+   "id": "7968720e",
    "metadata": {},
    "source": [
     "The first experiments we ran were on MNIST classification. While\n",
@@ -2361,7 +2362,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "1f40259b",
+   "id": "690b47ac",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2379,7 +2380,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "5ecad43c",
+   "id": "15123e92",
    "metadata": {},
    "source": [
     "<img src=\"images/sample.png\" width=\"100%\">"
@@ -2387,7 +2388,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "658bc4cf",
+   "id": "6d598b43",
    "metadata": {},
    "source": [
     "We can also do prefix-samples – given the first 300 pixels, try to complete the image.\n",
@@ -2396,7 +2397,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "d9b30f6b",
+   "id": "a4f6e892",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2414,7 +2415,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "fac75c61",
+   "id": "9eaea944",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2476,7 +2477,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "c7f20917",
+   "id": "46fa8311",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2486,7 +2487,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "88b0f9b1",
+   "id": "e0d6dea2",
    "metadata": {},
    "source": [
     "Next we tried training a model to generate drawings. For this we\n",
@@ -2502,7 +2503,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "a020aa5f",
+   "id": "e89b350e",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2517,7 +2518,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "6c8cba3b",
+   "id": "e2618262",
    "metadata": {},
    "source": [
     "### Experiments: Spoken Digits"
@@ -2525,7 +2526,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "9a34080e",
+   "id": "b1ab3b6c",
    "metadata": {},
    "source": [
     "Finally we played with modeling sound waves directly. For these, we\n",
@@ -2544,7 +2545,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "fc64d406",
+   "id": "335d33ff",
    "metadata": {},
    "source": [
     "<center>\n",
@@ -2632,7 +2633,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "126f9abc",
+   "id": "f2eb8c29",
    "metadata": {},
    "source": [
     "Our [full code base](https://github.com/srush/annotated-s4/) contains\n",
@@ -2642,7 +2643,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "4ce01f43",
+   "id": "e94fa42e",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -2652,7 +2653,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "39a35a96",
+   "id": "ab10466a",
    "metadata": {},
    "source": [
     "Putting together this post inspired lots of thoughts about future\n",
@@ -2668,20 +2669,23 @@
   },
   {
    "cell_type": "markdown",
-   "id": "fc21d1e0",
+   "id": "6dad78e6",
    "metadata": {},
    "source": [
     "We end by thanking the authors [Albert Gu](http://web.stanford.edu/~albertgu/) and\n",
     "[Karan Goel](https://krandiash.github.io/), who were super helpful in\n",
     "putting this together, and pointing you again to their\n",
     "[paper](https://arxiv.org/abs/2111.00396) and\n",
-    "[codebase](https://github.com/HazyResearch/state-spaces). We're also grateful for Conner Vercellino and\n",
+    "[codebase](https://github.com/HazyResearch/state-spaces).\n",
+    "Thanks to Ankit Gupta, Ekin Akyürek, Qinsheng Zhang, Nathan Yan, and Junxiong Wang for\n",
+    "contributions.\n",
+    "We're also grateful for Conner Vercellino and\n",
     "Laurel Orr for providing helpful feedback on this post."
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "b931f1d3",
+   "id": "fca5ac4e",
    "metadata": {},
    "source": [
     "\n",
diff --git a/s4/dss.py b/s4/dss.py
index 44ec7dd..01faac6 100644
--- a/s4/dss.py
+++ b/s4/dss.py
@@ -16,13 +16,13 @@
 # The months following the release of S4 paper by Gu et. al. were characterized by a wave of excitement around the new
 # model, it's ability to handle extremely long sequences, and generally, what such a departure from Transformer-based
 # architectures could mean. The original authors came out with a
-# [follow-up paper applying S4 to audio generation](https://arxiv.org/abs/2202.09729), and weeks later, a completely
-# [different group applied S4 to long-range movie clip classification](https://arxiv.org/abs/2204.01692).
+# [follow-up paper applying S4 to audio generation](https://arxiv.org/abs/2202.09729), and weeks later, a [completely
+# different group applied S4 to long-range movie clip classification](https://arxiv.org/abs/2204.01692).
 #
-# Yet, it remains hard to parse aspects of the implementation, especially the derivation of the diagonal plus low rank
-# constraint on $\boldsymbol{A}$. Not only was this math fairly complex, but in code, required the use of custom CUDA
-# kernels -- further obfuscating the implementation (and why this blog uses Jax to efficiently compile the relevant
-# operations).
+# Yet, aspects of the implementation remain hard to parse, especially the derivation of the diagonal plus low rank
+# constraint on $\boldsymbol{A}$. Not only is this math fairly complex, but the original PyTorch code base also requires
+# the use of custom CUDA kernels -- further obfuscating the implementation (and why this blog uses Jax to efficiently
+# compile the relevant operations).
 #
 # However, at the end of March 2022 -- an alternative construction for state space models was proposed in [Diagonal
 # State Spaces are as Effective as Structured State Spaces](https://arxiv.org/abs/2203.14343). This short paper derives
@@ -46,21 +46,21 @@
 # ## Table of Contents
 # <nav id="TOC">
 # <ul>
-#   <li>Step 1. The Problem with the SSM Convolutional Kernel
+#   <li>I. The Problem with the SSM Kernel
 #       <ul>
 #           <li>Rethinking Discretization</li>
 #           <li>Rewriting the SSM Kernel</li>
 #           <li>Diagonalization & Efficient Matrix Powers</li>
 #       <ul>
 #   </li>
-#   <li>Step 2. Deriving the Diagonal State Space Model
+#   <li>II. Deriving Diagonal State Spaces
 #       <ul>
-#           <li>Proving Proposition 1 from the DSS Paper</li>
-#           <li>Secret Sauce 1: Handling the Complex Softmax</li>
-#           <li>Secret Sauce 2: Initializing with the HiPPO Matrix</li>
+#           <li>The Annotated Proposition 1</li>
+#           <li>Secret Sauce – Part 1: Handling the Complex Softmax</li>
+#           <li>Secret Sauce – Part 2: Initializing with HiPPO</li>
 #       </ul>
 #   </li>
-#   <li>Step 3. Putting the DSS Layer Together
+#   <li>III. Putting the DSS Layer Together
 #       <ul>
 #           <li>The DSS Block</li>
 #           <li>Limitations</li>
@@ -69,56 +69,224 @@
 # </ul>
 
 
-# ## Step 1. The Problem with the SSM Convolutional Kernel
+# ## I. The Problem with the SSM Kernel
 #
-# We're going to start by taking a step back – back to the original State Space model formulation itself.
+# We're going to start by taking a step back – back to the original State Space Model (SSM) itself. The original
+# SSM is defined over *continuous* time inputs, as follows (from the original S4 paper)
 #
+# **[TODO: Link to original post]**
+
+# > The [state space model](https://en.wikipedia.org/wiki/State-space_representation) is defined by this simple equation.
+# > It maps a 1-D input signal $u(t)$ to an $N$-D latent state $x(t)$
+# > before projecting to a 1-D output signal $y(t)$.
+# $$
+#   \begin{aligned}
+#     x'(t) &= \boldsymbol{A}x(t) + \boldsymbol{B}u(t) \\
+#     y(t) &= \boldsymbol{C}x(t) + \boldsymbol{D}u(t)
+#   \end{aligned}
+# $$
+# > Our goal is to simply use the SSM as a black-box representation in a deep
+# > sequence model, where $\boldsymbol{A}, \boldsymbol{B}, \boldsymbol{C}, \boldsymbol{D}$ are
+# > parameters learned by gradient descent...
+# >
+# > An SSM maps an input $u(t)$ to a state representation vector $x(t)$ and an output $y(t)$.
+# > For simplicity, we assume the input and output are one-dimensional, and the state representation
+# > is $N$-dimensional. The first equation defines the change in $x(t)$ over time.
+
+# However, when actually training or running inference with this model, we don't take continuous inputs! Instead,
+# we usually need to *discretize*, turning the above differential equation into a discrete sequence-to-sequence
+# map! The key question: how do we discretize?
+
 # ### Rethinking Discretization
-# - Sketch SSM as an ODE
-# - Motivate need for discretization... how do we discretize? Bilinear method is what S4 uses, but you can also just
-# *solve the ODE directly* (yields $\bar{\boldsymbol{A}} = e^{\boldsymbol{A}\Delta}$).
 #
+# One way to discretize the state space model is with the [bilinear method](https://en.wikipedia.org/wiki/Bilinear_transform)
+# as described in the original S4 work. This has certain advantages such as **[TODO: advantages of bilinear?]**.
+#
+# However, a simpler approach to discretizing the SSM is by directly writing each equation in terms of a fixed
+# sampling interval $\Delta$, and a discrete index $k$. Doing so results in the following simple system of equations:
+
+# $$
+#   \begin{aligned}
+#     x((k + 1) \Delta) &= \boldsymbol{\overline{A}}x(k \Delta) + \boldsymbol{\overline{B}} u(k \Delta) \\
+#     y(k \Delta) &= \boldsymbol{C}x(k \Delta) + \boldsymbol{D}u(k \Delta)
+#   \end{aligned}
+# $$
+
+# Solving this system is a simple matter of solving the original ODE and plugging in the results. For solving the
+# original SSM equation, [here's a nice reference](https://faculty.washington.edu/chx/teaching/me547/1-7_ss_sol.pdf).
+# Then, [this resource provides a nice derivation of the discrete time SSM components](https://users.wpi.edu/~zli11/teaching/rbe595_2017/LectureSlide_PDF/discretization.pdf).
+
+# The punchline of the above derivation is that we can rewrite our SSM -- similar to how we rewrote our SSM for the
+# original S4 -- as the following (from the DSS paper):
+
+# > Assuming $A$ is non-singular, for a given sample time $\Delta \in \mathbb{R}_{> 0}$, the discretization of a state space is
+# > defined as a sequence-to-sequence map from $(u_0,\ldots,u_{L-1}) = u \in \mathbb{R}^L$ to $(y_0,\ldots,y_{L-1}) = y \in \mathbb{R}^L$
+# > where,
+
+# $$
+#   \begin{aligned}
+#       &x_k = \bar{A}x_{k-1} + \bar{B}u_k\ \ \ ,\ \ \ y_k = \bar{C}x_k  \\[10pt]
+#       &\bar{A} = e^{A\Delta}\ \;,\ \bar{B} = (\bar{A} - I)A^{-1}B\ ,\ \;\bar{C} = C\ .
+#   \end{aligned}
+# $$
+
+# Why is this better than the original parameterization of $\boldsymbol{\overline{A}}$ from the original S4 work? In
+# the next section, we'll see how we can derive the SSM kernel using this parameterization with simpler restrictions on
+# the structure of $\boldsymbol{\overline{A}}$, allowing for a *simple, straightforward* implementation without losing
+# much in the way of performance!
+
 # ### Rewriting the SSM Kernel
-# - Pull in equation from Part 1 for the kernel.
-# - Note repeated multiplication by A (matrix power)
-# - Time complexity of matrix power sucks!
-# - Unless... *diagonalization*
 #
+# **[TODO figure out cross-page links]**
+#
+# Part 1 of this post showed that the above discretized state-space model can be treated as a *sequence-to-sequence* map,
+# behaving a lot like an RNN with a transition matrix given by $\boldsymbol{\overline{A}}$:
+
+# $$
+# \begin{aligned}
+#   x_{k} &= \boldsymbol{\overline{A}} x_{k-1} + \boldsymbol{\overline{B}} u_k\\
+#   y_k &= \boldsymbol{\overline{C}} x_k \\
+# \end{aligned}
+# $$
+
+# We then showed how we can turn the above recurrence into a *convolution* given the repetitive structure! We end up with
+# the kernel:
+
+# $$
+# \begin{aligned}
+#     y_k &= \boldsymbol{\overline{C}} \boldsymbol{\overline{A}}^k \boldsymbol{\overline{B}} u_0 + \boldsymbol{\overline{C}} \boldsymbol{\overline{A}}^{k-1} \boldsymbol{\overline{B}} u_1 + \dots + \boldsymbol{\overline{C}} \boldsymbol{\overline{A}} \boldsymbol{\overline{B}} u_{k-1} + \boldsymbol{\overline{C}}\boldsymbol{\overline{B}} u_k
+#     \\
+#     y &= \boldsymbol{\overline{K}} \ast u
+# \end{aligned}
+# $$
+
+# $$
+# \begin{aligned}
+#   \boldsymbol{\overline{K}} \in \mathbb{R}^L  = (\boldsymbol{\overline{C}}\boldsymbol{\overline{B}}, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}\boldsymbol{\overline{B}}, \dots, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}^{L-1}\boldsymbol{\overline{B}})
+# \end{aligned}
+# $$
+
+# **Problem**: Unfortunately, just like with the original S4 paper, computing this kernel $\boldsymbol{\overline{K}}$ is
+# **prohibitively expensive** (successive matrix powers of $\boldsymbol{\overline{A}}$ which blows up assuming
+# $\mathcal{O}(d^3)$ matrix multiplication, where $d$ is the dimensionality of $\boldsymbol{\overline{A}}$). Getting SSMs
+# to scale requires finding an *alternative path* to computing this kernel – one that is both efficient and that doesn't
+# badly restrict the expressivity of $\boldsymbol{\overline{A}}$. So how can we address this?
+
+
 # ### Diagonalization & Efficient Matrix Powers
-# - If we can find a way to write $\bar{\boldsymbol{A}}$ as a diagonal matrix, the matrix power defining the kernel
-# becomes *trivial*.
-# - How?
 
+# This is the key "fork in the road" between the original S4 paper, and this post's Diagonal State Spaces paper. Notably,
+# where the S4 paper is rooted in HiPPO theory and steps through some complex math (and complex code!) to make computing
+# the kernel $\boldsymbol{\overline{K}}$ efficient, the DSS is going to make a single assumption: let
+# $\boldsymbol{\overline{A}}$ be *diagonalizable*.
+#
+# Doing so turns an expensive $\mathcal{O}(d^3)$ matrix multiply into a near-linear time operation, one that is
+# conducive to performing matrix powers super fast! How we can write and initialize $\boldsymbol{\overline{A}}$ in
+# this way, and produce an update rule that ensures stable learning, is the focus of the next section.
+
+# ## II. Deriving Diagonal State Spaces
+#
+# As a brief sketch, the DSS paper shows  that we simply need to break $\boldsymbol{\overline{A}}$ into a collection
+# of diagonal terms $\Lambda = \lambda_1 \ldots \lambda_n$; then with some straightforward algebra, we can compute an
+# efficient expression for our kernel $\boldsymbol{\overline{K}}$.
+#
+# We present this derivation (effectively Proposition 1 of the DSS paper) with light annotation below.
 
-# ## Step 2. Deriving the Diagonal State Space Model
-# Given the benefits of diagonalization, how do we construct a diagonal $\bar{\boldsymbol{A}}$ that leads to efficient
-# computation of the SSM kernel $\bar{\boldsymbol{K}}$?
+# ### The Annotated Proposition 1
 #
-# ### Proposition 1 from the DSS Paper
-# - Step through original proposition
-# - Step through proof in Appendix (simplified)
+# Recall our expanded kernel $\boldsymbol{\overline{K}}$:
+
+# $$
+# \begin{aligned}
+#   \boldsymbol{\overline{K}} \in \mathbb{R}^L  = (\boldsymbol{\overline{C}}\boldsymbol{\overline{B}}, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}\boldsymbol{\overline{B}}, \dots, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}^{L-1}\boldsymbol{\overline{B}})
+# \end{aligned}
+# $$
+
+# Proposition 1 defines an efficient expression for computing $\boldsymbol{\overline{K}}$:
 #
-# ### Secret Sauce 1: Complex Softmax
-# - Part of the reason this initialization works is because we're initializing our diagonal matrix $\Lambda$ in Complex
-# space.
-# - This means that our typical softmax stops behaving well... so we need to fix it!
+# > **Proposition:** Let $\boldsymbol{\overline{K}} \in \mathbb{R}^{1\times L}$ be the kernel for a state space
+# > $(\boldsymbol{A}, \boldsymbol{B}, \boldsymbol{C})$ and sample time $\Delta > 0$.
 #
-# ### Secret Sauce 2: Initializing with the HiPPO Matrix
-# - Stability is still tricky
-# - HiPPO theory is still necessary (at the beginning) for initializing our weights.
+# > If $\boldsymbol{A} \in \mathbb{C}^{N \times N}$ is diagonalizable over $\mathbb{C}$ with eigenvalues
+# > $\lambda_1,\ldots,\lambda_N$ such that, $\forall i$, $\lambda_i \neq 0$ and $e^{L\lambda_i\Delta} \neq 1$,
+# > then $\exists w \in \mathbb{C}^{1 \times N}$ such that:$\\[2pt]$
+# $$
+# \begin{aligned}
+#   \bar{K} = w \cdot \Lambda^{-1} \cdot \mathrm{row}{\text -}\mathrm{softmax}(P_{N\times L})
+# \end{aligned}
+# $$
+# > where $P_{i,k} = \lambda_i k\Delta$, and $\Lambda$ is the diagonal matrix of $\lambda_1,\ldots,\lambda_N$.
 
+# Plainly, there are three parts to this proposition:
+#   1. Given we can diagonalize $\boldsymbol{A}$, we'll store its eigenvalues $\lambda_1 \ldots
+#   \lambda_N$ on the diagonal of $\Lambda$.
+#   2. The learned term $w$ is going to store some aggregate information of our other state space matrices
+#   $\boldsymbol{\overline{B}}$, $\boldsymbol{\overline{C}}$. We'll show how this happens in the proof below.
+#   3. Finally, given this particular structure of $\boldsymbol{\overline{A}}$, we can write the full kernel
+#   $\boldsymbol{\overline{K}}$ as the product of the inverse of $\Lambda$, this aggregate term $w$, as well as a
+#   separate softmax term $P$ that encodes some sequence positional information, blended with our diagonal terms
+#   $\Lambda$.
 
-# ## Step 3. Putting the DSS Layer Together
-# Mostly just define the DSS Layer and DSSInit function, as well as the final test.
+# Put another way – working out the math for the DSS formulation of the state space model **lets us write the kernel
+# as a simple product of some diagonal terms, a learned vector $w$, and an easy-to-formulate position matrix $P$**.
 #
-# ### Limitations
-# - RNN Autoregressive Usage (still being worked out)
-# - Still not as performant as S4 in certain settings (expressivity)
-# - Still tied to HiPPO theory
+# Let's derive this!
 
+# > **Proof:** Let $A$ be diagonalizable over $\mathbb{C}$ as $A = V \Lambda V^{-1}$ with eigenvalues
+# > $\lambda_1,\ldots, \lambda_N \in \mathbb{C}$. From the above expression of the SSM kernel we have:
+# $$
+# \begin{aligned}
+#   \boldsymbol{\overline{K}} &= (\boldsymbol{\overline{C}}\boldsymbol{\overline{B}},
+#   \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}\boldsymbol{\overline{B}},\ldots,
+#   \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}^{L-1}\boldsymbol{\overline{B}})
+# \end{aligned}
+# $$
+# > where,
+# $$
+# \begin{aligned}
+#   \boldsymbol{\overline{K}}_k &= \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}^k\boldsymbol{\overline{B}}
+#   = C e^{A\cdot k\Delta} (e^{A\Delta} - I)A^{-1}B \\
+#   &= (CV) e^{\Lambda k\Delta}(e^{\Lambda\Delta} - I)\Lambda^{-1} (V^{-1}B)
+# \end{aligned}
+# $$
+# > For $CV \in \mathbb{C}^{1 \times N}$ and $V^{-1}B \in \mathbb{C}^{N \times 1}$ let
+# > $(CV)^\top * (V^{-1}B) = \widetilde{w} \in \mathbb{C}^N$ be the element-wise product of $CV$ and $V^{-1}B$. Then,
+# $$
+# \begin{aligned}
+#   \boldsymbol{\overline{K}}_k &= \sum_{i=1}^N {e^{\lambda_i k\Delta}(e^{\lambda_i\Delta} - 1) \over \lambda_i} \cdot \widetilde{w}_i \\[2pt]
+#   &= \sum_{i=1}^N {e^{\lambda_i k\Delta}(e^{\lambda_i\Delta} - 1) \over \lambda_i(e^{L\lambda_i\Delta} - 1)} \cdot (\widetilde{w}_i \cdot (e^{L\lambda_i\Delta} - 1)) \\[2pt]
+#   &= \sum_{i=1}^N (\widetilde{w}_i \cdot (e^{L\lambda_i\Delta} - 1))\cdot \frac{1}{\lambda_i} \cdot {e^{\lambda_i k\Delta} \over \sum_{r=0}^{L-1} e^{r\lambda_i\Delta}}
+# \end{aligned}
+# $$
+# > where the last equality follows from $(z^L-1) = (z-1)(z^0+\ldots+z^{L-1})$ and using $z^L \neq 1$.
+# >
+# > Let $P \in \mathbb{C}^{N \times L}$ be the matrix $P_{i,k} = \lambda_i \cdot k\Delta$ and
+# > $S = \mathrm{row}{\text -}\mathrm{softmax}(P)$ denote the matrix obtained after applying $\mathrm{softmax}$ on
+# > the rows of $P$, i.e.
+# $$
+# \begin{aligned}
+#   S_{i,k} = {e^{\lambda_i k\Delta} \over \sum_{r=0}^{L-1} e^{r\lambda_i\Delta}}
+# \end{aligned}
+# $$
+# >
+# > Let $w \in \mathbb{C}^N$ be defined as $$w_i = \widetilde{w}_i \cdot (e^{L\lambda_i\Delta} - 1).$$
+# >
+# > Then, plugging in each of the above definitions into the expression for $\boldsymbol{\overline{K}}_k$ above, we get:
+# $$
+# \begin{aligned}
+#   \boldsymbol{\overline{K}}_k &= \sum_{i=1}^N (\widetilde{w}_i \cdot (e^{L\lambda_i\Delta} - 1))\cdot \frac{1}{\lambda_i} \cdot {e^{\lambda_i k\Delta} \over \sum_{r=0}^{L-1} e^{r\lambda_i\Delta}} \\[2pt]
+#   &= \sum_{i=1}^N w_i \cdot \frac{1}{\lambda_i} \cdot S_{i, k} \\[2pt]
+#   &= w \cdot \Lambda^{-1} \cdot \mathrm{row}{\text -}\mathrm{softmax}(P_{N\times L})
+# \end{aligned}
+# $$
+# > completing the proof.
 
+# Computing the kernel in this way (collapsing the $\boldsymbol{\overline{B}}$ and $\boldsymbol{\overline{C}}$ terms into
+# $w$) has advantages for the complexity of computing the kernel and running the discrete convolution. Namely,
+# > For batch size $B$, sequence length $L$ and hidden size $H$, the DSS layer requires $O(NHL)$ time and space to
+# > compute the kernels, $O(BHL\log(L))$ time for the discrete convolution and $O(BH^2L)$ time for the output projection.
 
-## TODO -- Need to weave these parts through the sections above...
+# More importantly, implementing the DSS kernel is *very* straightforward:
 
 def complex_softmax(x, eps=1e-7):
     def reciprocal(x):
@@ -128,6 +296,7 @@ def reciprocal(x):
     e = np.exp(x2)
     return e * reciprocal(np.sum(e))
 
+
 def dss_kernel(W, Lambda, L, step):
     P = (step * Lambda)[:, None] * np.arange(L)
     S = jax.vmap(complex_softmax)(P)
@@ -137,13 +306,127 @@ def dss_kernel(W, Lambda, L, step):
 def dss_ssm(W, Lambda, L, step):
     N = Lambda.shape[0]
     Abar = np.diag(np.exp(Lambda * step))
-    b = jax.vmap(lambda l:
-                 1 / (l * (np.exp(l * np.arange(L) * step)).sum()))
+    b = jax.vmap(lambda l: 1 / (l * (np.exp(l * np.arange(L) * step)).sum()))
     Bbar = b(Lambda).reshape(N, 1)
     Cbar = W.reshape(1, N)
     return Abar, Bbar, Cbar
 
 
+def test_conversion(N=8, L=16):
+    """Test the equivalence of the DSS kernel with the generic SSM kernel."""
+    step = 1.0 / L
+    W = lecun_normal()(rng, (1, N, 2))
+    W = W[..., 0] + 1j * W[..., 1]
+    _, Lambda, _, _, _ = s4.make_NPLR_HiPPO(2 * N)
+    Lambda = Lambda[np.nonzero(Lambda.imag > 0, size=N)]
+
+    K = dss_kernel(W, Lambda, L, step)
+    ssm = dss_ssm(W, Lambda, L, step)
+
+    # Apply CNN
+    u = np.arange(L) * 1.0
+    y1 = s4.non_circular_convolution(u, K.real)
+
+    # Apply RNN
+    _, y2 = s4.scan_SSM(
+        *ssm, u[:, np.newaxis], np.zeros((N,)).astype(np.complex64)
+    )
+    assert np.allclose(y1, y2.reshape(-1).real, atol=1e-4, rtol=1e-4)
+
+
+test_conversion()
+
+
+# ### Secret Sauce – Part 1: Handling the Complex Softmax
+#
+# While the implementation above is pretty concise, there are some subtle gotchas that need to be addressed. First is the
+# computation of the special $\mathrm{row}{\text -}\mathrm{softmax}()$ function.
+#
+# Note that with the given derivation, many of the state space matrices are defined over *complex* space! The
+# traditional softmax function we've come to know and love has some problems operating in complex space – for example,
+# consider the complex $\mathrm{softmax}(0, \pi i)$; taking the naive softmax results in division by zero,
+# as the denominator is $e^{0} + e^{\pi i} = 1 - 1 = 0$!
+#
+# To correct for this, the DSS paper defines a slight correction to the softmax function, to ensure stability:
+#
+# > As noted above, $\mathrm{softmax}$ can have singularities over $\mathbb{C}$. To address this issue, we use a simple
+# > correction to make it well-defined over the entire domain:
+# >
+# > $\mathrm{softmax}$:
+# >
+# > &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Given $(x_0,\ldots,x_{L-1}) = x \in \mathbb{C}^L$,
+# > let $\mathrm{softmax}(x) \in \mathbb{C}^L$ be defined as:
+# >
+# > &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;$(\mathrm{softmax}(x))_k = e^{x_k} (e^{x_0} + \ldots +
+# e^{x_{L-1}})^{-1}.$
+# >
+# > &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Note that for any $c \in \mathbb{C}$, $\mathrm{softmax}(x_0,\ldots,x_{L-1})$ $=$
+#   $\mathrm{softmax}(x_0-c,\ldots,x_{L-1}-c)$.
+# >
+# > &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unlike over $\mathbb{R}$, $\mathrm{softmax}$ can have singularities over $\mathbb{C}$ as sum of
+#   exponentials can vanish.
+# >
+# > &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;E.g. $e^{0} + e^{i\pi} = 0$ and hence $\mathrm{softmax}(0,i\pi)$ is not defined.
+# >
+# > $\mathrm{max}$:
+# >
+# > &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Given $(x_0,\ldots,x_{L-1}) = x \in \mathbb{C}^L$, let
+#   $\mathrm{max}(x)$ be the $x_i$ with the maximum real part,
+# >
+# > &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;i.e. $x_{\mathrm{argmax}_i \mathrm{Re}(x_i)}$.
+# >
+# > $\mathrm{reciprocal}_\epsilon$:
+# >
+# > &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Given $x \in \mathbb{C}$ and $\epsilon \in \mathbb{R}_{> 0}$, let
+#   $\mathrm{reciprocal}_\epsilon(x) = \frac{\overline{x}}{x\cdot \overline{x} + \epsilon}$ where $\overline{x}$ is
+#   the complex conjugate of $x$.
+# >
+# > &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;The denominator is always in $\mathbb{R}_{\geq \epsilon}$ and
+#   $|\mathrm{reciprocal}_\epsilon| \leq (2\sqrt{\epsilon})^{-1}$.
+# >
+# > $\mathrm{softmax}_\epsilon$:
+# >
+# > &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Given $(x_0,\ldots,x_{L-1}) = x \in \mathbb{C}^L$ let
+#   $m = \mathrm{max}(x)$ and $\widetilde{x}_i = x_i - m$. Note that $|e^{\widetilde{x}_i}| \leq 1$.
+# >
+# > &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Given $\epsilon \in \mathbb{R}_{> 0}$, let $\mathrm{softmax}_\epsilon(x)
+#   \in \mathbb{C}^L$ be:
+# >
+# > &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;$$(\mathrm{softmax}_\epsilon(x))_k = e^{\widetilde{x}_k}
+#   \cdot\mathrm{reciprocal}_\epsilon\left(\sum_{r=0}^{L-1}  e^{\widetilde{x}_r}\right)$$
+# >
+# > &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;$\mathrm{softmax}_\epsilon$ is always bounded and differentiable.
+# >
+# > In the DSS implementation, we use $\mathrm{softmax}_\epsilon$ with $\epsilon = 10^{-7}$.
+
+# As a punchline – to stabilize the softmax to work over $\mathbb{C}$, we write a new $\mathrm{softmax}_\epsilon$
+# where we first adjust each element by subtracting out the max real component, then reformulate the reciprocal
+# (denominator) in the traditional computation to always output a real number (by multiplying by the complex conjugate).
+#
+# ### Secret Sauce – Part 2: Initializing with HiPPO
+#
+# One other sticking point you might notice in the above code is in *how* we initialize the diagonal values $\Lambda$. In
+# order to ensure stability during training, we *must* initialize our $\Lambda$ subject to the HiPPO initialization from
+# the S4 paper and prior work.
+#
+# The reasoning for this is mostly due to stability; repeated matrix powers of $\boldsymbol{\overline{A}}$ still need to
+# be of low condition number such that the kernel doesn't explode. HiPPO theory gives us a solid grounding and a
+# reasonable initialization to use, at minimal cost (it's a fixed initialization to use at the beginning of training!).
+
+
+# ## III. Putting the DSS Layer Together
+#
+# Now that we've defined all the requisite pieces – the simplified expression for the kernel
+# $\boldsymbol{\overline{K}}$, the corrected $\mathrm{softmax}$ function, and the initialization for $\Lambda$,
+# we're ready to put the DSS layer together!
+
+
+def DSSLayerInit(N):
+    _, Lambda, _, _, _ = s4.make_NPLR_HiPPO(2 * N)
+    Lambda = Lambda[np.nonzero(Lambda.imag > 0, size=N)]
+    return partial(DSSLayer, N=N, Lambda=Lambda)
+
+
 class DSSLayer(nn.Module):
     Lambda: np.DeviceArray
     N: int
@@ -161,8 +444,7 @@ def setup(self):
         if not self.decode:
             self.K = dss_kernel(self.W, self.Lambda, self.l_max, self.step)
         else:
-            # FLAX code to ensure that we only compute discrete once
-            # during decoding.
+            # FLAX code to ensure that we only compute discrete once during decoding.
             def init_discrete():
                 return dss_ssm(self.W, self.Lambda, self.l_max, self.step)
             ssm_var = self.variable("prime", "ssm", init_discrete)
@@ -188,29 +470,12 @@ def __call__(self, u):
 DSSLayer = s4.cloneLayer(DSSLayer)
 
 
-def DSSLayerInit(N):
-    _, Lambda, _, _, _ = s4.make_NPLR_HiPPO(2 * N)
-    Lambda = Lambda[np.nonzero(Lambda.imag > 0, size=N)]
-    return partial(DSSLayer, N=N, Lambda=Lambda)
-
-
-def test_conversion(N=8, L=16):
-    """Maybe this a general test?"""
-    step = 1.0 / L
-    W = lecun_normal()(rng, (1, N, 2))
-    W = W[..., 0] + 1j * W[..., 1]
-    _, Lambda, _, _, _ = s4.make_NPLR_HiPPO(2 * N)
-    Lambda = Lambda[np.nonzero(Lambda.imag > 0, size=N)]
-
-    K = dss_kernel(W, Lambda, L, step)
-    ssm = dss_ssm(W, Lambda, L, step)
-
-    # Apply CNN
-    u = np.arange(L) * 1.0
-    y1 = s4.non_circular_convolution(u, K.real)
-
-    # Apply RNN
-    _, y2 = s4.scan_SSM(
-        *ssm, u[:, np.newaxis], np.zeros((N,)).astype(np.complex64)
-    )
-    assert np.allclose(y1, y2.reshape(-1).real, atol=1e-4, rtol=1e-4)
+# The core of the DSS layer is the same as the traditional SSM layer defined in the first part of the post. We define
+# the initializer, define our learnable weights $w$, then call the kernel code written above as a convolution during
+# training.
+#
+# Finally, during discrete decoding, we use the initial recurrence computed above.
+#
+# ... and that's all folks! DSS is not only more compact (< 100 LoC) than S4, but at its core is a simple idea:
+# diagonalization allows one to efficiently compute matrix powers, and we can use that insight to build a kernel that
+# is almost as expressive and just as performant as S4.