% em-vs-vi.tex
\frame{\frametitle{``Stochastizing'' EM}
Recall from Liang and Klein (2009):
\begin{columns}
\begin{column}{.5\textwidth}
\begin{block}{}
\uncover<1-2>{
\footnotesize
\begin{enumerate}
\item[] $\mu = \text{initialization}$
\item[] for $t = 1, \ldots, T$:
\begin{enumerate}
\item[] for $i \in [n]$:
\begin{enumerate}
\item[] $s_i' = \sum_z p(z | x^{(i)}; \theta(\mu)) \phi(x^{(i)}, z)$
\item[] $\mu'_t \alt<2>{\alert{+=}}{+=} s_i'$
\end{enumerate}
\item[] $\mu = \mu'_t$
\end{enumerate}
\end{enumerate}
}
\end{block}
\end{column}
\begin{column}{.5\textwidth}
\begin{block}{}
\uncover<2>{
\footnotesize
\begin{enumerate}
\item[] $\mu = \text{initialization}$; $k = 0$
\item[] for $t = 1, \ldots, T$:
\begin{enumerate}
\item[] for $i \in [n]$ \alert{randomly}:
\begin{enumerate}
\item[] $s_i' = \sum_z p(z | x^{(i)}; \theta(\mu)) \phi(x^{(i)}, z)$
\item[] $\mu \alert{=} (1 - \rho_k) \mu + \rho_k s_i'$
\item[] $k += 1$
\end{enumerate}
\end{enumerate}
\end{enumerate}
}
\end{block}
\end{column}
\end{columns}
}
\frame{\frametitle{Stochastic Variational Inference: Main Idea}
Main points:
\begin{enumerate}
\item Sample, uniformly at random, a \textit{single} data point $x$
\item Assume $x$ was seen many ($N$) times
\item Form the \gvp{\text{\textit{initial} global}} updates
\item Interpolate these temporary updates with the current parameters
\end{enumerate}
\uncover<2->{
What about optimizing \lvp{\text{local}} variational parameters?
}
}
\frame{\frametitle{``Stochastizing'' Variational Inference}
\begin{columns}
\begin{column}{.5\textwidth}
\begin{block}{}
\uncover<1-2>{
\footnotesize
\begin{enumerate}
\item[] $\gvp{\lambda^{(0)}}= \text{initialization}$
\item[]
\item[] repeat:
\begin{enumerate}
\item[] for local $\lvp{\phi_{n,j}}$:
\begin{enumerate}
\item[] $\lvp{\phi_{n,j}},\lvp{\phi_{n,j}^{(t)}} = \mathbb{E}_{q^{(t-1)}}[\eta_{l,j}(x_n,z_{n\backslash j}, \beta)]$
\end{enumerate}
\item[] $\gvp{\lambda^{(t)}} = \mathbb{E}_{q^{(t)}}[\eta_g(z,x)]$
\end{enumerate}
\item[] until ELBO converges
\end{enumerate}
}
\end{block}
\end{column}
\begin{column}{.5\textwidth}
\begin{block}{}
\uncover<2>{
\footnotesize
\begin{enumerate}
\item[] $\gvp{\lambda^{(0)}}= \text{initialization}$
\item[] set step-size schedule, $\rho_t$
\item[] repeat:
\begin{itemize}
\item[] sample data point $x_i$ u.a.r.
\item $\lvp{\phi(\gvp{\lambda})} = \mathbb{E}_{\lambda^{(t-1)}}[\eta_g(x_n,z_{n\backslash j}, \beta)]$
\item $\gvp{\hat\lambda} = \mathbb{E}_{\lvp{\phi}} [ \eta_g(x_i^{(N)},z_i^{(N)})] $
\item $\gvp{\lambda^{(t)}} = (1-\rho_t)\lambda^{(t-1)} + \rho_t \hat\lambda$
\end{itemize}
\item[] forever
\end{enumerate}
}
\end{block}
\end{column}
\end{columns}
}