lower_bound.lyx

#LyX 2.3 created this file. For more info see http://www.lyx.org/
\lyxformat 544
\begin_document
\begin_header
\textclass optbook
\end_header
\begin_body

\begin_layout Section
Lower Bounds
\begin_inset CommandInset label
LatexCommand label
name "sec:Lower-Bounds"

\end_inset


\begin_inset Note Note
status open

\begin_layout Plain Layout
checked calculations
\end_layout

\end_inset


\end_layout

\begin_layout Standard
In this chapter, we have discussed a few cutting plane methods.
 In particular, we showed that 
\begin_inset Formula $O(\min(n,\sqrt{\frac{L}{\mu}})\log(\frac{1}{\epsilon}))$
\end_inset

 gradient computations suffice.
 We conclude this section by showing that 
\begin_inset Formula $\min(n,\sqrt{\frac{L}{\mu}})$
\end_inset

 is in fact optimal among 
\begin_inset Quotes eld
\end_inset

gradient-based
\begin_inset Quotes erd
\end_inset

 methods.
 For convenience the reader may consider 
\begin_inset Formula $\mu=1$
\end_inset

 and 
\begin_inset Formula $L=\kappa$
\end_inset

.
\end_layout

\begin_layout Theorem
Fix any 
\begin_inset Formula $L\geq\mu>0$
\end_inset

.
 Consider the function
\begin_inset Formula 
\begin{equation}
f(x)=-\frac{L-\mu}{4}x_{1}+\frac{L-\mu}{8}\sum_{i=1}^{n-1}(x_{i}-x_{i+1})^{2}+\frac{L-\mu}{8}x_{1}^{2}+\frac{\mu}{2}\sum_{i=1}^{n}x_{i}^{2}+\frac{\sqrt{L\mu}-\mu}{4}x_{n}^{2}\label{eq:worst_function}
\end{equation}

\end_inset


\end_layout

\begin_layout Theorem
which satisfies 
\begin_inset Formula $\mu\cdot I\preceq\nabla^{2}f(x)\preceq L\cdot I$
\end_inset

.
 Assume that our algorithm satisfies 
\begin_inset Formula $x^{(k)}\in\mathrm{span}(x^{(0)},\nabla f(x^{(0)}),\cdots,\nabla f(x^{(k-1)}))$
\end_inset

 with the initial point 
\begin_inset Formula $x^{(0)}=0$
\end_inset

.
 Then, for 
\begin_inset Formula $k<n$
\end_inset

,
\begin_inset Formula 
\[
f(x^{(k)})-\min_{x}f(x)\geq(\frac{\mu}{L})^{3/2}(\frac{\sqrt{L/\mu}-1}{\sqrt{L/\mu}+1})^{2k}(f(x^{(0)})-\min_{x}f(x)).
\]

\end_inset


\end_layout

\begin_layout Proof
First, we check the strong convexity and smoothness.
 Note that 
\begin_inset Formula $f(x)=-\frac{L-\mu}{4}x_{1}+\frac{1}{2}x^{\top}Ax$
\end_inset

 for some matrix 
\begin_inset Formula $A$
\end_inset

.
 Hence, 
\begin_inset Formula $\nabla^{2}f(x)=A$
\end_inset

.
 Therefore, for any vector 
\begin_inset Formula $\theta$
\end_inset

, we have
\begin_inset Formula 
\[
\theta^{\top}\nabla^{2}f(x)\theta=\frac{L-\mu}{4}\sum_{i=1}^{n-1}(\theta_{i}-\theta_{i+1})^{2}+\frac{L-\mu}{4}\theta_{1}^{2}+\mu\sum_{i=1}^{n}\theta_{i}^{2}+\frac{\sqrt{L\mu}-\mu}{2}\theta_{n}^{2}
\]

\end_inset

For the upper bound 
\begin_inset Formula $\nabla^{2}f(x)\preceq L\cdot I$
\end_inset

, we note that 
\begin_inset Formula 
\begin{align*}
\theta^{\top}\nabla^{2}f(x)\theta & \leq\frac{L-\mu}{4}\sum_{i=1}^{n-1}(2\theta_{i}^{2}+2\theta_{i+1}^{2})+\frac{L-\mu}{4}\theta_{1}^{2}+\mu\sum_{i=1}^{n}\theta_{i}^{2}+(\frac{\sqrt{L\mu}-\mu}{2})\theta_{n}^{2}\\
 & \leq(L-\mu)\sum_{i=1}^{n}\theta_{i}^{2}+\mu\sum_{i=1}^{n}\theta_{i}^{2}\\
 & =L\sum_{i=1}^{n}\theta_{i}^{2}.
\end{align*}

\end_inset

For the lower bound 
\begin_inset Formula $\nabla^{2}f(x)\succeq\mu\cdot I$
\end_inset

, we note that
\begin_inset Formula 
\[
\theta^{\top}\nabla^{2}f(x)\theta\geq\mu\sum_{i=1}^{n}\theta_{i}^{2}.
\]

\end_inset


\end_layout

\begin_layout Proof
To lower bound the error, we note that the gradient at 
\begin_inset Formula $x^{(0)}$
\end_inset

 is of the form 
\begin_inset Formula $(?,0,0,0,\cdots)$
\end_inset

 and hence by the assumption 
\begin_inset Formula $x^{(1)}=(?,0,0,0,\cdots)$
\end_inset

 and 
\begin_inset Formula $\nabla f(x^{(1)})=(?,?,0,0,\cdots)$
\end_inset

.
 By induction, only the first 
\begin_inset Formula $k$
\end_inset

 coordinates of 
\begin_inset Formula $x^{(k)}$
\end_inset

 can be non-zero, because each coordinate of the gradient depends only on
 that coordinate and its two neighbours.
\end_layout

\begin_layout Proof
Now, we compute the minimizer of 
\begin_inset Formula $f(x)$
\end_inset

.
 Let 
\begin_inset Formula $x^{*}$
\end_inset

 be the minimizer of 
\begin_inset Formula $f(x)$
\end_inset

.
 By the optimality condition, we have that
\begin_inset Formula 
\begin{align*}
-\frac{L-\mu}{4}+\frac{L-\mu}{4}(x_{1}^{*}-x_{2}^{*})+\frac{L-\mu}{4}x_{1}^{*}+\mu x_{1}^{*} & =0,\\
\frac{L-\mu}{4}(x_{i}^{*}-x_{i-1}^{*})+\frac{L-\mu}{4}(x_{i}^{*}-x_{i+1}^{*})+\mu x_{i}^{*} & =0,\text{ for }i\in\{2,3,\cdots,n-1\},\\
\frac{L-\mu}{4}(x_{n}^{*}-x_{n-1}^{*})+\frac{\sqrt{L\mu}+\mu}{2}x_{n}^{*} & =0.
\end{align*}

\end_inset

By a direct substitution, we have that 
\begin_inset Formula $x_{i}^{*}=(\frac{\sqrt{L/\mu}-1}{\sqrt{L/\mu}+1})^{i}$
\end_inset

 is a solution of the above equation.
 Now, we note that
\begin_inset Formula 
\begin{align*}
\|x^{(k)}-x^{*}\|_{2}^{2} & \geq\sum_{i=k+1}^{n}(\frac{\sqrt{L/\mu}-1}{\sqrt{L/\mu}+1})^{2i}\geq(\frac{\sqrt{L/\mu}-1}{\sqrt{L/\mu}+1})^{2(k+1)}
\end{align*}

\end_inset

and that
\begin_inset Formula 
\[
\|x^{(0)}-x^{*}\|_{2}^{2}\leq\sum_{i=1}^{\infty}(\frac{\sqrt{L/\mu}-1}{\sqrt{L/\mu}+1})^{2i}=\frac{(\sqrt{L/\mu}-1)^{2}}{4\sqrt{L/\mu}}\leq(\frac{\sqrt{L/\mu}-1}{\sqrt{L/\mu}+1})^{2}\frac{\sqrt{L/\mu}+1}{2}.
\]

\end_inset

Now, by smoothness and by the strong convexity of 
\begin_inset Formula $f$
\end_inset

, we have
\begin_inset Formula 
\[
\frac{f(x^{(k)})-f(x^{*})}{f(x^{(0)})-f(x^{*})}\geq\frac{\mu}{L}\cdot\frac{\|x^{(k)}-x^{*}\|_{2}^{2}}{\|x^{(0)}-x^{*}\|_{2}^{2}}\geq(\frac{\mu}{L})^{3/2}(\frac{\sqrt{L/\mu}-1}{\sqrt{L/\mu}+1})^{2k}.
\]

\end_inset


\end_layout

\begin_layout Standard
Note that this 
\begin_inset Quotes eld
\end_inset

worst
\begin_inset Quotes erd
\end_inset

 function naturally appears in many problems.
 So, it is a problem we need to address.
 In some sense, the proof points out a common issue of any algorithm which
 only uses gradient information.
 Given any convex function, we construct the dependence graph 
\begin_inset Formula $G$
\end_inset

 on the set of variables 
\begin_inset Formula $x_{i}$
\end_inset

 by connecting 
\begin_inset Formula $x_{i}$
\end_inset

 to 
\begin_inset Formula $x_{j}$
\end_inset

 if 
\begin_inset Formula $\nabla f(x)_{i}$
\end_inset

 depends on 
\begin_inset Formula $x_{j}$
\end_inset

 or 
\begin_inset Formula $\nabla f(x)_{j}$
\end_inset

 depends on 
\begin_inset Formula $x_{i}$
\end_inset

 (given all other variables).
 Note that the dependence graph 
\begin_inset Formula $G$
\end_inset

 of the worst function is simply an 
\begin_inset Formula $n$
\end_inset

-vertex path, whose diameter is 
\begin_inset Formula $n-1$
\end_inset

.
 Also, note that gradient descent can only transmit information from a vertex
 to its neighbors in each iteration.
 Therefore, it takes at least 
\begin_inset Formula $\Omega(\mathrm{diameter})$
\end_inset

 time to solve the problem unless we know the solution is sparse (when 
\begin_inset Formula $L/\mu$
\end_inset

 is small).
 However, we note that this is not a lower bound for all algorithms.
\end_layout

\begin_layout Standard
The problem (
\begin_inset CommandInset ref
LatexCommand ref
reference "eq:worst_function"

\end_inset

) belongs to a general class of functions called Laplacian systems and it
 can be solved in nearly linear time using spectral graph theory.
 
\begin_inset Note Note
status open

\begin_layout Plain Layout
TODO: add citations.
\end_layout

\end_inset


\end_layout

\end_body
\end_document