Slides (cont.)
glouppe committed Oct 1, 2014
1 parent 78169c7 commit 9c2f9dd
Showing 2 changed files with 53 additions and 26 deletions.
Binary file modified slides/figures/rp-memory.pdf
79 changes: 53 additions & 26 deletions slides/slides.tex
@@ -139,7 +139,7 @@ \section{Motivation}
\end{frame}
}

-\section{Growing decision trees}
+\section{Growing decision trees and random forests}

\begin{frame}{Supervised learning}

@@ -209,7 +209,7 @@ \section{Growing decision trees}

\begin{algorithmic}
\Function{BuildDecisionTree}{${\cal L}$}
-\State Create node $t$
+\State Create node $t$ from the learning sample ${\cal L}_t = {\cal L}$
\If{the stopping criterion is met for $t$}
\State $\widehat{y}_{t} =$ some constant value
\Else
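
The hunk above touches only the node-creation line of the BuildDecisionTree pseudocode; the rest of the recursion lies outside the diff context. As a rough companion, here is a minimal Python sketch of the same greedy induction scheme, assuming a regression setting with constant (mean) leaf values and axis-aligned binary splits chosen by variance reduction. It is an illustration only, not the thesis implementation.

# Minimal sketch of greedy decision-tree induction (illustrative only).
# Assumes regression with mean predictions at the leaves and binary,
# axis-aligned splits selected by weighted variance reduction.
import numpy as np

class Node:
    def __init__(self, prediction=None, feature=None, threshold=None,
                 left=None, right=None):
        self.prediction = prediction      # constant value if the node is a leaf
        self.feature, self.threshold = feature, threshold
        self.left, self.right = left, right

def build_decision_tree(X, y, min_samples=5):
    # Stopping criterion: pure node or too few samples -> leaf with constant value.
    if len(y) < min_samples or np.all(y == y[0]):
        return Node(prediction=float(np.mean(y)))
    best = None                           # (impurity after split, feature, threshold)
    for j in range(X.shape[1]):
        for t in np.unique(X[:, j])[:-1]:
            left = X[:, j] <= t
            score = left.sum() * y[left].var() + (~left).sum() * y[~left].var()
            if best is None or score < best[0]:
                best = (score, j, t)
    if best is None:                      # no valid split found
        return Node(prediction=float(np.mean(y)))
    _, j, t = best
    left = X[:, j] <= t
    return Node(feature=j, threshold=t,
                left=build_decision_tree(X[left], y[left], min_samples),
                right=build_decision_tree(X[~left], y[~left], min_samples))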
@@ -266,7 +266,7 @@ \section{Growing decision trees}
\begin{frame}{Diagnosing the generalization error of a decision tree}

\begin{itemize}
-\item Residual error: Lowest achievable error, independent of $\varphi_{\cal L}$.
+\item (Residual error: Lowest achievable error, independent of $\varphi_{\cal L}$.)
\item Bias: Decision trees usually have {\color{blue} low bias}.
\item Variance: They often suffer from {\color{red} high variance}.
\end{itemize}
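
To make the low-bias, high-variance claim concrete, a quick empirical check along these lines (an illustrative sketch on arbitrary synthetic data, not an experiment from the slides) is to refit a fully grown tree on many independent learning sets and inspect the spread of its prediction at a fixed point:

# Refit a fully grown regression tree on many random learning sets and look at
# the spread of its prediction at a fixed test point x0 (illustrative only).
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
x0 = np.array([[0.5]])
preds = []
for _ in range(200):
    X = rng.uniform(0, 1, size=(100, 1))
    y = np.sin(4 * X[:, 0]) + 0.3 * rng.randn(100)            # noisy 1-D target
    preds.append(DecisionTreeRegressor().fit(X, y).predict(x0)[0])
print("mean prediction:", np.mean(preds))    # close to sin(2), i.e. low bias
print("spread (variance):", np.var(preds))   # non-negligible: the part averaging can reduce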
@@ -352,19 +352,20 @@ \section{Growing decision trees}
learning set and the random perturbations.
\end{itemize}

-\begin{itemize}
-\item $\rho(\mathbf{x}) \to 1$ when variance is mostly due to the learning set;
-\item $\rho(\mathbf{x}) \to 0$ when variance is mostly due to the random perturbations.
-\end{itemize}
+\bigskip
+
+$\rho(\mathbf{x}) \to 1$ when variance is mostly due to the learning set; \\
+$\rho(\mathbf{x}) \to 0$ when variance is mostly due to the random perturbations.


\end{frame}

\begin{frame}{Diagnosing the generalization error of random forests}

\begin{itemize}
\item Residual error: Lowest achievable error, independent of $\psi_{\cal L}$.
\item Bias: {\color{blue} Identical} to the bias of a single randomized tree.
-\item Variance: As $M \to \infty$, {\color{red} $\text{var}(\mathbf{x}) \to \rho(\mathbf{x}) \sigma^2_{{\cal L},\theta}(\mathbf{x})$}
+\item Variance: $\text{var}(\mathbf{x}) = \rho(\mathbf{x}) \sigma^2_{{\cal L},\theta}(\mathbf{x}) + \frac{1 - \rho(\mathbf{x})}{M} \sigma^2_{{\cal L},\theta}(\mathbf{x})$\\
+As $M \to \infty$, {\color{red} $\text{var}(\mathbf{x}) \to \rho(\mathbf{x}) \sigma^2_{{\cal L},\theta}(\mathbf{x})$}
\begin{itemize}
\item The stronger the randomization, $\rho(\mathbf{x}) \to 0$, $\text{var}(\mathbf{x}) \to 0$.
\item The weaker the randomization, $\rho(\mathbf{x}) \to 1$, $\text{var}(\mathbf{x}) \to \sigma^2_{{\cal L},\theta}(\mathbf{x})$
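
Plugging numbers into the variance decomposition above makes the limiting behaviour easy to see. The values rho = 0.3 and sigma2 = 1.0 below are arbitrary assumptions chosen only for illustration:

# Numeric illustration of var(x) = rho*sigma2 + (1 - rho)/M * sigma2
# for arbitrary assumed values rho = 0.3 and sigma2 = 1.0.
rho, sigma2 = 0.3, 1.0
for M in (1, 10, 100, 1000):
    var = rho * sigma2 + (1.0 - rho) / M * sigma2
    print(f"M = {M:4d}   var(x) = {var:.3f}")   # tends to rho*sigma2 = 0.3 as M grows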
@@ -638,7 +639,7 @@ \section{Interpreting random forests}

\bigskip

-{\color{blue}$\Rightarrow$ There can be relevant variables with zero
+{\color{red}$\Rightarrow$ There can be relevant variables with zero
importances (due to masking effects).}

\medskip
@@ -653,7 +654,7 @@ \section{Interpreting random forests}

\bigskip

-{\color{blue}$\Rightarrow$ The importance of relevant variables can be
+{\color{red}$\Rightarrow$ The importance of relevant variables can be
influenced by the number of irrelevant variables.}

\medskip
@@ -667,7 +668,7 @@ \section{Interpreting random forests}

\begin{frame}{Bias due to empirical impurity misestimations}

-For a finite learning set ${\cal L}$ of size $N$, node impurity terms $i(t)$
+For a finite learning set ${\cal L}_t$ of size $N_t$, node impurity terms $i(t)$
suffer from an empirical misestimation bias.

\bigskip
@@ -679,31 +680,27 @@ \section{Interpreting random forests}
\mathbb{E}\{ \widehat{I}(X_j; Y) \} = \frac{(|{\cal X}_j|-1)(|{\cal Y}|-1)}{2 N_t \log 2}.
\end{equation*}

-{\color{blue}$\Rightarrow$ Importances are biased towards variables of high
+{\color{red}$\Rightarrow$ Importances are biased towards variables of high
cardinality.}

\medskip

{\color{blue}$\Rightarrow$ This effect can be minimized by collecting impurity
-terms measured from large enough sample only (e.g., by stopping the
-construction of the tree early).}
+terms measured from large enough samples only} (e.g., by stopping the
+construction of the tree early).

\end{frame}
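
The cardinality bias described on this slide can be reproduced in a few lines. The sketch below is an assumed toy setup, not an experiment from the thesis: it pits an informative binary feature against a purely random feature with 100 distinct values and prints the impurity-based importances reported by scikit-learn. Note that scikit-learn grows binary trees on numeric values rather than multiway categorical splits, so the effect is related to, but not identical with, the formula above.

# Toy illustration of the cardinality bias of impurity-based importances.
# Feature 0 is informative and binary; feature 1 is pure noise with 100 values.
import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
n = 500
informative = rng.randint(0, 2, n)                 # binary, drives the label
noise_high_card = rng.randint(0, 100, n)           # 100 distinct values, irrelevant
y = informative ^ (rng.rand(n) < 0.1)              # label = informative feature with 10% flips
X = np.column_stack([informative, noise_high_card])

forest = RandomForestClassifier(n_estimators=300, random_state=0).fit(X, y)
print(forest.feature_importances_)                 # the noise feature still gets a share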

-\begin{frame}{Bias due to binary trees and threshold selection}
+\begin{frame}{Bias due to binary trees}

Random forests usually make use of binary splits instead of multiway exhaustive
splits.

\bigskip

-{\color{blue}$\Rightarrow$ $i(t)$ does not actually measure the mutual
+{\color{red}$\Rightarrow$ $i(t)$ does not actually measure the mutual
information.}

-\bigskip
-
-{\color{blue}$\Rightarrow$ Caution should be taken when interpreting the
-numerical value of importance scores.}

\end{frame}

@@ -717,15 +714,16 @@ \section{Interpreting random forests}
\end{figure}

\begin{center}
-Taking into account (some of) the biases results in a different story!
+Taking into account (some of) the biases\\
+results in {\color{red} quite a different story}!
\end{center}

\end{frame}


% Computational performance ===================================================

-\section{Computational performance}
+\section{Accelerating random forests}

\begin{frame}{Computational complexity}

@@ -831,7 +829,7 @@ \section{Computational performance}
\begin{block}{Problem}
\begin{itemize}
\item Let us assume a supervised learning problem of $N_s$ samples defined over $N_f$ features.
-Let also assume $T$ computing each with a
+Let also assume $T$ computing nodes, each with a
memory capacity limited to $M_{max}$, with $M_{max}\ll N_s \times N_f$.
\item How to best exploit the memory constraint to obtain the most accurate model, as quickly as possible?
\end{itemize}
@@ -894,13 +892,13 @@ \section{Computational performance}
useless}. The size of the random patches can be reduced without (significant)
loss in accuracy.

-\item As a result, \textcolor{red}{both memory consumption and training time
+\item As a result, \textcolor{blue}{both memory consumption and training time
can be reduced}, at low cost.

\item With strong memory constraints, RP can exploit data better than the
other methods.

-\item \textcolor{red}{Sampling features is critical to improve accuracy.}
+\item \textcolor{blue}{Sampling features is critical to improve accuracy.}
Sampling the examples only is often ineffective.
\end{itemize}
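
For readers who want to try the Random Patches idea directly, scikit-learn's BaggingClassifier trains each base tree on a random subset of both examples and features. The patch sizes below (25% of rows and columns) and the synthetic dataset are arbitrary assumptions for illustration, not the settings studied in the thesis.

# Rough Random-Patches-style ensemble: each tree sees a random subset of both
# the examples and the features (sizes here are arbitrary illustrative choices).
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=2000, n_features=100, n_informative=10,
                           random_state=0)
patches = BaggingClassifier(n_estimators=100,     # decision trees by default
                            max_samples=0.25,     # fraction of examples per patch
                            max_features=0.25,    # fraction of features per patch
                            random_state=0)
print(cross_val_score(patches, X, y, cv=3).mean())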

@@ -912,6 +910,35 @@ \section{Conclusions}
\section{Conclusions}

\begin{frame}{Conclusions}
+\begin{itemize}
+
+\item Random forests constitute one of the most {\color{blue} robust and
+effective} machine learning algorithms for many problems.
+
+\item While simple in design and easy to use, random forests remain however
+\begin{itemize}
+\item hard to analyze statistically,
+\item non-trivial to interpret,
+\item difficult to implement properly.
+\end{itemize}
+
+\item Through an in-depth re-assessment of the method, this dissertation has
+proposed {\color{blue} original contributions} on these issues.
+
+\end{itemize}
\end{frame}

+\begin{frame}{Future work}
+\begin{itemize}
+\item Theoretical characterization of variable importances in a finite setting.
+
+\item Re-analysis of empirical studies based on variable importances, in light
+of the results and conclusions of the thesis.
+
+\item Finer study of subsampling statistical mechanisms.
+
+\end{itemize}
+\end{frame}


\end{document}
