diff --git a/slides/figures/tree-partition-d.pdf b/slides/figures/tree-partition-d.pdf
new file mode 100644
index 0000000..3a4093a
Binary files /dev/null and b/slides/figures/tree-partition-d.pdf differ
diff --git a/slides/figures/tree-simple.pdf b/slides/figures/tree-simple.pdf
index 1d88f73..5fa568f 100644
Binary files a/slides/figures/tree-simple.pdf and b/slides/figures/tree-simple.pdf differ
diff --git a/slides/slides.tex b/slides/slides.tex
index c96e16a..97784a4 100644
--- a/slides/slides.tex
+++ b/slides/slides.tex
@@ -16,6 +16,7 @@
 \usepackage{eulervm}
 \usepackage{auto-pst-pdf}
 \usepackage{pst-plot}
+\usepackage{multirow}
 
 \hypersetup{colorlinks=true, linkcolor=black, urlcolor=blue}
 
@@ -54,6 +55,8 @@
 
 \begin{document}
 
+\renewcommand{\inserttotalframenumber}{39}
+
 % Title page ==================================================================
 
 \begin{frame}
@@ -127,6 +130,14 @@ \section{Motivation}
 
 % Supervised learning =========================================================
 
+\begin{frame}
+ \frametitle{Outline}
+ \tableofcontents
+\end{frame}
+
+
+\section{Growing decision trees and random forests}
+
 \AtBeginSection[]
 {
 \begin{frame}
@@ -136,8 +147,6 @@ \section{Motivation}
 \end{frame}
 }
 
-\section{Growing decision trees and random forests}
-
 \begin{frame}{Supervised learning}
 
 \begin{itemize}
@@ -195,9 +204,20 @@ \section{Growing decision trees and random forests}
 \end{frame}
 
 \begin{frame}{Decision trees}
+\begin{columns}
+\begin{column}{0.3\textwidth}
+\begin{figure}
+\includegraphics[width=\textwidth]{./figures/tree-partition-d.pdf}
+\end{figure}
+\end{column}
+\begin{column}{0.7\textwidth}
 \begin{figure}
-\includegraphics[scale=0.5]{./figures/tree-simple.pdf}
+\includegraphics[width=\textwidth]{./figures/tree-simple.pdf}
 \end{figure}
+\end{column}
+\end{columns}
+
+\vspace{1cm}
 
 $t \in \varphi$: nodes of the tree $\varphi$\\
 $X_t$: split variable at $t$ \\
@@ -285,43 +305,28 @@ \section{Growing decision trees and random forests}
 \includegraphics[scale=0.5]{./figures/forest.pdf}
 \end{figure}
 
-$\psi(\mathbf{x}) = \argmax_{c \in {\cal Y}} \frac{1}{M} \sum_{m=1}^M p_{\varphi_m}(Y = c | X = \mathbf{x})$ \\
-Randomization:
-\begin{itemize}
-\item Bagging
-\item Random selection of $K \leq p$ candidate split variables
-\end{itemize}
-\end{frame}
+Randomization
 
-\begin{frame}{Condorcet's jury theorem}
-
-\begin{columns}
-\begin{column}{0.65\textwidth}
-Let consider a group of $M$ voters.
-
-\vspace{1cm}
-
-If each voter has an independent probability $p > \tfrac{1}{2}$ of voting for
-the correct decision, then adding more voters increases the probability of the
-majority decision to be correct.
+\vspace{0.1cm}
 
-\vspace{1cm}
+{\scriptsize
+\begin{tabular}{lll}
 
-When $M \to \infty$, the probability that the
-decision taken by the group is correct approaches $1$.
-
-\end{column}
-
-\begin{column}{0.35\textwidth}
-\begin{figure}
- \includegraphics[scale=1.0]{./figures/condorcet.png}
-\end{figure}
-\end{column}
-\end{columns}
+\textbullet\hspace*{0.1cm} Bootstrap samples & \multirow{2}{*}{{\LARGE \}} {\color{blue} Random Forests}} & \\
+\textbullet\hspace*{0.1cm} Random selection of $K \leq p$ split variables && \multirow{2}{*}{{\LARGE \}} {\color{blue} Extra-Trees}} \\
+\textbullet\hspace*{0.1cm} Random selection of the threshold & \\
+\end{tabular}}
+% {\scriptsize
+% \begin{itemize}
+% \item Bootstrap samples
+% \item Random selection of $K \leq p$ candidate split variables
+% \item Random selection of the threshold
+% \end{itemize}}
 \end{frame}
 
+
 \begin{frame}{Bias-variance decomposition (cont.)}
 
 {\bf Theorem.}
@@ -343,27 +348,6 @@ \section{Growing decision trees and random forests}
 
 \end{frame}
 
-\begin{frame}{Interpretation of $\rho(\mathbf{x})$ {\scriptsize (Louppe, 2014)}}
-
-{\bf Theorem.} $\rho(\mathbf{x}) = \frac{\mathbb{V}_{\cal L} \{ \mathbb{E}_{\theta|{\cal L}} \{ \varphi_{{\cal L},\theta}(\mathbf{x}) \} \}}{\mathbb{V}_{\cal L} \{ \mathbb{E}_{\theta|{\cal L}} \{ \varphi_{{\cal L},\theta}(\mathbf{x}) \} \} + \mathbb{E}_{\cal L} \{ \mathbb{V}_{\theta|{\cal L}} \{ \varphi_{{\cal L},\theta}(\mathbf{x}) \} \}}$
-
-\vspace{1cm}
-
-In other words, it is the ratio between
-\begin{itemize}
-\item the variance due to the learning set and
-\item the total variance, accounting for random effects due to both the
- learning set and the random perburbations.
-\end{itemize}
-
-\bigskip
-
-$\rho(\mathbf{x}) \to 1$ when variance is mostly due to the learning set; \\
-$\rho(\mathbf{x}) \to 0$ when variance is mostly due to the random perturbations;\\
-$\rho(\mathbf{x}) \geq 0$.
-
-
-\end{frame}
 
 \begin{frame}{Diagnosing the generalization error of random forests}
@@ -409,28 +393,6 @@ \section{Growing decision trees and random forests}
 
 \end{frame}
 
-\begin{frame}{Strengths and weaknesses of random forests}
-
-\begin{itemize}
-\item Very good accuracy
-\item Universal approximation
-\item Robustness to outliers
-\item Robustness to irrelevant attributes (to some extent)
-\item Invariance to scaling of inputs
-\item Good computational efficiency and scalability
-
-\bigskip
-
-\item<2> \alert{Loss of interpretability with respect to single decision trees}
-\end{itemize}
-
-\uncover<2>{
-\begin{figure}
- \includegraphics[scale=0.5]{./figures/blackbox.jpg}
-\end{figure}
-}
-
-\end{frame}
 
 % Bias-variance ===============================================================
 
@@ -440,6 +402,10 @@ \section{Interpreting random forests}
 
 \begin{frame}{Variable importances}
 
+\begin{figure}
+ \includegraphics[scale=0.3]{./figures/blackbox.jpg}
+\end{figure}
+
 \begin{itemize}
 
 \item Interpretability can be recovered through {\color{blue}variable importances}
@@ -461,7 +427,7 @@ \section{Interpreting random forests}
 \item It is faster to compute;
 \item It does not require to use bootstrap sampling;
 \item In practice, it correlates well with the MDA
- measure (except in specific conditions).
+ measure.
 \end{itemize}
 
 \end{itemize}
@@ -629,12 +595,6 @@ \section{Interpreting random forests}
 
 \vspace{1cm}
 
-\textbf{Definition} {\scriptsize (Kohavi \& John, 1997)}. A variable $X$ is
-{\it irrelevant} (to $Y$ with respect to $V$) if, for all $B\subseteq V$,
-$I(X;Y|B)=0$. A variable is {\it relevant} if it is not irrelevant.
-
-\vspace{1cm}
-
 {\bf Theorem.} A variable $X_j$ is irrelevant if and only if $\text{Imp}(X_j)= 0$.
 
 \vspace{1cm}
 
 {\color{blue} $\Rightarrow$ The importance of a relevant variable is insensitive
 to the addition or the removal of irrelevant variables.}
 
-\end{frame}
+\vspace{1cm}
 
-\begin{frame}{Bias due to masking effects}
-Most properties are lost as soon as $K>1$.
+{\scriptsize \textbf{Definition} {\scriptsize (Kohavi \& John, 1997)}. A variable $X$ is
+{\it irrelevant} (to $Y$ with respect to $V$) if, for all $B\subseteq V$,
+$I(X;Y|B)=0$. A variable is {\it relevant} if it is not irrelevant.}
 
-\bigskip
+\end{frame}
 
-{\color{red}$\Rightarrow$ There can be relevant variables with zero
-importances (due to masking effects).}
+\begin{frame}{Relaxing assumptions}
 
-\medskip
+\begin{block}{When trees are not totally random...}
 
-{\footnotesize
-Example:\\
-$I(X_1;Y)=H(Y)$, $I(X_1;Y) \approx I(X_2;Y)$, $I(X_1;Y|X_2)=\epsilon$ and $I(X_2;Y|X_1)=0$
 \begin{itemize}
-\item $K=1 \rightarrow \text{Imp}_{K=1}(X_1) \approx \frac{1}{2} I(X_1;Y)+\epsilon$ and $\text{Imp}_{K=1}(X_2)\approx \frac{1}{2} I(X_1;Y)$
-\item $K=2 \rightarrow \text{Imp}_{K=2}(X_1)=I(X_1;Y)$ and $\text{Imp}_{K=2}(X_2)=0$
-\end{itemize}}
-
-\bigskip
-
-{\color{red}$\Rightarrow$ The importance of relevant variables can be
-influenced by the number of irrelevant variables.}
+\item There can be {\color{red} relevant variables with zero
+importances} (due to masking effects).
+\item The importance of relevant variables can be
+{\color{red} influenced by the number of irrelevant variables}.
+\end{itemize}
 
-\medskip
+\end{block}
 
-{\footnotesize
+\begin{block}{When the learning set is finite...}
 \begin{itemize}
-\item $K=2$ and we add a new irrelevant variable $X_3$ $\rightarrow$ $\text{Imp}_{K=2}(X_2)>0$
-\end{itemize}}
+\item Importances are {\color{red} biased towards variables of high
+cardinality}.
 
-\end{frame}
-
-\begin{frame}{Bias due to empirical impurity misestimations}
-
-For a finite learning set ${\cal L}_t$ of size $N_t$, node impurity terms $i(t)$
-suffer from an empirical misestimation bias.
-
-\bigskip
-
-If $X_j$ and $Y$ are independent random variables, then the expected value of
-the finite sample size estimates is:
-
-\begin{equation*}
-\mathbb{E}\{ \widehat{I}(X_j; Y) \} = \frac{(|{\cal X}_j|-1)(|{\cal Y}|-1)}{2 N_t \log 2}.
-\end{equation*}
-
-{\color{red}$\Rightarrow$ Importances are biased towards variables of high
-cardinality.}
-
-\medskip
+\item This effect can be minimized by collecting impurity
+terms measured from large enough samples only.
+\end{itemize}
+\end{block}
 
-{\color{blue}$\Rightarrow$ This effect can be minimized by collecting impurity
-terms measured from large enough sample only} (e.g., by stopping the
-construction of the tree early).
+\begin{block}{When splits are not multiway...}
+\begin{itemize}
+\item $i(t)$ does not actually measure the mutual information.
+\end{itemize}
+\end{block}
 
 \end{frame}
 
-\begin{frame}{Bias due to binary trees}
-
-Random forests usually make use of binary splits instead of multiway exhaustive
-splits.
-
-\bigskip
-
-{\color{red}$\Rightarrow$ $i(t)$ does not actually measure the mutual
-information.}
-
-
-\end{frame}
 
 \begin{frame}{Back to our example}
@@ -734,27 +662,7 @@ \section{Interpreting random forests}
 
 % Computational performance ===================================================
 
-\section{Inplementing and accelerating random forests}
-
-\begin{frame}{Computational complexity {\scriptsize (Louppe, 2014)}}
-
-\begin{table}
- \centering
- \begin{tabular}{| c | c |}
- \hline
- & \textit{Average case} \\
- \hline
- \hline
- CART & $\Theta(pN\log^2 N)$ \\
- Random Forest & $\Theta(MK\widetilde{N}\log^2 \widetilde{N})$ \\
- Extra-Trees & $\Theta(MKN\log N)$ \\
- \hline
- \end{tabular}
- \caption{Time complexity for building forests of $M$ randomized trees. $N$ denotes the number of samples in ${\cal L}$, $p$ the number of input variables and $K$ the number of variables randomly drawn at each node. $\widetilde{N} = 0.632 N$.}
- \label{table:complexity-fit}
-\end{table}
-
-\end{frame}
+\section{Implementing and accelerating random forests}
 
 \begin{frame}{Implementation {\scriptsize (Buitinck et al., 2013)}}
 
@@ -823,6 +731,35 @@ \section{Inplementing and accelerating random forests}
 \end{figure}
 \end{frame}
 
+
+\begin{frame}{Computational complexity {\scriptsize (Louppe, 2014)}}
+
+\begin{table}
+ \centering
+ \begin{tabular}{| c | c |}
+ \hline
+ & \textit{Average time complexity} \\
+ \hline
+ \hline
+ CART & $\Theta(pN\log^2 N)$ \\
+ Random Forest & $\Theta(MK\widetilde{N}\log^2 \widetilde{N})$ \\
+ Extra-Trees & $\Theta(MKN\log N)$ \\
+ \hline
+ \end{tabular}
+\end{table}
+
+\begin{center}
+ \begin{itemize}
+ \item $N$: number of samples in ${\cal L}$
+ \item $p$: number of input variables
+ \item $K$: the number of variables randomly drawn at each node
+ \item $\widetilde{N} = 0.632 N$.
+ \end{itemize}
+\end{center}
+
+\end{frame}
+
+
 \begin{frame}{Improving scalability through randomization}
 
 \begin{block}{Motivation}
@@ -929,7 +866,7 @@ \section{Conclusions}
 \item While simple in design and easy to use, random forests remain however
 \begin{itemize}
 {\color{red}
- \item hard to analyze statistically,
+ \item hard to analyze theoretically,
 \item non-trivial to interpret,
 \item difficult to implement properly.}
 \end{itemize}
@@ -968,4 +905,73 @@ \section{Conclusions}
 
 \end{frame}
 
+\appendix
+
+\begin{frame}
+\begin{center}
+{\Huge Questions?}
+\end{center}
+\end{frame}
+
+
+\begin{frame}
+
+\begin{center}
+{\it Backup slides}
+\end{center}
+
+\end{frame}
+
+\begin{frame}{Condorcet's jury theorem}
+
+\begin{columns}
+\begin{column}{0.65\textwidth}
+Let us consider a group of $M$ voters.
+
+\vspace{1cm}
+
+If each voter has an independent probability $p > \tfrac{1}{2}$ of voting for
+the correct decision, then adding more voters increases the probability of the
+majority decision to be correct.
+
+\vspace{1cm}
+
+When $M \to \infty$, the probability that the
+decision taken by the group is correct approaches $1$.
+
+\end{column}
+
+\begin{column}{0.35\textwidth}
+\begin{figure}
+ \includegraphics[scale=1.0]{./figures/condorcet.png}
+\end{figure}
+\end{column}
+\end{columns}
+
+\end{frame}
+
+
+\begin{frame}{Interpretation of $\rho(\mathbf{x})$ {\scriptsize (Louppe, 2014)}}
+
+{\bf Theorem.} $\rho(\mathbf{x}) = \frac{\mathbb{V}_{\cal L} \{ \mathbb{E}_{\theta|{\cal L}} \{ \varphi_{{\cal L},\theta}(\mathbf{x}) \} \}}{\mathbb{V}_{\cal L} \{ \mathbb{E}_{\theta|{\cal L}} \{ \varphi_{{\cal L},\theta}(\mathbf{x}) \} \} + \mathbb{E}_{\cal L} \{ \mathbb{V}_{\theta|{\cal L}} \{ \varphi_{{\cal L},\theta}(\mathbf{x}) \} \}}$
+
+\vspace{1cm}
+
+In other words, it is the ratio between
+\begin{itemize}
+\item the variance due to the learning set and
+\item the total variance, accounting for random effects due to both the
+ learning set and the random perturbations.
+\end{itemize}
+
+\bigskip
+
+$\rho(\mathbf{x}) \to 1$ when variance is mostly due to the learning set; \\
+$\rho(\mathbf{x}) \to 0$ when variance is mostly due to the random perturbations;\\
+$\rho(\mathbf{x}) \geq 0$.
+
+
+\end{frame}
+
+
 \end{document}
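The cardinality bias summarized on the new "Relaxing assumptions" slide is easy to reproduce empirically. The sketch below is illustrative only and sits outside the slide sources: it assumes scikit-learn's RandomForestClassifier and its MDI-based feature_importances_ attribute; the feature names, sample size, and number of trees are arbitrary choices, not part of the slides.

import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(42)
N = 1000

# One genuinely informative binary variable, plus two irrelevant variables
# that differ only in their number of distinct values.
x_informative = rng.randint(0, 2, size=N)
x_noise_binary = rng.randint(0, 2, size=N)      # irrelevant, 2 values
x_noise_highcard = rng.randint(0, 200, size=N)  # irrelevant, ~200 values

# The target follows the informative variable 90% of the time.
y = np.where(rng.rand(N) < 0.9, x_informative, 1 - x_informative)
X = np.column_stack([x_informative, x_noise_binary, x_noise_highcard])

forest = RandomForestClassifier(n_estimators=500, random_state=0).fit(X, y)

# MDI importances: the irrelevant high-cardinality variable typically receives
# a visibly larger share than the irrelevant binary one, illustrating the bias
# of impurity-based importances towards variables of high cardinality.
for name, imp in zip(["informative", "noise (2 values)", "noise (200 values)"],
                     forest.feature_importances_):
    print(f"{name:>20}: {imp:.3f}")

In line with the slide's mitigation remark, requiring larger nodes (for example via scikit-learn's min_samples_leaf parameter) collects impurity terms from larger samples only and shrinks the spurious importance of the high-cardinality variable.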