Commit: R PCA

duchesnay committed Sep 9, 2016
1 parent ea04dc8 commit 0ca1d53
Showing 6 changed files with 384 additions and 301 deletions.
137 changes: 101 additions & 36 deletions R/ml_dimensionality_reduction.R
@@ -1,57 +1,122 @@
######
## PCA
######

# Write a class `BasicPCA` with two methods: `fit(X)`, which estimates the data
# mean and the principal component directions, and `transform(X)`, which
# projects new data onto the principal components.
#
# Check that your `BasicPCA` performs similarly to the one from sklearn:
# `from sklearn.decomposition import PCA`


BasicPCA <- function(X){
obj = list()
Xc <- scale(X, center=TRUE, scale=FALSE)
obj$mean <- attr(Xc, "scaled:center")
s <- svd(Xc, nu = 0)
# v: a [P x K] matrix whose columns contain the right singular vectors of x
obj$V = s$v
obj$var = 1 / (nrow(X) - 1) * s$d ^ 2  # variance along each component
return(obj)
}

BasicPCA.transform <- function(obj, X){
# Center with the training mean, then project onto the components
scale(X, obj$mean, FALSE) %*% obj$V
}
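The exercise statement references sklearn; as a cross-check, here is a minimal numpy sketch of the same `fit`/`transform` logic (a hypothetical Python counterpart of the R code above, written for illustration):

```python
import numpy as np

class BasicPCA:
    """Minimal PCA: fit() learns the mean and component directions via SVD,
    transform() centers new data and projects it onto those directions."""

    def fit(self, X):
        self.mean_ = X.mean(axis=0)
        Xc = X - self.mean_                    # center, as scale() does in R
        _, d, Vt = np.linalg.svd(Xc, full_matrices=False)
        self.V_ = Vt.T                         # columns = right singular vectors
        self.var_ = d ** 2 / (X.shape[0] - 1)  # variance of each component
        return self

    def transform(self, X):
        return (X - self.mean_) @ self.V_
```

Against `sklearn.decomposition.PCA`, the projections should agree up to the sign of each component, since principal directions are only defined up to sign.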

# https://tgmstat.wordpress.com/2013/11/28/computing-and-visualizing-pca-in-r/
# dataset
n_samples = 10
experience = rnorm(n_samples)
salary = 1500 + experience + .5 * rnorm(n_samples)
other = rnorm(n_samples)
X = cbind(experience, salary, other)

Xcs = scale(X, center=TRUE, scale=FALSE)
attr(Xcs, "scaled:center") = NULL
attr(Xcs, "scaled:scale") = NULL

basic_pca = BasicPCA(Xcs)
obj = basic_pca
BasicPCA.transform(basic_pca, Xcs)

# PCA with prcomp
pca = prcomp(Xcs, center=TRUE, scale.=FALSE)
names(pca)
object = pca

# Directions match up to sign (and floating-point precision)
all.equal(abs(pca$rotation), abs(basic_pca$V), check.attributes=FALSE)

all.equal(abs(predict(pca, Xcs)), abs(BasicPCA.transform(basic_pca, Xcs)),
          check.attributes=FALSE)

predict(pca, Xcs) - BasicPCA.transform(basic_pca, Xcs)

cor(predict(pca, Xcs), BasicPCA.transform(basic_pca, Xcs))

# Manual projection, equivalent to predict(pca, newdata); the new data must be
# centered/scaled like the training data, so we reuse Xcs here
newdata = Xcs
scale(newdata, object$center, object$scale) %*% object$rotation

# "https://raw.github.com/neurospin/pystatsml/master/data/iris.csv"
#
# Describe the data set. Should the dataset be standardized?
#
# Retrieve the explained variance ratio. Determine $K$, the number of components.
#
# Print the $K$ principal component directions and the correlation of the $K$
# principal components with the original variables. Interpret the contribution
# of the original variables to the PCs.
#
# Plot the samples projected onto the first $K$ PCs.
#
# Color samples by their species.
#

df = read.csv("../data/iris.csv")

# Describe the data set. Should the dataset be standardized?
summary(df)

# sepal_length sepal_width petal_length petal_width species
# Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100 setosa :50
# 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 versicolor:50
# Median :5.800 Median :3.000 Median :4.350 Median :1.300 virginica :50
# Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
# 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
# Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500

numcols = colnames(df)[unlist(lapply(df, is.numeric))]

# Describe the structure of correlation among variables.
cor(X)

# Keep the numeric columns and inspect their standard deviations (do scales differ?)
X = df[, numcols]
apply(X, 2, sd)

Xcs = scale(X, center=TRUE, scale=TRUE)
attr(Xcs, "scaled:center") = NULL
attr(Xcs, "scaled:scale") = NULL
apply(Xcs, 2, sd)
apply(Xcs, 2, mean)
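The iris variables have different spreads, so the script standardizes them before the PCA. The same centering/scaling step can be sketched in Python (an illustrative helper, with `ddof=1` matching R's sample `sd`):

```python
import numpy as np

def standardize(X):
    """Center each column and scale it to unit variance,
    like R's scale(X, center=TRUE, scale=TRUE)."""
    return (X - X.mean(axis=0)) / X.std(axis=0, ddof=1)  # ddof=1: sample sd, as in R
```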

# Compute a PCA with the maximum number of components.
pca = prcomp(Xcs)

# Explained variance ratio of each component, and its cumulative sum
(pca$sdev ** 2) / sum(pca$sdev ** 2)
cumsum(pca$sdev ** 2) / sum(pca$sdev ** 2)
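The two lines above compute the per-component and cumulative explained variance ratios; the same computation, plus a rule for picking $K$ from a cumulative threshold, can be sketched in Python (helper names are illustrative):

```python
import numpy as np

def explained_variance_ratio(X):
    """Fraction of total variance carried by each principal component."""
    Xc = X - X.mean(axis=0)
    d = np.linalg.svd(Xc, compute_uv=False)  # singular values, descending
    return d ** 2 / (d ** 2).sum()

def choose_k(ratios, threshold=0.95):
    """Smallest K whose cumulative explained variance reaches the threshold."""
    return int(np.searchsorted(np.cumsum(ratios), threshold)) + 1
```

On standardized iris data the first two components carry most of the variance, which is why the script settles on $K = 2$.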

# K = 2
names(pca)
pca$rotation

# Correlation of the first two PCs with the original variables
PC = predict(pca, Xcs)
t(cor(Xcs, PC[, 1:2]))
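The `t(cor(Xcs, PC[, 1:2]))` call correlates the first two PCs with the original variables; an equivalent numpy sketch (the function name is illustrative, not part of the script):

```python
import numpy as np

def pc_variable_correlations(X, k=2):
    """Correlation of the first k principal components with each original
    variable. Rows index components, columns index variables, matching
    t(cor(Xcs, PC[, 1:k])) in R."""
    Xc = X - X.mean(axis=0)
    _, _, Vt = np.linalg.svd(Xc, full_matrices=False)
    PC = Xc @ Vt.T[:, :k]                    # project onto first k directions
    return np.array([[np.corrcoef(PC[:, j], X[:, i])[0, 1]
                      for i in range(X.shape[1])] for j in range(k)])
```

When all components are kept, the squared correlations for each variable sum to one, which makes these values easy to read as each variable's contribution to the PCs.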

# Plot samples projected onto the first two PCs, colored by species
library(ggplot2)
pc_df = data.frame(PC1=PC[, 1], PC2=PC[, 2], species=df$species)
ggplot(pc_df, aes(PC1, PC2, color=species)) + geom_point()
14 changes: 12 additions & 2 deletions notebooks/ml_dimensionality_reduction.ipynb
@@ -290,7 +290,11 @@
" &=\\mathbf{V}\\mathbf{D}^2\\mathbf{V}^T\n",
"\\end{align*}.\n",
"\n",
"It turns out that if you have done the singular value decomposition then you already have the Eigen value decomposition for $\\mathbf{X}^T\\mathbf{X}$:\n",
"- The right singular vectors $\\mathbf{V}$ of $\\mathbf{X}$ are equivalent to the eigenvectors of $\\mathbf{X}^T\\mathbf{X}$.\n",
"- The singular values $d_k$ of $\\mathbf{X}$ are equal to the square roots of the eigenvalues $\\lambda_k$ of $\\mathbf{X}^T\\mathbf{X}$.\n",
"\n",
"Moreover, computing PCA with the SVD does not require forming the matrix $\\mathbf{X}^T\\mathbf{X}$, so the SVD is now the standard way to compute a principal components analysis from a data matrix, unless only a handful of components are required.\n",
"\n",
"#### PCA outputs\n",
"\n",
@@ -403,16 +407,22 @@
"### 1. Write a basic PCA class\n",
"\n",
"Write a class `BasicPCA` with two methods:\n",
"- `fit(X)` that estimates the data mean, principal components directions $\\textbf{V}$ and the explained variance of each component.\n",
"\n",
"- `transform(X)` that projects new data onto the principal components.\n",
"\n",
"Check that your `BasicPCA` performs similarly to the one from sklearn.\n",
"\n",
"### Apply your sklearn PCA to the `iris` dataset available at: \n",
"\n",
"'https://raw.github.com/neurospin/pystatsml/master/data/iris.csv'\n",
"\n",
"- Describe the data set. Should the dataset be standardized?\n",
"\n",
"- Describe the structure of correlation among variables.\n",
"\n",
"- Compute a PCA with the maximum number of components.\n",
"\n",
"- Retrieve the explained variance ratio. Determine $K$ the number of components.\n",
"\n",
"- Print the $K$ principal component directions and the correlation of the $K$ principal components with the original variables. Interpret the contribution of the original variables to the PCs.\n",