-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathquiz2.r
77 lines (74 loc) · 3.26 KB
/
quiz2.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#Q2
#Make a plot of the outcome (CompressiveStrength) versus the index of the samples.
#Color by each of the variables in the data set (you may find the cut2() function in the Hmisc package useful
#for turning continuous covariates into factors).
#What do you notice in these plots?
#------------------------------------#
library(AppliedPredictiveModeling)
data(concrete)
library(caret)
library(ggplot2)
set.seed(975)
inTrain=createDataPartition(mixtures$CompressiveStrength,p=3/4)[[1]]
training=mixtures[ inTrain,]
testing=mixtures[-inTrain,]
library(Hmisc)
cut2(training$CompressiveStrength)
ggplot()+geom_point(aes(x=seq_along(CompressiveStrength),y=CompressiveStrength,colour=cut2(Water)),data=training)
#-------------------------------------#
#Q3
#Make a histogram and confirm the SuperPlasticizer variable is skewed. Normally you might use the log transform
#to try to make the data more symmetric. Why would that be a poor choice for this variable?
#-------------------------------------#
library(AppliedPredictiveModeling)
data(concrete)
library(caret)
set.seed(975)
inTrain=createDataPartition(mixtures$CompressiveStrength,p=3/4)[[1]]
training=mixtures[inTrain,]
testing=mixtures[-inTrain,]
hist(log(training$Superplasticizer))
#-------------------------------------#
#Q4
#Find all the predictor variables in the training set that begin with IL. Perform principal components
#on these variables with the preProcess() function from the caret package.
#Calculate the number of principal components needed to capture 90% of the variance. How many are there?
#-------------------------------------#
library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData=data.frame(diagnosis,predictors)
inTrain=createDataPartition(adData$diagnosis,p=3/4)[[1]]
training=adData[inTrain,]
testing=adData[-inTrain,]
dt=training[,grepl("IL",colnames(training))]
dt=subset(dt,select=-13)
pre=preProcess(dt,thresh=0.9,method="pca")
pre
#-------------------------------------#
#Q5
#Create a training data set consisting of only the predictors with variable names beginning with IL and the diagnosis.
#Build two predictive models, one using the predictors as they are and one using PCA with principal components explaining
#80% of the variance in the predictors.
#Use method="glm" in the train function. What is the accuracy of each method in the test set? Which is more accurate?
#-------------------------------------#
library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData=data.frame(diagnosis,predictors)
inTrain=createDataPartition(adData$diagnosis,p=3/4)[[1]]
training=adData[inTrain,]
testing=adData[-inTrain,]
dt=training[,grepl("IL",colnames(training))]
dt=subset(dt,select=-13)
pre=preProcess(dt,thresh=0.8,method="pca") # find PCA that containing 80% of the variance in the predictors
trainP=predict(pre,dt) #transform training data
df=train(training$diagnosis~.,method="glm",data=trainP) #fit a glm model
dt.t=testing[,grepl("IL",colnames(testing))] #transform testing data
dt.t=subset(dt.t,select=-13)
testP=predict(pre,dt.t)
confusionMatrix(testing$diagnosis,predict(df,testP)) #confusion matrix of using PCA
df=train(training$diagnosis~.,method="glm",data=dt)
confusionMatrix(testing$diagnosis,predict(df,testing)) #confusion matrix not using PCA