Skip to content

Commit 29c989b

Browse files
committed
Merge pull request Vitek-Lab#5 from mikecheng1987/master
Add the function FeatureSelection
2 parents 48ef355 + 6f5c589 commit 29c989b

File tree

2 files changed

+235
-6
lines changed

2 files changed

+235
-6
lines changed

R/FeatureSelection.R

+230
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
2+
###########Function of FeatureSelection###########
3+
4+
FeatureSelection <- function(work, featureSelectionGoal='TPR.TNR'){
5+
6+
if(featureSelectionGoal='TPR.TNR'){ #1.1
7+
8+
#Label-free experiments (SWATH)
9+
if(nlevels(work$LABEL)==1 & length(unique(work$TRANSITION))>1){ #1.2
10+
11+
N.Prot <- nlevels(work$PROTEIN)
12+
13+
#Create the data frame storing the selected features of each Protein
14+
FeatureSelection.Out <- data.frame(Protein=vector(), Peptide=vector(), Feature=vector())
15+
Index.FS <- 0
16+
17+
#Conduct the feature selection for each protein
18+
for(i in 1:N.Prot){ #1.3
19+
20+
tem <- subset(work, PROTEIN==unique(PROTEIN)[i])
21+
N.Pep <- length(unique(tem$PEPTIDE))
22+
#Create a data frame storing the assessment of the noise for each peptide
23+
Pep.Out <- data.frame(Peptide=rep(NA, N.Pep), Model.Based.Error=rep(NA,N.Pep))
24+
25+
#Also, create the data frames storing the retained features in each peptide
26+
Retained.Feature <- data.frame(Peptide=vector(), Feature=vector()); Index.f <- 0
27+
28+
#In each peptide, select the max(Top 3, Top 40%) of the features based on the within-group variation
29+
#Then within-group variation is calculated, after blocking the features, for each peptide
30+
for(j in 1:N.Pep){ #1.4
31+
32+
tem2 <- subset(tem, PEPTIDE==unique(PEPTIDE)[j])
33+
tem2$FEATURE <- factor(tem2$FEATURE, levels=unique(tem2$FEATURE))
34+
#Determine how many features to pick in this peptide
35+
N.Feature <- length(unique(tem2$FEATURE)); Top40 <- round(N.Feature*0.4)
36+
N.Select <- max(3, Top40)
37+
#Create a data frame storing the variance(error) for each feature
38+
F.Out <- data.frame(Feature=rep(NA, N.Feature), Error=rep(NA, N.Feature))
39+
40+
#The peptide with less than 3 fragments will not be considered.
41+
if(N.Feature>=N.Select){ #1.4.1
42+
#The features are ranked by the within-group variation
43+
for(k in 1:N.Feature){ #1.5
44+
45+
tem.f <- subset(tem2, FEATURE==unique(FEATURE)[k])
46+
LM.f <- lm(ABUNDANCE ~ GROUP, data=tem.f)
47+
F.Out[k, 'Feature'] <- as.character(unique(tem.f$FEATURE))
48+
F.Out[k, 'Error'] <- summary(LM.f)$sigma
49+
50+
} #1.5
51+
52+
F.Out$Rank <- rank(F.Out$Error)
53+
54+
#We work on the max(Top3, Top40) features of this peptide
55+
Keep.Feature <- F.Out[F.Out$Rank <= N.Select,'Feature']
56+
tem2.2 <- subset(tem2, FEATURE %in% Keep.Feature)
57+
58+
#Compute the with-group variation, after blocking for features, of this peptide
59+
tem2.2$FEATURE <- factor(tem2.2$FEATURE, levels=unique(tem2.2$FEATURE))
60+
LM.Pep <- lm(ABUNDANCE ~ FEATURE + GROUP, data=tem2.2)
61+
62+
Pep.Out[j, 'Peptide'] <- as.character(unique(tem2.2$PEPTIDE))
63+
Pep.Out[j, 'Model.Based.Error'] <- summary(LM.Pep)$sigma
64+
65+
#Store which fragments were retained in this peptide
66+
Retained.Feature[(Index.f+1):(Index.f+N.Select), 'Peptide'] <- as.character(unique(tem2$PEPTIDE))
67+
Retained.Feature[(Index.f+1):(Index.f+N.Select), 'Feature'] <- Keep.Feature
68+
69+
#Update the current index at 'Retained.Feature'
70+
Index.f <- Index.f+N.Select
71+
72+
} else{
73+
74+
#The case of peptides with less than 3 features
75+
if(N.Feature > 1){ #1.4.2
76+
77+
LM.Pep2 <- lm(ABUNDANCE ~ FEATURE + GROUP, data=tem2)
78+
Pep.Out[j, 'Peptide'] <- as.character(unique(tem2$PEPTIDE))
79+
#Penalize the peptides having less than 3 features
80+
Pep.Out[j, 'Model.Based.Error'] <- (1.5)*summary(LM.Pep2)$sigma
81+
82+
#Store which fragments were retained in this peptide
83+
Retained.Feature[(Index.f+1):(Index.f+N.Feature), 'Peptide'] <- as.character(unique(tem2$PEPTIDE))
84+
Retained.Feature[(Index.f+1):(Index.f+N.Feature), 'Feature'] <- as.character(unique(tem2$FEATURE))
85+
86+
#Update the current index at 'Retained.Feature'
87+
Index.f <- Index.f+N.Feature
88+
89+
} else{
90+
91+
LM.Pep2 <- lm(ABUNDANCE ~ GROUP, data=tem2)
92+
Pep.Out[j, 'Peptide'] <- as.character(unique(tem2$PEPTIDE))
93+
Pep.Out[j, 'Model.Based.Error'] <- 2*summary(LM.Pep2)$sigma
94+
95+
#Store which fragments were retained in this peptide
96+
Retained.Feature[(Index.f+1):(Index.f+N.Feature), 'Peptide'] <- as.character(unique(tem2$PEPTIDE))
97+
Retained.Feature[(Index.f+1):(Index.f+N.Feature), 'Feature'] <- as.character(unique(tem2$FEATURE))
98+
99+
#Update the current index at 'Retained.Feature'
100+
Index.f <- Index.f+N.Feature
101+
102+
103+
} #1.4.2
104+
105+
warning(paste('The Peptide', unique(tem2$PEPTIDE), 'is likely to be removed since it has less than 3 fragments. This is our rule imposed in the SWATH experiment.', sep=' '))
106+
107+
} #1.4.1
108+
109+
} #1.4
110+
111+
112+
###Select the best one-fourth of the peptide now. (Deciding the optimal number of the peptides chosen takes too much time, but we are working on reducing its computation time.)
113+
N.Select.Pep <- max(round(dim(Pep.Out)[1]/4), 1)
114+
Pep.Out$Rank <- rank(Pep.Out$Model.Based.Error)
115+
Selected.Pep <- Pep.Out[Pep.Out$Rank <= N.Select.Pep, 'Peptide']
116+
###Pull out the selected features in this peptide
117+
Selected.Feature <- subset(Retained.Feature, Peptide %in% Selected.Pep)
118+
119+
###Put the selected features (peptides) of this proteins into the desinated data frame
120+
N.F <- dim(Selected.Feature)[1]
121+
FeatureSelection.Out[(Index.FS+1):(Index.FS+N.F), 'Protein'] <- as.character(unique(tem$PROTEIN))
122+
FeatureSelection.Out[(Index.FS+1):(Index.FS+N.F), 'Peptide'] <- Selected.Feature$Peptide
123+
FeatureSelection.Out[(Index.FS+1):(Index.FS+N.F), 'Feature'] <- Selected.Feature$Feature
124+
125+
Index.FS <- Index.FS+N.F
126+
} #1.3 (end of loop for proteins)
127+
128+
###If not all of the peptides are proteotypic, some proteins may have the same peptides
129+
work$Protein_Feature <- paste(work$PROTEIN, work$FEATURE, sep='_')
130+
FeatureSelection.Out$Protein_Feature <- paste(FeatureSelection.Out$Protein, FeatureSelection.Out$Feature, sep='_')
131+
132+
###Create the label for selected, or removed, features
133+
work$Filter <- NA;
134+
work[work$Protein_Feature %in% unique(FeatureSelection.Out$Protein_Feature), 'Filter'] <- 'Selected'
135+
work[!(work$Protein_Feature %in% unique(FeatureSelection.Out$Protein_Feature)), 'Filter'] <- 'Flagged'
136+
work$Protein_Feature <- NULL
137+
138+
###If we want to remove the bad features directly
139+
work <- subset(work, Filter=='Selected')
140+
141+
} #1.2 (end of label-free SWATH experiment)
142+
143+
144+
145+
#Label-free experiments (Shotun; DDA)
146+
if(nlevels(work$LABEL)==1 & nlevels(work$TRANSITION)==1){ #1.2.2
147+
148+
#In DDA (shotgun) experiment, there is no need to filter out bad fragments in each peptide
149+
N.Prot <- nlevels(work$PROTEIN)
150+
151+
#Create the data frame storing the selected features of each Protein
152+
FeatureSelection.Out <- data.frame(Protein=vector(), Peptide=vector())
153+
Index.FS <- 0
154+
155+
###Rank the peptides for each protein
156+
for(i in 1:N.Prot){ #DDA_1.3
157+
158+
DDA.1 <- subset(work, PROTEIN == unique(PROTEIN)[i])
159+
N.Pep <- length(unique(DDA.1$PEPTIDE))
160+
161+
#Create a data frame storing the assessment of the noise for each peptide
162+
Pep.Out <- data.frame(Peptide=rep(NA, N.Pep), Model.Based.Error=rep(NA,N.Pep))
163+
164+
165+
##Assess the within-group variation for each peptide##
166+
167+
for(j in 1:N.Pep){ #DDA_1.4
168+
169+
DDA.Pep <- subset(DDA.1, PEPTIDE==unique(PEPTIDE)[j])
170+
LM.DDA <- lm(ABUNDANCE ~ GROUP, data=DDA.Pep)
171+
Pep.Out[j, 'Peptide'] <- as.character(unique(DDA.Pep$PEPTIDE))
172+
Pep.Out[j, 'Model.Based.Error'] <- summary(LM.DDA)$sigma
173+
174+
} #DDA_1.4 (End of loop for each peptide)
175+
176+
###We choose the top one-third of the peptides in DDA now. (Again, deciding the optimal number of the peptides we should choose takes too long. We are working on this.)
177+
N.Select <- max(round(length(unique(DDA.1$PEPTIDE))/3), 1)
178+
Pep.Out$Rank <- rank(Pep.Out$Model.Based.Error)
179+
Pep.Select <- Pep.Out[Pep.Out$Rank <= N.Select, 'Peptide']
180+
181+
###Pull out the selected peptides to the designated data frame
182+
FeatureSelection.Out[(Index.FS+1):(Index.FS+N.Select), 'Protein'] <- as.character(unique(DDA.1$PROTEIN))
183+
FeatureSelection.Out[(Index.FS+1):(Index.FS+N.Select), 'Peptide'] <- Pep.Select
184+
Index.FS <- Index.FS+N.Select
185+
} #DDA_1.3 (End of loop for proteins)
186+
187+
188+
###If not all of the peptides are proteotypic, some proteins may have the same peptides
189+
work$Protein_Peptide <- paste(work$PROTEIN, work$PEPTIDE, sep='_')
190+
FeatureSelection.Out$Protein_Peptide <- paste(FeatureSelection.Out$Protein, FeatureSelection.Out$Peptide, sep='_')
191+
192+
###Create the label for selected, or removed, peptides
193+
work$Filter <- NA;
194+
work[work$Protein_Peptide %in% unique(FeatureSelection.Out$Protein_Peptide), 'Filter'] <- 'Selected'
195+
work[!(work$Protein_Peptide %in% unique(FeatureSelection.Out$Protein_Peptide)), 'Filter'] <- 'Flagged'
196+
work$Protein_Peptide <- NULL
197+
work <- subset(work, Filter=='Selected')
198+
} #1.2.2 (End of the DDA experiment)
199+
200+
201+
202+
203+
204+
205+
206+
207+
#Label-based experiments
208+
if(nlevels(work$LABEL)==2){#1.2.3
209+
210+
#There is an issue for Quality control samples to deal with. Will update this part soon.
211+
212+
}#1.2.3
213+
214+
} #1.1
215+
216+
217+
218+
219+
#Choose the subset of features for removing the interference
220+
if(featureSelectionGoal='Interference'){ #2.1
221+
222+
223+
###Update this option later
224+
225+
226+
227+
} #2.1
228+
229+
230+
} #End of function 'FeatureSelection'

R/mainfunctions.R

+5-6
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,7 @@ dataProcess<-function(raw,logTrans=2, normalization="equalizeMedians",nameStanda
373373

374374
work$GROUP_ORIGINAL<-factor(work$GROUP_ORIGINAL)
375375
work$SUBJECT_ORIGINAL<-factor(work$SUBJECT_ORIGINAL,levels=unique(work$SUBJECT_ORIGINAL))
376+
work$LABEL <- unique(work$LABEL, levels=unique(work$LABEL))
376377

377378
work[work$LABEL=="L","GROUP"]<-work[work$LABEL=="L","GROUP_ORIGINAL"]
378379
work[work$LABEL=="L","SUBJECT"]<-work[work$LABEL=="L","SUBJECT_ORIGINAL"]
@@ -1691,19 +1692,17 @@ dataProcess<-function(raw,logTrans=2, normalization="equalizeMedians",nameStanda
16911692

16921693
processout<-rbind(processout,c("* Use feature selection algorithm in order to get high quality features."))
16931694
write.table(processout, file=finalfile, row.names=FALSE)
1694-
1695-
tempfeature<-try(.FeatureSelection1(work,lambda,eta, address),silent=TRUE)
16961695

1697-
if(class(tempfeature)=="try-error") {
1698-
message("*** error : can't select the best features. Now use all features.")
1696+
if(nlevels(work$LABEL)==2) {
1697+
message("*** error : High quality selection will accomodate the label-based experiment soon, but not now. Now use all features.")
16991698

1700-
processout<-rbind(processout,c(paste("error : can't select the best features. Now use all features.")))
1699+
processout<-rbind(processout,c(paste("error : High quality selection will accomodate the label-based experiment soon, but not now. Now use all features.")))
17011700
write.table(processout, file=finalfile, row.names=FALSE)
17021701

17031702
work<-work
17041703

17051704
}else{
1706-
work<-tempfeature
1705+
work<-FeatureSelection(work)
17071706
}
17081707
}
17091708

0 commit comments

Comments
 (0)