1
+
2
+ # ##########Function of FeatureSelection###########
3
+
4
+ FeatureSelection <- function (work , featureSelectionGoal = ' TPR.TNR' ){
5
+
6
+ if (featureSelectionGoal = ' TPR.TNR' ){ # 1.1
7
+
8
+ # Label-free experiments (SWATH)
9
+ if (nlevels(work $ LABEL )== 1 & length(unique(work $ TRANSITION ))> 1 ){ # 1.2
10
+
11
+ N.Prot <- nlevels(work $ PROTEIN )
12
+
13
+ # Create the data frame storing the selected features of each Protein
14
+ FeatureSelection.Out <- data.frame (Protein = vector(), Peptide = vector(), Feature = vector())
15
+ Index.FS <- 0
16
+
17
+ # Conduct the feature selection for each protein
18
+ for (i in 1 : N.Prot ){ # 1.3
19
+
20
+ tem <- subset(work , PROTEIN == unique(PROTEIN )[i ])
21
+ N.Pep <- length(unique(tem $ PEPTIDE ))
22
+ # Create a data frame storing the assessment of the noise for each peptide
23
+ Pep.Out <- data.frame (Peptide = rep(NA , N.Pep ), Model.Based.Error = rep(NA ,N.Pep ))
24
+
25
+ # Also, create the data frames storing the retained features in each peptide
26
+ Retained.Feature <- data.frame (Peptide = vector(), Feature = vector()); Index.f <- 0
27
+
28
+ # In each peptide, select the max(Top 3, Top 40%) of the features based on the within-group variation
29
+ # Then within-group variation is calculated, after blocking the features, for each peptide
30
+ for (j in 1 : N.Pep ){ # 1.4
31
+
32
+ tem2 <- subset(tem , PEPTIDE == unique(PEPTIDE )[j ])
33
+ tem2 $ FEATURE <- factor (tem2 $ FEATURE , levels = unique(tem2 $ FEATURE ))
34
+ # Determine how many features to pick in this peptide
35
+ N.Feature <- length(unique(tem2 $ FEATURE )); Top40 <- round(N.Feature * 0.4 )
36
+ N.Select <- max(3 , Top40 )
37
+ # Create a data frame storing the variance(error) for each feature
38
+ F .Out <- data.frame (Feature = rep(NA , N.Feature ), Error = rep(NA , N.Feature ))
39
+
40
+ # The peptide with less than 3 fragments will not be considered.
41
+ if (N.Feature > = N.Select ){ # 1.4.1
42
+ # The features are ranked by the within-group variation
43
+ for (k in 1 : N.Feature ){ # 1.5
44
+
45
+ tem.f <- subset(tem2 , FEATURE == unique(FEATURE )[k ])
46
+ LM.f <- lm(ABUNDANCE ~ GROUP , data = tem.f )
47
+ F .Out [k , ' Feature' ] <- as.character(unique(tem.f $ FEATURE ))
48
+ F .Out [k , ' Error' ] <- summary(LM.f )$ sigma
49
+
50
+ } # 1.5
51
+
52
+ F .Out $ Rank <- rank(F .Out $ Error )
53
+
54
+ # We work on the max(Top3, Top40) features of this peptide
55
+ Keep.Feature <- F .Out [F .Out $ Rank < = N.Select ,' Feature' ]
56
+ tem2.2 <- subset(tem2 , FEATURE %in% Keep.Feature )
57
+
58
+ # Compute the with-group variation, after blocking for features, of this peptide
59
+ tem2.2 $ FEATURE <- factor (tem2.2 $ FEATURE , levels = unique(tem2.2 $ FEATURE ))
60
+ LM.Pep <- lm(ABUNDANCE ~ FEATURE + GROUP , data = tem2.2 )
61
+
62
+ Pep.Out [j , ' Peptide' ] <- as.character(unique(tem2.2 $ PEPTIDE ))
63
+ Pep.Out [j , ' Model.Based.Error' ] <- summary(LM.Pep )$ sigma
64
+
65
+ # Store which fragments were retained in this peptide
66
+ Retained.Feature [(Index.f + 1 ): (Index.f + N.Select ), ' Peptide' ] <- as.character(unique(tem2 $ PEPTIDE ))
67
+ Retained.Feature [(Index.f + 1 ): (Index.f + N.Select ), ' Feature' ] <- Keep.Feature
68
+
69
+ # Update the current index at 'Retained.Feature'
70
+ Index.f <- Index.f + N.Select
71
+
72
+ } else {
73
+
74
+ # The case of peptides with less than 3 features
75
+ if (N.Feature > 1 ){ # 1.4.2
76
+
77
+ LM.Pep2 <- lm(ABUNDANCE ~ FEATURE + GROUP , data = tem2 )
78
+ Pep.Out [j , ' Peptide' ] <- as.character(unique(tem2 $ PEPTIDE ))
79
+ # Penalize the peptides having less than 3 features
80
+ Pep.Out [j , ' Model.Based.Error' ] <- (1.5 )* summary(LM.Pep2 )$ sigma
81
+
82
+ # Store which fragments were retained in this peptide
83
+ Retained.Feature [(Index.f + 1 ): (Index.f + N.Feature ), ' Peptide' ] <- as.character(unique(tem2 $ PEPTIDE ))
84
+ Retained.Feature [(Index.f + 1 ): (Index.f + N.Feature ), ' Feature' ] <- as.character(unique(tem2 $ FEATURE ))
85
+
86
+ # Update the current index at 'Retained.Feature'
87
+ Index.f <- Index.f + N.Feature
88
+
89
+ } else {
90
+
91
+ LM.Pep2 <- lm(ABUNDANCE ~ GROUP , data = tem2 )
92
+ Pep.Out [j , ' Peptide' ] <- as.character(unique(tem2 $ PEPTIDE ))
93
+ Pep.Out [j , ' Model.Based.Error' ] <- 2 * summary(LM.Pep2 )$ sigma
94
+
95
+ # Store which fragments were retained in this peptide
96
+ Retained.Feature [(Index.f + 1 ): (Index.f + N.Feature ), ' Peptide' ] <- as.character(unique(tem2 $ PEPTIDE ))
97
+ Retained.Feature [(Index.f + 1 ): (Index.f + N.Feature ), ' Feature' ] <- as.character(unique(tem2 $ FEATURE ))
98
+
99
+ # Update the current index at 'Retained.Feature'
100
+ Index.f <- Index.f + N.Feature
101
+
102
+
103
+ } # 1.4.2
104
+
105
+ warning(paste(' The Peptide' , unique(tem2 $ PEPTIDE ), ' is likely to be removed since it has less than 3 fragments. This is our rule imposed in the SWATH experiment.' , sep = ' ' ))
106
+
107
+ } # 1.4.1
108
+
109
+ } # 1.4
110
+
111
+
112
+ # ##Select the best one-fourth of the peptide now. (Deciding the optimal number of the peptides chosen takes too much time, but we are working on reducing its computation time.)
113
+ N.Select.Pep <- max(round(dim(Pep.Out )[1 ]/ 4 ), 1 )
114
+ Pep.Out $ Rank <- rank(Pep.Out $ Model.Based.Error )
115
+ Selected.Pep <- Pep.Out [Pep.Out $ Rank < = N.Select.Pep , ' Peptide' ]
116
+ # ##Pull out the selected features in this peptide
117
+ Selected.Feature <- subset(Retained.Feature , Peptide %in% Selected.Pep )
118
+
119
+ # ##Put the selected features (peptides) of this proteins into the desinated data frame
120
+ N.F <- dim(Selected.Feature )[1 ]
121
+ FeatureSelection.Out [(Index.FS + 1 ): (Index.FS + N.F ), ' Protein' ] <- as.character(unique(tem $ PROTEIN ))
122
+ FeatureSelection.Out [(Index.FS + 1 ): (Index.FS + N.F ), ' Peptide' ] <- Selected.Feature $ Peptide
123
+ FeatureSelection.Out [(Index.FS + 1 ): (Index.FS + N.F ), ' Feature' ] <- Selected.Feature $ Feature
124
+
125
+ Index.FS <- Index.FS + N.F
126
+ } # 1.3 (end of loop for proteins)
127
+
128
+ # ##If not all of the peptides are proteotypic, some proteins may have the same peptides
129
+ work $ Protein_Feature <- paste(work $ PROTEIN , work $ FEATURE , sep = ' _' )
130
+ FeatureSelection.Out $ Protein_Feature <- paste(FeatureSelection.Out $ Protein , FeatureSelection.Out $ Feature , sep = ' _' )
131
+
132
+ # ##Create the label for selected, or removed, features
133
+ work $ Filter <- NA ;
134
+ work [work $ Protein_Feature %in% unique(FeatureSelection.Out $ Protein_Feature ), ' Filter' ] <- ' Selected'
135
+ work [! (work $ Protein_Feature %in% unique(FeatureSelection.Out $ Protein_Feature )), ' Filter' ] <- ' Flagged'
136
+ work $ Protein_Feature <- NULL
137
+
138
+ # ##If we want to remove the bad features directly
139
+ work <- subset(work , Filter == ' Selected' )
140
+
141
+ } # 1.2 (end of label-free SWATH experiment)
142
+
143
+
144
+
145
+ # Label-free experiments (Shotun; DDA)
146
+ if (nlevels(work $ LABEL )== 1 & nlevels(work $ TRANSITION )== 1 ){ # 1.2.2
147
+
148
+ # In DDA (shotgun) experiment, there is no need to filter out bad fragments in each peptide
149
+ N.Prot <- nlevels(work $ PROTEIN )
150
+
151
+ # Create the data frame storing the selected features of each Protein
152
+ FeatureSelection.Out <- data.frame (Protein = vector(), Peptide = vector())
153
+ Index.FS <- 0
154
+
155
+ # ##Rank the peptides for each protein
156
+ for (i in 1 : N.Prot ){ # DDA_1.3
157
+
158
+ DDA.1 <- subset(work , PROTEIN == unique(PROTEIN )[i ])
159
+ N.Pep <- length(unique(DDA.1 $ PEPTIDE ))
160
+
161
+ # Create a data frame storing the assessment of the noise for each peptide
162
+ Pep.Out <- data.frame (Peptide = rep(NA , N.Pep ), Model.Based.Error = rep(NA ,N.Pep ))
163
+
164
+
165
+ # #Assess the within-group variation for each peptide##
166
+
167
+ for (j in 1 : N.Pep ){ # DDA_1.4
168
+
169
+ DDA.Pep <- subset(DDA.1 , PEPTIDE == unique(PEPTIDE )[j ])
170
+ LM.DDA <- lm(ABUNDANCE ~ GROUP , data = DDA.Pep )
171
+ Pep.Out [j , ' Peptide' ] <- as.character(unique(DDA.Pep $ PEPTIDE ))
172
+ Pep.Out [j , ' Model.Based.Error' ] <- summary(LM.DDA )$ sigma
173
+
174
+ } # DDA_1.4 (End of loop for each peptide)
175
+
176
+ # ##We choose the top one-third of the peptides in DDA now. (Again, deciding the optimal number of the peptides we should choose takes too long. We are working on this.)
177
+ N.Select <- max(round(length(unique(DDA.1 $ PEPTIDE ))/ 3 ), 1 )
178
+ Pep.Out $ Rank <- rank(Pep.Out $ Model.Based.Error )
179
+ Pep.Select <- Pep.Out [Pep.Out $ Rank < = N.Select , ' Peptide' ]
180
+
181
+ # ##Pull out the selected peptides to the designated data frame
182
+ FeatureSelection.Out [(Index.FS + 1 ): (Index.FS + N.Select ), ' Protein' ] <- as.character(unique(DDA.1 $ PROTEIN ))
183
+ FeatureSelection.Out [(Index.FS + 1 ): (Index.FS + N.Select ), ' Peptide' ] <- Pep.Select
184
+ Index.FS <- Index.FS + N.Select
185
+ } # DDA_1.3 (End of loop for proteins)
186
+
187
+
188
+ # ##If not all of the peptides are proteotypic, some proteins may have the same peptides
189
+ work $ Protein_Peptide <- paste(work $ PROTEIN , work $ PEPTIDE , sep = ' _' )
190
+ FeatureSelection.Out $ Protein_Peptide <- paste(FeatureSelection.Out $ Protein , FeatureSelection.Out $ Peptide , sep = ' _' )
191
+
192
+ # ##Create the label for selected, or removed, peptides
193
+ work $ Filter <- NA ;
194
+ work [work $ Protein_Peptide %in% unique(FeatureSelection.Out $ Protein_Peptide ), ' Filter' ] <- ' Selected'
195
+ work [! (work $ Protein_Peptide %in% unique(FeatureSelection.Out $ Protein_Peptide )), ' Filter' ] <- ' Flagged'
196
+ work $ Protein_Peptide <- NULL
197
+ work <- subset(work , Filter == ' Selected' )
198
+ } # 1.2.2 (End of the DDA experiment)
199
+
200
+
201
+
202
+
203
+
204
+
205
+
206
+
207
+ # Label-based experiments
208
+ if (nlevels(work $ LABEL )== 2 ){# 1.2.3
209
+
210
+ # There is an issue for Quality control samples to deal with. Will update this part soon.
211
+
212
+ }# 1.2.3
213
+
214
+ } # 1.1
215
+
216
+
217
+
218
+
219
+ # Choose the subset of features for removing the interference
220
+ if (featureSelectionGoal = ' Interference' ){ # 2.1
221
+
222
+
223
+ # ##Update this option later
224
+
225
+
226
+
227
+ } # 2.1
228
+
229
+
230
+ } # End of function 'FeatureSelection'
0 commit comments