-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCRC_lipids_biocrates.R
320 lines (251 loc) · 14 KB
/
CRC_lipids_biocrates.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
# CRC lipids study
# 112 lipids in 3223 participants, no missing values
# Get data from CRC_prep_data_rev and remove unneeded objects
source("CRC_prep_data.R")
rm(list = ls(pattern = "1|2|bmi"))
# Join GRS data
snps <- read_dta("clrt_gwas_gecco_snps_GRS.dta")
#csnp <- inner_join(crc, snps, by = "Idepic")
crc1 <- left_join(crc, snps, by = "Idepic")
table(CT = csnp$Cncr_Caco_Clrt, lab = csnp$lab)
table(CT = csnp$Cncr_Caco_Clrt)
# Continuous models (needs imputation and log transformation)
# Remove non-matched subjects
#crc1 <- crc %>% group_by(Match_Caseset) %>% filter(n() == 2) %>% ungroup()
# Filter by fasting status. Fasting is 2 cf email Jelena 16 Oct 2018
table(crc$Fasting_C) # 1315, 656 controls 659 cases
crc <- crc %>% filter(Fasting_C == 2)
# Select the lipids only (112) and convert zeros to NA
mat1 <- crc %>% select(Acylcarn_C10:Acylcarn_C8, Glyceroph_Lysopc_A_C16_0:Sphingo_Sm_C26_1) %>% na_if(0)
mat2 <- colon %>% select(Acylcarn_C10:Acylcarn_C8, Glyceroph_Lysopc_A_C16_0:Sphingo_Sm_C26_1) %>% na_if(0)
mat3 <- prox %>% select(Acylcarn_C10:Acylcarn_C8, Glyceroph_Lysopc_A_C16_0:Sphingo_Sm_C26_1) %>% na_if(0)
mat4 <- dist %>% select(Acylcarn_C10:Acylcarn_C8, Glyceroph_Lysopc_A_C16_0:Sphingo_Sm_C26_1) %>% na_if(0)
mat5 <- rectal %>% select(Acylcarn_C10:Acylcarn_C8, Glyceroph_Lysopc_A_C16_0:Sphingo_Sm_C26_1) %>% na_if(0)
# Amino acids
mat1 <- crc %>% select(contains("Aminoacid_")) %>% na_if(0)
mat1a <- mat1 %>% select_if(~ sum(is.na(.)) < 1000)
# Check for missings
hist(colSums(is.na(mat1)), breaks = 30)
library(zoo)
scalemat1 <- mat1 %>% na.aggregate(FUN = function(x) min(x)/2) %>% log2 %>% scale
scalemat2 <- mat2 %>% na.aggregate(FUN = function(x) min(x)/2) %>% log2 %>% scale
scalemat3 <- mat3 %>% na.aggregate(FUN = function(x) min(x)/2) %>% log2 %>% scale
scalemat4 <- mat4 %>% na.aggregate(FUN = function(x) min(x)/2) %>% log2 %>% scale
scalemat5 <- mat5 %>% na.aggregate(FUN = function(x) min(x)/2) %>% log2 %>% scale
library(Amelia)
missmap(as_tibble(scalemat1), rank.order = F)
heatmap.2(scalemat1, trace = "none", col = colpalette1)
library(corrplot)
cormat <- cor(scalemat1)
corrplot(cormat, method = "color", order = "hclust", tl.col = "black", tl.cex = 0.7)
### Continuous models per SD increase
# Define function to apply across quartiles (already matched by lab)
library(survival)
multiclr <- function(x, dat) {
clogit(Cncr_Caco_Clrt ~ x + Bmi_C + Qe_Energy + L_School + Smoke_Stat + Smoke_Int + Height_C +
Qe_Alc + Qge0701 + Qge0704 + strata(Match_Caseset), data = dat)
}
library(broom)
# Apply models by subsite (use 2nd fn parameter as an option)
# CRC
mods1 <- apply(scalemat1, 2, multiclr, dat = crc) %>%
map_df( ~ tidy(., exponentiate = T, conf.int = T)) %>%
filter(grepl("x", term)) %>% mutate(p.adj = p.adjust(p.value, method = "fdr"))
# Add compound names and col for annotated lipids
mods1a <- mods1 %>%
mutate(compound = colnames(mat1), cmpd.lab = ifelse(-log10(p.value) > 1.8, compound, NA))
pFDR <- mods1 %>% filter(p.adj <= 0.05) %>% select(p.value) %>% max
# Fasting (nothing under PFDR)
mods1a <- mods1 %>%
mutate(compound = colnames(mat1), cmpd.lab = ifelse(-log10(p.value) > 1.3, compound, NA))
# Colon, prox colon, dist colon, rectal
mods2 <- apply(scalemat2, 2, multiclr, dat = colon) %>%
map_df( ~ tidy(., exponentiate = T, conf.int = T)) %>%
filter(grepl("x", term)) %>% mutate(p.adj = p.adjust(p.value, method = "fdr"))
mods3 <- apply(scalemat3, 2, multiclr, dat = prox) %>% map_df( ~ tidy(., exponentiate = T)) %>%
filter(grepl("x", term)) %>% mutate(p.adj = p.adjust(p.value, method = "fdr"))
mods4 <- apply(scalemat4, 2, multiclr, dat = dist) %>% map_df( ~ tidy(., exponentiate = T)) %>%
filter(grepl("x", term)) %>% mutate(p.adj = p.adjust(p.value, method = "fdr"))
mods5 <- apply(scalemat5, 2, multiclr, dat = rectal) %>% map_df( ~ tidy(., exponentiate = T)) %>%
filter(grepl("x", term)) %>% mutate(p.adj = p.adjust(p.value, method = "fdr"))
#crccon <- map_df(mods, ~ tidy(., exponentiate = T)) %>% filter(grepl("x", term)) %>%
# mutate_if(is.numeric, ~round(., 4)) %>% mutate(p.adj = p.adjust(p.value, method = "fdr")) %>%
# unite(OR.CI, estimate, conf.low, conf.high, sep = "-") %>% cbind(cmpd = colnames(scalemat))
### Categorical associations (doesn't need imputation or scaling)
# First need to run CRC_data_prep to get crc and crc3 objects
library(ggplot)
library(broom)
# Biocrates compounds. Split into quartiles with cut_number
mat6 <- mat1 %>% mutate_all(~cut_number(., n = 4, labels = 1:4))
mat7 <- mat2 %>% mutate_all(~cut_number(., n = 4, labels = 1:4))
mat8 <- mat3 %>% mutate_all(~cut_number(., n = 4, labels = 1:4))
mat9 <- mat4 %>% mutate_all(~cut_number(., n = 4, labels = 1:4))
mat10 <- mat5 %>% mutate_all(~cut_number(., n = 4, labels = 1:4))
# All subjects
fits1 <- apply(mat6, 2, multiclr, dat = crc) %>% map_df( ~tidy(., exponentiate = T)) %>%
filter(grepl("x4", term)) %>% mutate(p.adj = p.adjust(p.value, method = "fdr"))
fits1a <- fits1 %>% mutate(compound = colnames(mat1),
cmpd.lab = ifelse(-log10(p.value) > 1.4, compound, NA))
# Fasting only
fits1a <- fits1 %>% mutate(compound = colnames(mat1),
cmpd.lab = ifelse(-log10(p.value) > 1.3, compound, NA))
pFDR1 <- fits1a %>% filter(p.adj <= 0.05) %>% select(p.value) %>% max
#%>%
#mutate_if(is.numeric, ~round(., 3)) #%>%
#unite(OR.CI, estimate, conf.low, conf.high, sep = "-") %>%
#cbind(cmpd = colnames(df3)) %>% group_by(cmpd) %>% filter(min(p.value) < 0.05)
fits2 <- apply(mat7, 2, multiclr, dat = colon) %>% map_df( ~tidy(., exponentiate = T)) %>%
filter(grepl("x4", term)) %>% mutate(p.adj = p.adjust(p.value, method = "fdr")) %>%
mutate_if(is.numeric, ~round(., 3))
fits3 <- apply(mat8, 2, multiclr, dat = prox) %>% map_df( ~tidy(., exponentiate = T)) %>%
filter(grepl("x4", term)) %>% mutate(p.adj = p.adjust(p.value, method = "fdr")) %>%
mutate_if(is.numeric, ~round(., 3))
fits4 <- apply(mat9, 2, multiclr, dat = dist) %>% map_df( ~tidy(., exponentiate = T)) %>%
filter(grepl("x4", term)) %>% mutate(p.adj = p.adjust(p.value, method = "fdr")) %>%
mutate_if(is.numeric, ~round(., 3))
fits5 <- apply(mat10, 2, multiclr, dat = rectal) %>% map_df( ~tidy(., exponentiate = T)) %>%
filter(grepl("x4", term)) %>% mutate(p.adj = p.adjust(p.value, method = "fdr")) %>%
mutate_if(is.numeric, ~round(., 3))
# Smile plots for manuscript. Build plot from background to foreground. Colorectal:
library(ggrepel)
ggplot(mods1a, aes(x = estimate, y = -log10(p.value))) +
geom_vline(xintercept = 1, colour = "grey60") +
geom_hline(yintercept = -log10(0.05), size = 0.2, colour = "grey60") +
#geom_hline(yintercept = 3.7, size = 0.2, colour = "grey60") +
geom_text_repel(aes(label = cmpd.lab), size = 3, segment.color = "grey") +
geom_point(shape = 21, fill = "dodgerblue") +
theme_bw() + #ggtitle("Continuous models") +
ggtitle("Continuous models, fasted") +
xlab("OR per SD increase conc") +
#annotate("text", x = 0.98, y=c(1.2, 3.6), size = 3, hjust = 0,
#label = c("Raw P threshold", "FDR P threshold")) +
annotate("text", x = 1.05, y=1.35, size = 3, hjust = 0, label = "Raw P threshold") +
theme(panel.grid.major = element_blank())
# Subsites:
all <- bind_rows(colon = mods2, prox = mods3, dist = mods4, rectal = mods5, .id = "subsite") %>%
bind_cols(compound = rep(colnames(mat1), 4)) %>%
mutate(cmpd.lab = ifelse(-log10(p.value) > 1.7, compound, NA))
labs <- c(Colon = "colon", `Distal colon` = "dist", `Proximal colon` = "prox", Rectum = "rectal")
ggplot(all, aes(x = estimate, y = -log10(p.value))) + geom_point(shape = 1) +
facet_wrap(subsite ~ .) + theme_bw() + geom_vline(xintercept = 1, colour = "grey60") +
geom_hline(yintercept = -log10(0.05), size = 0.2, colour = "grey60") +
theme(panel.grid.major = element_blank()) +
geom_text(aes(label = cmpd.lab), size = 3, vjust = 1) +
xlab("OR per SD increase")
# Categorical
ggplot(fits1a, aes(x = estimate, y = -log10(p.value))) +
geom_vline(xintercept = 1, colour = "grey60") +
geom_hline(yintercept = -log10(0.05), size = 0.2, colour = "grey60") +
#geom_hline(yintercept = 3.8, size = 0.2, colour = "grey60") +
geom_text_repel(aes(label = cmpd.lab), size = 3, segment.color = "grey") +
geom_point(shape = 21, fill = "limegreen") +
theme_bw() + #ggtitle("Categorical models") +
ggtitle("Categorical models, fasted") +
xlab("OR for Q4 vs Q1 conc") +
#annotate("text", x = 1.02, y=c(1.2, 3.7), size = 3, hjust = 0,
#label = c("Raw P threshold", "FDR P threshold")) +
annotate("text", x = 1.02, y= 1.35, size = 3, hjust = 0, label = "Raw P threshold") +
theme(panel.grid.major = element_blank())
### Polygenic risk scores
snps <- read_dta("clrt_gwas_gecco_snps_GRS.dta")
csnp <- inner_join(crc, snps, by = "Idepic")
table(CT = csnp$Cncr_Caco_Clrt, lab = csnp$lab)
table(CT = csnp$Cncr_Caco_Clrt)
plot(csnp$GRS)
csnp$GRScat <- cut_number(csnp$GRS, n = 4, labels = 1:4)
table(CT = csnp$Cncr_Caco_Clrt, Q = csnp$GRScat)
chisq.test(csnp$Cncr_Caco_Clrt, csnp$GRScat)
ggplot(csnp, aes(x = GRS, group = as.factor(Cncr_Caco_Clrt))) +
geom_density(aes(fill = as.factor(Cncr_Caco_Clrt)), alpha = 0.4)
# Metabolite associations with covariates (idea from Jelena thesis p189)
# Subset control matrix
ints <- log2(mat1[crc$Cncr_Caco_Clrt == 0, ])
meta1 <- crc1[crc1$Cncr_Caco_Clrt == 0, ]
# Different colour palettes for plot
library(RColorBrewer)
set2 <- rep(brewer.pal(8, "Set2"), each = 112, length.out = nrow(all))
lm1 <- function(x) lm(as.numeric(Fasting_C) ~ x + Sex + lab, data = meta1)
lm2 <- function(x) lm(as.numeric(Sex) ~ x + Fasting_C + lab, data = meta1)
lm3 <- function(x) lm(as.numeric(lab) ~ x + Fasting_C + Sex, data = meta1)
lm4 <- function(x) lm(Qe_Alc ~ x + Fasting_C + Sex + lab, data = meta1)
lm5 <- function(x) lm(as.numeric(Smoke_Stat) ~ x + Fasting_C + Sex + lab, data = meta1)
lm6 <- function(x) lm(as.numeric(Smoke_Int) ~ x + Fasting_C + Sex + lab, data = meta1)
lm7 <- function(x) lm(Age_Blood ~ x + Bmi_C + Fasting_C + Sex + lab, data = meta1)
lm8 <- function(x) lm(Bmi_C ~ x + Fasting_C + Sex + lab, data = meta1)
lm9 <- function(x) lm(Waist_C ~ x + Fasting_C + Sex + lab, data = meta1)
lm10 <- function(x) lm(Qge0701 ~ x + Fasting_C + Sex + lab, data = meta1)
lm11 <- function(x) lm(Qge0704 ~ x + Fasting_C + Sex + lab, data = meta1)
lm12 <- function(x) lm(GRS ~ x + Fasting_C + Sex + lab, data = meta1)
library(broom)
fits1 <- apply(ints, 2, lm1) %>% map_df(tidy) #%>% filter(str_detect(term, "x"))
fits2 <- apply(ints, 2, lm2) %>% map_df(tidy)
fits3 <- apply(ints, 2, lm3) %>% map_df(tidy)
fits4 <- apply(ints, 2, lm4) %>% map_df(tidy)
fits5 <- apply(ints, 2, lm5) %>% map_df(tidy)
fits6 <- apply(ints, 2, lm6) %>% map_df(tidy)
fits7 <- apply(ints, 2, lm7) %>% map_df(tidy)
fits8 <- apply(ints, 2, lm8) %>% map_df(tidy)
fits9 <- apply(ints, 2, lm9) %>% map_df(tidy)
fits10 <- apply(ints, 2, lm10) %>% map_df(tidy)
fits11 <- apply(ints, 2, lm11) %>% map_df(tidy)
fits12 <- apply(ints, 2, lm12) %>% map_df(tidy)
# Bind covariates of interest (1-3 are just for adjustment)
all <- bind_rows(fits4, fits5, fits6, fits7, fits8, fits9, fits10, fits11, fits12) %>%
filter(term == "x") %>% mutate(p.adj = p.adjust(p.value, method = "fdr")) %>%
bind_cols(cmpd = rep(colnames(ints), 9))
# Get FDR p-value cutpoint
fdrP <- all %>% filter(p.adj < 0.05) %>% select(p.value) %>% max
# x axis width
x <- 1:nrow(all)
# draw empty plot
plot(NULL, xlim=c(0, nrow(all)), ylim=c(0, max(-log10(all$p.value))), xaxt='n',
ylab='-log10(p-value)', xlab='')
points(x, -log10(all$p.value), pch=19, col=set2, cex = 0.6)
# Make and plot axis labels
labs <- c("Alc", "Smoke St", "Smoke Int", "Age", "BMI", "Waist", "Red meat", "Proc meat",
"Genetic risk")
midpoint <- ncol(mat1)/2
at.labs <- seq(midpoint, 2*midpoint*length(labs), 2*midpoint)
axis(1, at = at.labs, labels = labs, las=1, cex.axis = 0.7)
# Add annotations
all1 <- all %>% mutate(p.low = ifelse(-log10(p.value) > 20, p.value, NA))
text(1:nrow(all), -log10(all1$p.low), all$cmpd, pos = 4, cex = 0.7)
title("Manhattan risk factors 1")
# Add p thresholds
abline(h = -log(0.05), col = "grey")
abline(h = -log(fdrP), col = "red")
# 2nd group: iron, crp, il8, TNFa, adiponectin,leptin, igf1, C-peptide
# Bind covariates of interest (1-3 are just for adjustment)
lm5 <- function(x) lm(Iron ~ x + Fasting_C + Sex + lab, data = meta1)
lm6 <- function(x) lm(Crp_CLRT_05 ~ x + Fasting_C + Sex + lab, data = meta1)
lm7 <- function(x) lm(Il8_CLRT_03 ~ x + Fasting_C + Sex + lab, data = meta1)
lm8 <- function(x) lm(Tnfa_CLRT_03 ~ x + Fasting_C + Sex + lab, data = meta1)
lm9 <- function(x) lm(Adipo_CLRT_04 ~ x + Fasting_C + Sex + lab, data = meta1)
lm10 <- function(x) lm(Leptin_CLRT_04 ~ x + Fasting_C + Sex + lab, data = meta1)
lm11 <- function(x) lm(Igf1_CLRT_02 ~ x + Fasting_C + Sex + lab, data = meta1)
lm12 <- function(x) lm(Cpeptide_CLRT_02 ~ x + Fasting_C + Sex + lab, data = meta1)
fits5 <- apply(ints, 2, lm5) %>% map_df(tidy)
fits6 <- apply(ints, 2, lm6) %>% map_df(tidy)
fits7 <- apply(ints, 2, lm7) %>% map_df(tidy)
fits8 <- apply(ints, 2, lm8) %>% map_df(tidy)
fits9 <- apply(ints, 2, lm9) %>% map_df(tidy)
fits10 <- apply(ints, 2, lm10) %>% map_df(tidy)
fits11 <- apply(ints, 2, lm11) %>% map_df(tidy)
fits12 <- apply(ints, 2, lm12) %>% map_df(tidy)
# Bind covariates of interest (1-3 are just for adjustment)
all <- bind_rows(fits4, fits5, fits6, fits7, fits8, fits9, fits10, fits11, fits12) %>%
filter(term == "x") %>% mutate(p.adj = p.adjust(p.value, method = "fdr")) %>%
bind_cols(cmpd = rep(colnames(ints), 9))
# draw empty plot
plot(NULL, xlim=c(0, nrow(all)), ylim=c(0, max(-log10(all$p.value))), xaxt='n',
ylab='-log10(p-value)', xlab='')
points(x, -log10(all$p.value), pch=19, col=set2, cex = 0.6)
labs2 <- c("Alc", "iron", "crp", "il8", "TNFa", "adiponectin","leptin", "igf1", "C-peptide")
# Create plot
axis(1, at = at.labs, labels = labs2, las=1, cex.axis = 0.7)
all1 <- all %>% mutate(p.low = ifelse(-log10(p.value) > 15, p.value, NA))
text(1:nrow(all), -log10(all1$p.low), all$cmpd, pos = 4, cex = 0.7)
title("Manhattan risk factors 2")
# Add p thresholds
abline(h = -log(0.05), col = "grey")
abline(h = -log(fdrP), col = "red")