-
Notifications
You must be signed in to change notification settings - Fork 0
/
01_MutationDistribution.Rmd
336 lines (276 loc) · 11.9 KB
/
01_MutationDistribution.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
---
title: "01 Frequency of DDX3X mutations in BL"
author: "Joanna A. Krupka"
date: "01 July 2019"
output:
html_document:
theme: cosmo
code_folding: show
---
## Background
The aim of the analysis is to compute the frequency of DDX3X mutations across different studies
## Objectives
1. Compute the frequency of mutations in targeted sequencing of 39 BL patients
2. Compare the frequency of DDX3X mutation between already published studies
3. Examine the distribution of DDX3X mutations in different cancer types (COSMIC data)
4. Generate a lollipop plot showing the density of DDX3X mutation across conserved DDX3X domains
comparing BL and DLBCL
## Materials and methods
```{r eval = T, echo = T, message = F, warning = F, error = F}
library(tidyverse)
library(knitr)
library(readxl)
library(ggpubr)
library(AnnotationHub)
library(EnsDb.Hsapiens.v86)
library(trackViewer)
source("../utilis.R")
DXTX <- "ENST00000644876" # Reference DDX3X transcript
genes <- c("ENSG00000215301", "ENSG00000067048") # DDX3X and DDX3Y ID
BL_drivers <- c("MYC", "ID3", "TP53", "CCND3", "DDX3X", "ARID1A", "FOXO1",
"SMARCA4", "TET2", "TCF3", "BCL7A", "FBXO11", "GNA13", "PTEN",
"HIST1H1E")
# Results of the targeted panel sequencing
targeted.panel <- read_excel("../utilis/SupplTable1.xlsx",
sheet = 1, skip = 1)
# Mutations from other studies
#DXmut_integrated <-read_csv("data/01_Supplementary_Table.csv")
# DDX3X mutations from different BL and DLBCL studies
studies <- read_excel("../utilis/SupplTable1.xlsx",
sheet = 3, skip = 1)
```
## Analysis
### Targeted sequencing panel of 39 BL patients
```{r eval = T, echo = T, message = F, warning = F, error = F}
# Compute the frequency of mutation in each gene
targeted.panel.freq <- targeted.panel %>%
group_by(Gene) %>%
summarise(Freq = 100*length(unique(Pt_ID))/39) %>%
arrange(desc(Freq)) %>%
mutate(fill = Gene == "DDX3X",
Gene = factor(Gene, levels = BL_drivers)) %>%
dplyr::filter(Gene %in% BL_drivers)
# Plot the frequency of BL driver genes
ggplot(targeted.panel.freq, aes(x = Gene, y = Freq, fill = fill)) +
geom_bar(stat = "identity", width = 0.5) +
theme_pubr() +
scale_fill_manual(values = divergingPal[c(10,2)]) +
scale_y_continuous(limits = c(0,70), breaks = seq(0,70,by = 10), labels = seq(0,70,by = 10)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
ggsave("plots/Targeted_panel_BLdrivers.pdf", width = 7, height = 6)
```
### Distribution of DDX3X mutations in different cancer types
```{r eval = F, echo = F, message = F, warning = F, error = F}
# DDX3X muattions collected from COSMIC (accession date 02/07/2019)
ddx3x_cosmic <- read_csv("../utilis/cosmic_files/DDX3X_COSMIC_02072019.csv")
## ICGC
icgc_project <- read_csv("../utilis/icgc_files/icgc_COSMIC_names.csv")
## Functions: agregate icgc tables and filter on geneID
aggregateICGC <- function(toAggregate, filter_mut = F, genes = NULL){
require(tidyverse)
full_tab <- data.frame()
#Filter only
if(filter_mut){
for (i in 1:length(toAggregate)){
tab <- read_delim(toAggregate[i], delim = "\t") %>%
dplyr::filter(gene_affected %in% genes)
full_tab <- rbind(full_tab, tab)
}
} else {
for (i in 1:length(toAggregate)){
tab <- read_delim(toAggregate[i], delim = "\t")
full_tab <- rbind(full_tab, tab)
}
}
return(full_tab)
}
donorsTab <- aggregateICGC(list.files("../utilis/icgc_files", "donor", full.names = T)) %>%
group_by(project_code) %>%
mutate(COSMIC = icgc_project$COSMIC[match(project_code, icgc_project$project)])
mutTab <- aggregateICGC(list.files("icgc_files", "../utilis/somatic_mutation", full.names = T), T, genes)
save(donorsTab, mutTab, ddx3x_cosmic, file = "mutationsData.RData")
```
```{r eval = T, echo = T, message = F, warning = F, error = F}
# DDX3X Mutations tissue distribution
load("data/mutationsData.RData")
## ICGC (downloaded with run_icgcDownload.py)
ddx3x_icgc_filtered <- mutTab %>%
dplyr::filter(gene_affected == genes[1] &
!is.na(aa_mutation) &
consequence_type != "synonymous_variant") %>%
group_by(icgc_donor_id) %>%
mutate(COSMIC = donorsTab$COSMIC[match(icgc_donor_id, donorsTab$icgc_donor_id)],
Source = "ICGC",
chromosome = "X") %>%
ungroup() %>%
dplyr::select(Source, icgc_donor_id, COSMIC, chromosome,
chromosome_start, chromosome_end, aa_mutation,
cds_mutation, consequence_type)
## COSMIC
ddx3x_cosmic_filtered <- ddx3x_cosmic %>%
mutate(Source = "COSMIC",
chromosome = "X",
`AA Mutation` = splitvec(`AA Mutation`, "[.]", 2),
consequence_type = "Unknown",
Primary_tissue = gsub("NS", "Other", `Primary Tissue`)) %>%
separate(`Genomic Co-ordinates`, c(NA, "condensed"), sep = ":") %>%
separate(condensed, c("chromosome_start", NA, "chromosome_end"), sep = "[.]") %>%
dplyr::select(Source, `Sample ID`, Primary_tissue, chromosome,
chromosome_start, chromosome_end, `AA Mutation`,
`CDS Mutation`, consequence_type)
#Integrate
colnames <- c("Source", "Sample_ID", "Primary_tissue", "Chromosome",
"Chromosome_start", "Chromosome_End", "AA_Mutation",
"CDS_Mutation", "Consequence_init")
colnames(ddx3x_cosmic_filtered) <- colnames(ddx3x_icgc_filtered) <- colnames
ddx3x_integrated <- rbind(ddx3x_cosmic_filtered, ddx3x_icgc_filtered)
ddx3x_integrated <- ddx3x_integrated %>%
group_by(AA_Mutation) %>%
mutate(Consequence = case_when(
grepl("fs", AA_Mutation) ~ "Frameshift",
grepl("[*]", AA_Mutation) ~ "Nonsense",
grepl("del", AA_Mutation) ~ "Nonsense",
grepl("[?]", AA_Mutation) ~ "Unclassified",
TRUE ~ "Missense"))
#Reorder
orderOrigin <- tableOrder(ddx3x_integrated$Primary_tissue)
orderMut <- c("Missense", "Nonsense", "Frameshift", "Unclassified")
ddx3x_integrated <- ddx3x_integrated %>%
mutate(Primary_tissue = factor(Primary_tissue, levels = orderOrigin),
Consequence = factor(Consequence, levels = orderMut))
write_csv(ddx3x_integrated, "DDX3Xmut_COSMIC_ICGC_nonsynonymous.csv")
# Total number of mutations
palette <- divergingPal[c(11,10,9,8)]
ggplot(ddx3x_integrated, aes(x = Primary_tissue, y = ..count.., fill = Consequence)) +
geom_bar() +
nature_barplot() +
coord_flip() +
scale_fill_manual(values = palette) +
labs(x = "Primary tissue", y = "Count", fill = "Mutation type")
ggsave("plots/DX_mutDistibution_totalCount.pdf", height = 6, width = 10)
#Ratio
ggplot(ddx3x_integrated, aes(x = Primary_tissue, y = ..count.., fill = Consequence)) +
geom_bar(position = "fill") +
nature_barplot() +
coord_flip() +
scale_fill_manual(values = palette) +
labs(x = "Primary tissue", y = "Count", fill = "Mutation type")
ggsave("plots/DX_mutDistibution_ratio.pdf", height = 6, width = 10)
```
### Distribution of DDX3X mutations over functional domains and motifs
```{r eval = T, echo = T, message = F, warning = F, error = F}
# DDX3X mutations from BL papers
ddx3x_regions <- read_csv("data/DDX3X_regions.csv")
#Convert hg19 coordinates to hg38 coordinates
DXmut_hg19 <- studies %>%
dplyr::filter(Hg == "hg19")
hg19_mut <- GRanges("chrX", IRanges(start=DXmut_hg19$Start, width=1))
#Get Hg19 -> Hg38 annotation chain
ahub <- AnnotationHub()
ahub.chain <- subset(ahub, rdataclass == "ChainFile" & species == "Homo sapiens")
chain <- ahub.chain[ahub.chain$title == "hg19ToHg38.over.chain.gz"]
chain <- chain[[1]]
hg38_mut <- liftOver(hg19_mut, chain)
DXmut_hg38 <- DXmut_hg19 %>%
mutate(Hg = "hg38",
Start = as.numeric(start(hg38_mut))) %>%
full_join(studies) %>%
dplyr::filter(Hg == "hg38")
mutations_gr <- GRanges(seqnames = "X",
IRanges(start = DXmut_hg38$Start, width = 1))
# Map to protein coordinates
edbx <- ensembldb::filter(EnsDb.Hsapiens.v86, filter = ~ seq_name == "X")
gnm_pr <- genomeToProtein(mutations_gr, edbx)
exonsDf <- ddx3x_regions %>%
dplyr::filter(`function` == "exon") %>%
dplyr::select(name, cds_start, cds_end, genomic_start, genomic_end) %>%
gather(class, position, -name) %>%
as.data.frame() %>%
separate(class, c("level", "end")) %>%
spread(level, position)
PositionsG <- as.numeric(exonsDf$genomic)
PositionsC <- as.numeric(exonsDf$cds)
gnm_pr_unList <- unlist(gnm_pr)
gnm_pr_df <- as.data.frame(gnm_pr)
gnm_pr_df <- gnm_pr_df[,-9]
gnm_pr_df <- gnm_pr_df %>%
mutate(tx_id = elementMetadata(gnm_pr_unList)$tx_id,
cds_ok = elementMetadata(gnm_pr_unList)$cds_ok,
exon_id = elementMetadata(gnm_pr_unList)$exon_id,
exon_rank = elementMetadata(gnm_pr_unList)$exon_rank,
seq_start = elementMetadata(gnm_pr_unList)$seq_start) %>%
dplyr::select(-group_name) %>%
dplyr::filter(tx_id == "ENST00000629496" | is.na(exon_id)) %>%
mutate(mutStatus = DXmut_hg38$Type,
Disease = DXmut_hg38$Lymphoma,
Label = DXmut_hg38$AA_change,
CDS_start = as.numeric(`start`)) %>%
rownames_to_column("ID") %>%
group_by(`ID`) %>%
mutate(PositionCDS = case_when(`CDS_start` > 0 ~ `CDS_start`,
`CDS_start` < 0 ~ PositionsC[which(abs(`seq_start` - PositionsG) ==
min(abs(`seq_start` - `PositionsG`)))][1]))
# Full, summarized mutations table
gnm_pr_df_summary <- gnm_pr_df %>%
group_by(PositionCDS, mutStatus, Disease) %>%
summarize(Score = length(PositionCDS)) %>%
dplyr::filter(!is.na(mutStatus)) %>%
mutate(Label = case_when(Score > 1 ~ as.character(PositionCDS),
TRUE ~ ""))
# Lolipop plot
## Data
mut_loc <- GRanges("chrX", IRanges(start=gnm_pr_df_summary$PositionCDS, width=1))
mut_types <- factor(gnm_pr_df_summary$mutStatus)
# Mutations: colors
colors <- divergingPal[c(3,1,10,2,11)]
names(colors) <- levels(mut_types)
mut_loc$color <- colors[mut_types]
mut_loc$score <- gnm_pr_df_summary$Score*99
mut_loc$disease <- gnm_pr_df_summary$Disease
# Mutations: legend
legends <- list(list(labels=levels(mut_types), fill=colors))
BL_mut <- mut_loc[mut_loc$disease == "BL"]
BL_mut$SNPsideID <- "top"
DLBCL_mut <- mut_loc[mut_loc$disease == "DLBCL"]
DLBCL_mut$SNPsideID <- "bottom"
all_mut <- c(BL_mut, DLBCL_mut)
all_mut$lwd <- .2
all_mut$size <- .05
# DDX3X core: regions
core <- GRanges("chrX", IRanges(start=1, end=662))
core$fill <- "grey"
core$height <- 0.01
# DDX3X regioins & motifs
## Helicse domain
select <- ddx3x_regions$category == "domain"
helicase <- GRanges("chrX", IRanges(start=ddx3x_regions$cds_start[select],
end=ddx3x_regions$cds_end[select],
names = factor(ddx3x_regions$`function`[select])))
helicase$fill <- divergingPal[c(8,9)]
helicase$height <- 0.03
## Motifs
select <- ddx3x_regions$category == "motif"
motifs <- GRanges("chrX", IRanges(start=ddx3x_regions$cds_start[select],
end=ddx3x_regions$cds_end[select],
names = factor(ddx3x_regions$`function`[select])))
colors <- divergingPal[c(11,10)]
names(colors) <- unique(names(motifs))
motifs$fill <- colors[names(motifs)]
motifs$height <- 0.03
pdf('plots/Lolipop_BL_DLBCL_new.pdf', useDingbats =F)
lolliplot(list(all_mut)[[1]],
#features = c(core, helicase, motifs),
features = c(core, helicase),
xaxis = TRUE,
legend = legends,
yaxis=TRUE, cex=.4,
jitter = "label")
dev.off()
```
## Conclusions
1. The most frequently mutated genes found in the targeted sequencing panel of
39 Burkitt Lymphoma cases were MYC, ID3, TP53, CCND3, DDX3X, ARID1A, FOXO1 and SMARCA4
2. The frequency of DDX3X mutation was consistent with previous studies
3. Almost all mutations in BL and DLBCL patients were clustered in C-terminal helicase domain
4. In contrast to medulloblastoma, another cancer with frequent DDX3X mutations, lymphoma mutations
were frequently disruptive (nonsense or frameshift).