forked from thelovelab/DESeq2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplots.R
419 lines (381 loc) · 14.9 KB
/
plots.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
plotDispEsts.DESeqDataSet <- function( object, ymin, CV=FALSE,
genecol = "black", fitcol = "red", finalcol = "dodgerblue",
legend=TRUE, xlab, ylab, log = "xy", cex = 0.45, ... )
{
if (missing(xlab)) xlab <- "mean of normalized counts"
if (missing(ylab)) {
if (CV) {
ylab <- "coefficient of variation"
} else {
ylab <- "dispersion"
}
}
px = mcols(object)$baseMean
sel = (px>0)
px = px[sel]
# transformation of dispersion into CV or not
f <- if (CV) sqrt else I
py = f(mcols(object)$dispGeneEst[sel])
if(missing(ymin))
ymin = 10^floor(log10(min(py[py>0], na.rm=TRUE))-0.1)
plot(px, pmax(py, ymin), xlab=xlab, ylab=ylab,
log=log, pch=ifelse(py<ymin, 6, 20), col=genecol, cex=cex, ... )
# use a circle over outliers
pchOutlier <- ifelse(mcols(object)$dispOutlier[sel],1,16)
cexOutlier <- ifelse(mcols(object)$dispOutlier[sel],2*cex,cex)
lwdOutlier <- ifelse(mcols(object)$dispOutlier[sel],2,1)
if (!is.null(dispersions(object))) {
points(px, f(dispersions(object)[sel]), col=finalcol, cex=cexOutlier,
pch=pchOutlier, lwd=lwdOutlier)
}
if (!is.null(mcols(object)$dispFit)) {
points(px, f(mcols(object)$dispFit[sel]), col=fitcol, cex=cex, pch=16)
}
if (legend) {
legend("bottomright",c("gene-est","fitted","final"),pch=16,
col=c(genecol,fitcol,finalcol),bg="white")
}
}
#' Plot dispersion estimates
#'
#' A simple helper function that plots the per-gene dispersion
#' estimates together with the fitted mean-dispersion relationship.
#'
#' @docType methods
#' @name plotDispEsts
#' @rdname plotDispEsts
#' @aliases plotDispEsts plotDispEsts,DESeqDataSet-method
#'
#' @param object a DESeqDataSet, with dispersions estimated
#' @param ymin the lower bound for points on the plot, points beyond this
#' are drawn as triangles at ymin
#' @param CV logical, whether to plot the asymptotic or biological
#' coefficient of variation (the square root of dispersion) on the y-axis.
#' As the mean grows to infinity, the square root of dispersion gives
#' the coefficient of variation for the counts. Default is \code{FALSE},
#' plotting dispersion.
#' @param genecol the color for gene-wise dispersion estimates
#' @param fitcol the color of the fitted estimates
#' @param finalcol the color of the final estimates used for testing
#' @param legend logical, whether to draw a legend
#' @param xlab xlab
#' @param ylab ylab
#' @param log log
#' @param cex cex
#' @param ... further arguments to \code{plot}
#'
#' @author Simon Anders
#'
#' @examples
#'
#' dds <- makeExampleDESeqDataSet()
#' dds <- estimateSizeFactors(dds)
#' dds <- estimateDispersions(dds)
#' plotDispEsts(dds)
#'
#' @export
setMethod("plotDispEsts", signature(object="DESeqDataSet"), plotDispEsts.DESeqDataSet)
plotMA.DESeqDataSet <- function(object, alpha=.1, main="",
xlab="mean of normalized counts", ylim,
colNonSig="gray60", colSig="blue", colLine="grey40",
returnData=FALSE,
MLE=FALSE, ...) {
res <- results(object, ...)
plotMA.DESeqResults(res, alpha=alpha, main=main, xlab=xlab, ylim=ylim, MLE=MLE)
}
plotMA.DESeqResults <- function(object, alpha, main="",
xlab="mean of normalized counts", ylim,
colNonSig="gray60", colSig="blue", colLine="grey40",
returnData=FALSE,
MLE=FALSE, ...) {
sval <- "svalue" %in% names(object)
if (sval) {
test.col <- "svalue"
} else {
test.col <- "padj"
}
if (MLE) {
if (is.null(object$lfcMLE)) {
stop("lfcMLE column is not present: you should first run results() with addMLE=TRUE")
}
lfc.col <- "lfcMLE"
} else {
lfc.col <- "log2FoldChange"
}
if (missing(alpha)) {
if (sval) {
alpha <- 0.005
message("thresholding s-values on alpha=0.005 to color points")
} else {
if (is.null(metadata(object)$alpha)) {
alpha <- 0.1
} else {
alpha <- metadata(object)$alpha
}
}
}
isDE <- ifelse(is.na(object[[test.col]]), FALSE, object[[test.col]] < alpha)
df <- data.frame(mean = object[["baseMean"]],
lfc = object[[lfc.col]],
isDE = isDE)
if (returnData) {
return(df)
}
if (missing(ylim)) {
plotMA(df,
colNonSig=colNonSig, colSig=colSig, colLine=colLine,
xlab=xlab, main=main, ...)
} else {
plotMA(df, ylim=ylim,
colNonSig=colNonSig, colSig=colSig, colLine=colLine,
xlab=xlab, main=main, ...)
}
}
#' MA-plot from base means and log fold changes
#'
#' A simple helper function that makes a so-called "MA-plot", i.e. a
#' scatter plot of log2 fold changes (on the y-axis) versus the mean of
#' normalized counts (on the x-axis).
#'
#' This function is essentially two lines of code: building a
#' \code{data.frame} and passing this to the \code{plotMA} method
#' for \code{data.frame} from the geneplotter package.
#' The code was modified in version 1.28 to change from red to blue points
#' for better visibility for users with color-blindness. The original plots
#' can still be made via the use of \code{returnData=TRUE} and passing the
#' resulting data.frame directly to \code{geneplotter::plotMA}.
#' The code of this function can be seen with:
#' \code{getMethod("plotMA","DESeqDataSet")}
#' If the \code{object} contains a column \code{svalue} then these
#' will be used for coloring the points (with a default \code{alpha=0.005}).
#'
#' @docType methods
#' @name plotMA
#' @rdname plotMA
#' @aliases plotMA plotMA,DESeqDataSet-method plotMA,DESeqResults-method
#'
#' @param object a \code{DESeqResults} object produced by \code{\link{results}};
#' or a \code{DESeqDataSet} processed by \code{\link{DESeq}}, or the
#' individual functions \code{\link{nbinomWaldTest}} or \code{\link{nbinomLRT}}
#' @param alpha the significance level for thresholding adjusted p-values
#' @param main optional title for the plot
#' @param xlab optional defaults to "mean of normalized counts"
#' @param ylim optional y limits
#' @param colNonSig color to use for non-significant data points
#' @param colSig color to use for significant data points
#' @param colLine color to use for the horizontal (y=0) line
#' @param returnData logical, whether to return the data.frame used for plotting
#' @param MLE if \code{betaPrior=TRUE} was used,
#' whether to plot the MLE (unshrunken estimates), defaults to FALSE.
#' Requires that \code{\link{results}} was run with \code{addMLE=TRUE}.
#' Note that the MLE will be plotted regardless of this argument,
#' if DESeq() was run with \code{betaPrior=FALSE}. See \code{\link{lfcShrink}}
#' for examples on how to plot shrunken log2 fold changes.
#' @param ... further arguments passed to \code{plotMA} if object
#' is \code{DESeqResults} or to \code{\link{results}} if object is
#' \code{DESeqDataSet}
#'
#' @author Michael Love
#'
#' @examples
#'
#' dds <- makeExampleDESeqDataSet()
#' dds <- DESeq(dds)
#' plotMA(dds)
#' res <- results(dds)
#' plotMA(res)
#'
#' @importFrom geneplotter plotMA
#' @importFrom ggplot2 ggplot geom_point xlab ylab coord_fixed aes_string
#'
#' @export
setMethod("plotMA", signature(object="DESeqDataSet"), plotMA.DESeqDataSet)
#' @name plotMA
#' @rdname plotMA
#' @export
setMethod("plotMA", signature(object="DESeqResults"), plotMA.DESeqResults)
plotPCA.DESeqTransform = function(object, intgroup="condition", ntop=500, returnData=FALSE)
{
# calculate the variance for each gene
rv <- rowVars(assay(object))
# select the ntop genes by variance
select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))]
# perform a PCA on the data in assay(x) for the selected genes
pca <- prcomp(t(assay(object)[select,]))
# the contribution to the total variance for each component
percentVar <- pca$sdev^2 / sum( pca$sdev^2 )
if (!all(intgroup %in% names(colData(object)))) {
stop("the argument 'intgroup' should specify columns of colData(dds)")
}
intgroup.df <- as.data.frame(colData(object)[, intgroup, drop=FALSE])
# add the intgroup factors together to create a new grouping factor
group <- if (length(intgroup) > 1) {
factor(apply( intgroup.df, 1, paste, collapse=":"))
} else {
colData(object)[[intgroup]]
}
# assembly the data for the plot
d <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], group=group, intgroup.df, name=colnames(object))
if (returnData) {
attr(d, "percentVar") <- percentVar[1:2]
return(d)
}
ggplot(data=d, aes_string(x="PC1", y="PC2", color="group")) + geom_point(size=3) +
xlab(paste0("PC1: ",round(percentVar[1] * 100),"% variance")) +
ylab(paste0("PC2: ",round(percentVar[2] * 100),"% variance")) +
coord_fixed()
}
#' Sample PCA plot for transformed data
#'
#' This plot helps to check for batch effects and the like.
#'
#' @docType methods
#' @name plotPCA
#' @rdname plotPCA
#' @aliases plotPCA plotPCA,DESeqTransform-method
#'
#' @param object a \code{\link{DESeqTransform}} object, with data in \code{assay(x)},
#' produced for example by either \code{\link{rlog}} or
#' \code{\link{varianceStabilizingTransformation}}.
#' @param intgroup interesting groups: a character vector of
#' names in \code{colData(x)} to use for grouping
#' @param ntop number of top genes to use for principal components,
#' selected by highest row variance
#' @param returnData should the function only return the data.frame of PC1 and PC2
#' with intgroup covariates for custom plotting (default is FALSE)
#'
#' @return An object created by \code{ggplot}, which can be assigned and further customized.
#'
#' @author Wolfgang Huber
#'
#' @note See the vignette for an example of variance stabilization and PCA plots.
#' Note that the source code of \code{plotPCA} is very simple.
#' The source can be found by typing \code{DESeq2:::plotPCA.DESeqTransform}
#' or \code{getMethod("plotPCA","DESeqTransform")}, or
#' browsed on github at \url{https://github.com/mikelove/DESeq2/blob/master/R/plots.R}
#' Users should find it easy to customize this function.
#'
#' @examples
#'
#' # using rlog transformed data:
#' dds <- makeExampleDESeqDataSet(betaSD=1)
#' rld <- rlog(dds)
#' plotPCA(rld)
#'
#' # also possible to perform custom transformation:
#' dds <- estimateSizeFactors(dds)
#' # shifted log of normalized counts
#' se <- SummarizedExperiment(log2(counts(dds, normalized=TRUE) + 1),
#' colData=colData(dds))
#' # the call to DESeqTransform() is needed to
#' # trigger our plotPCA method.
#' plotPCA( DESeqTransform( se ) )
#'
#' @export
setMethod("plotPCA", signature(object="DESeqTransform"), plotPCA.DESeqTransform)
#' Plot of normalized counts for a single gene
#'
#' Normalized counts plus a pseudocount of 0.5 are shown by default.
#'
#' @param dds a \code{DESeqDataSet}
#' @param gene a character, specifying the name of the gene to plot
#' @param intgroup interesting groups: a character vector of names in \code{colData(x)} to use for grouping.
#' Must be factor variables. If you want to plot counts over numeric, choose \code{returnData=TRUE}
#' @param normalized whether the counts should be normalized by size factor
#' (default is TRUE)
#' @param transform whether to have log scale y-axis or not.
#' defaults to TRUE
#' @param main as in 'plot'
#' @param xlab as in 'plot'
#' @param returnData should the function only return the data.frame of counts and
#' covariates for custom plotting (default is FALSE)
#' @param replaced use the outlier-replaced counts if they exist
#' @param pc pseudocount for log transform
#' @param ... arguments passed to plot
#'
#' @examples
#'
#' dds <- makeExampleDESeqDataSet()
#' plotCounts(dds, "gene1")
#'
#' @export
plotCounts <- function(dds, gene, intgroup="condition",
normalized=TRUE, transform=TRUE,
main, xlab="group",
returnData=FALSE,
replaced=FALSE,
pc, ...) {
stopifnot(length(gene) == 1 & (is.character(gene) | (is.numeric(gene) & (gene >= 1 & gene <= nrow(dds)))))
if (!all(intgroup %in% names(colData(dds)))) stop("all variables in 'intgroup' must be columns of colData")
if (!returnData) {
if (!all(sapply(intgroup, function(v) is(colData(dds)[[v]], "factor")))) {
stop("all variables in 'intgroup' should be factors, or choose returnData=TRUE and plot manually")
}
}
if (missing(pc)) {
pc <- if (transform) 0.5 else 0
}
if (is.null(sizeFactors(dds)) & is.null(normalizationFactors(dds))) {
dds <- estimateSizeFactors(dds)
}
cnts <- counts(dds,normalized=normalized,replaced=replaced)[gene,]
group <- if (length(intgroup) == 1) {
colData(dds)[[intgroup]]
} else if (length(intgroup) == 2) {
lvls <- as.vector(t(outer(levels(colData(dds)[[intgroup[1]]]),
levels(colData(dds)[[intgroup[2]]]),
function(x,y) paste(x,y,sep=":"))))
droplevels(factor(apply( as.data.frame(colData(dds)[, intgroup, drop=FALSE]),
1, paste, collapse=":"), levels=lvls))
} else {
factor(apply( as.data.frame(colData(dds)[, intgroup, drop=FALSE]),
1, paste, collapse=":"))
}
data <- data.frame(count=cnts + pc, group=as.integer(group))
logxy <- if (transform) "y" else ""
if (missing(main)) {
main <- if (is.numeric(gene)) {
rownames(dds)[gene]
} else {
gene
}
}
ylab <- ifelse(normalized,"normalized count","count")
if (returnData) return(data.frame(count=data$count, colData(dds)[intgroup]))
plot(data$group + runif(ncol(dds),-.05,.05), data$count, xlim=c(.5,max(data$group)+.5),
log=logxy, xaxt="n", xlab=xlab, ylab=ylab, main=main, ...)
axis(1, at=seq_along(levels(group)), levels(group))
}
#' Sparsity plot
#'
#' A simple plot of the concentration of counts in a single sample over the
#' sum of counts per gene. Not technically the same as "sparsity", but this
#' plot is useful diagnostic for datasets which might not fit a negative
#' binomial assumption: genes with many zeros and individual very large
#' counts are difficult to model with the negative binomial distribution.
#'
#' @param x a matrix or DESeqDataSet
#' @param normalized whether to normalize the counts from a DESeqDataSEt
#' @param ... passed to \code{plot}
#'
#' @examples
#'
#' dds <- makeExampleDESeqDataSet(n=1000,m=4,dispMeanRel=function(x) .5)
#' dds <- estimateSizeFactors(dds)
#' plotSparsity(dds)
#'
#' @export
plotSparsity <- function(x, normalized=TRUE, ...) {
if (is(x, "DESeqDataSet")) {
x <- counts(x, normalized=normalized)
}
rs <- rowSums(x)
rmx <- apply(x, 1, max)
plot(rs[rs > 0], (rmx/rs)[rs > 0], log="x", ylim=c(0,1), xlab="sum of counts per gene",
ylab="max count / sum", main="Concentration of counts over total sum of counts", ...)
}
# convenience function for adding alpha transparency to named colors
## col2useful <- function(col,alpha) {
## x <- col2rgb(col)/255
## rgb(x[1],x[2],x[3],alpha)
## }