forked from microbiome/mia
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtransformCounts.R
780 lines (759 loc) · 32.1 KB
/
transformCounts.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
#' Transform assay
#'
#' Variety of transformations for abundance data, stored in \code{assay}.
#' See details for options.
#'
#' @inheritParams getDominant
#' @inheritParams getDissimilarity
#'
#' @param method \code{Character scalar}. Specifies the transformation
#' method.
#'
#' @param MARGIN \code{Character scalar}. Determines whether the
#' transformation is applied sample (column) or feature (row) wise.
#' (Default: \code{"samples"})
#'
#' @param pseudocount \code{Logical scalar} or \code{numeric scalar}.
#' When \code{TRUE}, automatically adds half of the minimum positive
#' value of \code{assay.type} (missing values ignored by default:
#' \code{na.rm = TRUE}).
#' When FALSE, does not add any pseudocount (pseudocount = 0).
#' Alternatively, a user-specified numeric value can be added as pseudocount.
#' (Default: \code{FALSE}).
#'
#' @param name \code{Character scalar}. The name for the transformed assay to
#' be stored. (Default: \code{method})
#'
#' @param altexp \code{Character vector} or \code{NULL}. Specifies the names
#' of alternative experiments to which the transformation should also be
#' applied. If \code{NULL}, the transformation is only applied to the main
#' experiment. (Default: \code{NULL}).
#'
#' @param ... additional arguments passed e.g. on to \code{vegan:decostand}
#' or \code{philr::philr}.
#' \itemize{
#' \item \code{reference}: \code{Character scalar}. Used to
#' to fill reference sample's column in returned assay when calculating alr.
#' (Default: \code{NA})
#' \item \code{ref_vals} Deprecated. Use \code{reference} instead.
#' \item \code{percentile}: \code{Numeric scalar} or \code{NULL} (css). Used
#' to set the percentile value that calculates the scaling factors in the css
#' normalization. If \code{NULL}, percentile is estimated from the data by
#' calculating the portion of samples that exceed the \code{threshold}.
#' (Default: \code{NULL})
#' \item \code{scaling}: \code{Numeric scalar}. Adjusts the normalization
#' scale by dividing the calculated scaling factors, effectively changing
#' the magnitude of the normalized counts. (Default: \code{1000}).
#' \item \code{threshold}: \code{Numeric scalar}. Specifies relative
#' difference threshold and determines the first point where the relative
#' change in differences between consecutive quantiles exceeds this
#' threshold. (Default: \code{0.1}).
#' \item \code{tree}: \code{phylo}. Phylogeny used in PhILR transformation.
#' If \code{NULL}, the tree is retrieved from \code{x}.
#' (Default: \code{NULL}).
#' \item \code{node.labels}: \code{Character vector}. Linkages between
#' \code{tree} and \code{x}. Used in PhILR transformation.
#' (Default: \code{NULL}).
#' }
#' @details
#'
#' \code{transformAssay} function provides a variety of options for
#' transforming abundance data. The transformed data is calculated and stored
#' in a new \code{assay}.
#'
#' The \code{transformAssay} provides sample-wise (column-wise) or feature-wise
#' (row-wise) transformation to the abundance table
#' (assay) based on specified \code{MARGIN}.
#'
#' The available transformation methods include:
#'
#' \itemize{
#'
#' \item 'alr', 'chi.square', 'clr', 'frequency', 'hellinger', 'log',
#' 'normalize', 'pa', 'rank', 'rclr' relabundance', 'rrank', 'standardize',
#' 'total': please refer to
#' \code{\link[vegan:decostand]{decostand}} for details.
#'
#' \item 'philr': please refer to \code{\link[philr:philr]{philr}} for details.
#'
#' \item 'css': Cumulative Sum Scaling (CSS) can be used to normalize count data
#' by accounting for differences in library sizes. By default, the function
#' determines the normalization percentile for summing and scaling
#' counts. If you want to specify the percentile value, good default value
#' might be \code{0.5}. The method is inspired by the CSS methods in
#' \code{\link[https://www.bioconductor.org/packages/metagenomeSeq/]{metagenomeSeq}}
#' package.
#'
#' \item 'log10': log10 transformation can be used for reducing the skewness
#' of the data.
#' \deqn{log10 = \log_{10} x}{%
#' log10 = log10(x)}
#' where \eqn{x} is a single value of data.
#'
#' \item 'log2': log2 transformation can be used for reducing the skewness of
#' the data.
#' \deqn{log2 = \log_{2} x}{%
#' log2 = log2(x)}
#' where \eqn{x} is a single value of data.
#'
#' }
#'
#' @return
#' \code{transformAssay} returns the input object \code{x}, with a new
#' transformed abundance table named \code{name} added in the
#' \code{\link[SummarizedExperiment:assays]{assays}}.
#'
#' @references
#'
#' Paulson, J., Stine, O., Bravo, H. et al. (2013)
#' Differential abundance analysis for microbial marker-gene surveys
#' _Nature Methods_ 10, 1200–1202.
#' doi:10.1038/nmeth.2658
#'
#' @seealso
#' \itemize{
#' \item \code{\link[vegan:decostand]{vegan::decostand}}
#' \item \code{\link[philr:philr]{philr::philr}}
#' }
#'
#' @name transformAssay
#' @export
#'
#' @examples
#' data(GlobalPatterns)
#' tse <- GlobalPatterns
#'
#' # By specifying 'method', it is possible to apply different transformations,
#' # e.g. compositional transformation.
#' tse <- transformAssay(tse, method = "relabundance")
#'
#' # The target of transformation can be specified with "assay.type"
#' # Pseudocount can be added by specifying 'pseudocount'.
#'
#' # Perform CLR with half of the smallest positive value as pseudocount
#' tse <- transformAssay(
#' tse, assay.type = "counts", method = "clr",
#' pseudocount = TRUE
#' )
#'
#' head(assay(tse, "clr"))
#'
#' # Perform CSS normalization.
#' tse <- transformAssay(tse, method = "css")
#' head(assay(tse, "css"))
#'
#' # With MARGIN, you can specify the if transformation is done for samples or
#' # for features. Here Z-transformation is done feature-wise.
#' tse <- transformAssay(tse, method = "standardize", MARGIN = "features")
#' head(assay(tse, "standardize"))
#'
#' # Name of the stored table can be specified.
#' tse <- transformAssay(tse, method="hellinger", name="test")
#' head(assay(tse, "test"))
#'
#' # pa returns presence absence table.
#' tse <- transformAssay(tse, method = "pa")
#' head(assay(tse, "pa"))
#'
#' # rank returns ranks of taxa.
#' tse <- transformAssay(tse, method = "rank")
#' head(assay(tse, "rank"))
#'
#' # In order to use other ranking variants, modify the chosen assay directly:
#' assay(tse, "rank_average", withDimnames = FALSE) <- colRanks(
#' assay(tse, "counts"), ties.method = "average", preserveShape = TRUE)
#'
#' # Using altexp parameter. First agglomerate the data and then apply
#' # transformation.
#' tse <- GlobalPatterns
#' tse <- agglomerateByRanks(tse)
#' tse <- transformAssay(
#' tse, method = "relabundance", altexp = altExpNames(tse))
#' # The transformation is applied to all alternative experiments
#' altExp(tse, "Species")
#'
#' \dontrun{
#' # philr transformation can be applied if the philr package is installed.
#' # Subset data b taking only prevalent taxa
#' tse <- subsetByPrevalent(tse)
#' # Apply transformation
#' tse <- transformAssay(tse, method = "philr", pseudocount = 1, MARGIN = 1L)
#' # The transformed data is added to altExp
#' altExp(tse, "philr")
#' }
#'
NULL
#' @rdname transformAssay
#' @export
setMethod("transformAssay", signature = c(x = "SummarizedExperiment"),
function(x,
assay.type = "counts", assay_name = NULL,
method = c("alr", "chi.square", "clr", "css", "frequency",
"hellinger", "log", "log10", "log2", "max", "normalize",
"pa", "philr", "range", "rank", "rclr", "relabundance", "rrank",
"standardize", "total", "z"),
MARGIN = "samples",
name = method,
pseudocount = FALSE,
...){
#
x <- .transform_assay(
x = x, method = method, name = name, assay.type = assay.type,
MARGIN = MARGIN, pseudocount = pseudocount, ...)
return(x)
}
)
#' @rdname transformAssay
#' @export
setMethod("transformAssay", signature = c(x = "SingleCellExperiment"),
function(x, altexp = NULL, ...){
# Check altexp
if( !(is.null(altexp) || all(altexp %in% altExpNames(x)) ||
(.is_integer(altexp) &&
all(altexp<=length(altExps(x)) & altexp>0)) ) ){
stop("'altexp' should be NULL or specify names from ",
"altExpNames(x).", call. = FALSE)
}
#
# Transform the main object
x <- .transform_assay(x, ...)
# Transform alternative experiments
altExps(x)[altexp] <- lapply(altExps(x)[altexp], function(y){
# Give informative error message that states that the error
# happened during processing alternative experiments.
tryCatch({.transform_assay(y, ...)},
error = function(e){
stop("Transforming altExps: ", conditionMessage(e),
call. = FALSE)
})
})
return(x)
}
)
########################### HELP FUNCTIONS #####################################
############################### .transform_assay ###############################
# A generic functon that takes SE as input and transforms the specified assay
# with specified method. By calling this generic function, we can apply same
# methods for SE and altExps of TreeSE.
.transform_assay <- function(
x, assay.type = "counts", assay_name = NULL,
method = c(
"alr", "chi.square", "clr", "css", "frequency",
"hellinger", "log", "log10", "log2", "max", "normalize",
"pa", "philr", "range", "rank", "rclr", "relabundance", "rrank",
"standardize", "total", "z"),
MARGIN = "samples",
name = method,
pseudocount = FALSE,
...){
# Input check
if(!is.null(assay_name)){
.Deprecated(old="assay_name", new="assay.type", "Now assay_name is
deprecated. Use assay.type instead.")
assay.type <- assay_name
}
# Check assay.type
.check_assay_present(assay.type, x)
# Check name
if(!.is_non_empty_string(name) || name == assay.type){
stop("'name' must be a non-empty single character value and be ",
"different from `assay.type`.", call. = FALSE)
}
# If method is not single string, user has not specified transform method,
# or has given e.g. a vector
if(!.is_non_empty_string(method)){
stop("'method' must be a non-empty single character value.",
call. = FALSE)
}
method <- match.arg(method, several.ok = FALSE)
# Check that MARGIN is 1 or 2
MARGIN <- .check_MARGIN(MARGIN)
# Check pseudocount
if( !.is_a_bool(pseudocount) && !(is.numeric(pseudocount) &&
length(pseudocount) == 1 && pseudocount >= 0) ){
stop("'pseudocount' must be TRUE, FALSE or a number equal to or ",
"greater than 0.", call. = FALSE)
}
# Input check end
# Get the method and abundance table
method <- match.arg(method)
assay <- assay(x, assay.type)
# Apply pseudocount, if it is not 0 or FALSE
assay <- .apply_pseudocount(assay, pseudocount, ...)
# Store pseudocount value and set attr equal to NULL. The function above,
# add the used pseudocount to attributes.
pseudocount <- attr(assay, "pseudocount")
attr(assay, "pseudocount") <- NULL
# Calls help function that does the transformation
# Help function is different for mia and vegan transformations
if( method %in% c("log10", "log2", "css") ){
transformed_table <- .apply_transformation(
assay, method, MARGIN, ...)
} else if( method %in% c("philr") ){
transformed_table <- .apply_transformation_from_philr(
assay, method, MARGIN, x = x, ...)
} else {
transformed_table <- .apply_transformation_from_vegan(
assay, method, MARGIN, ...)
}
# Add pseudocount info to transformed table
attr(transformed_table, "parameters")$pseudocount <- pseudocount
# Add transformed table back to original TreeSE
x <- .add_transformed_data(x, transformed_table, name)
return(x)
}
##############################.apply_transformation#############################
# Help function for transformAssay, takes abundance table
# as input and returns transformed table. This function utilizes mia's
# transformation functions.
.apply_transformation <- function(assay, method, MARGIN, ...){
# Transpose if MARGIN is row
if( MARGIN == 1L ){
assay <- t(assay)
}
# Function is selected based on the "method" variable
FUN <- switch(
method,
log10 = .calc_log,
log2 = .calc_log,
css = .calc_css
)
# Get transformed table
transformed_table <- do.call(
FUN, list(mat = assay, method = method, MARGIN = MARGIN, ...) )
# Transpose back to normal if MARGIN is row
if( MARGIN == 1L ){
transformed_table <- t(transformed_table)
}
# Add method and margin to attributes
attr(transformed_table, "mia") <- method
attr(transformed_table, "parameters")$margin <- MARGIN
return(transformed_table)
}
########################.apply_transformation_from_vegan########################
# Help function for transformAssay, takes abundance
# table as input and returns transformed table. This function utilizes vegan's
# transformation functions.
#' @importFrom vegan decostand
.apply_transformation_from_vegan <- function(
mat, method, MARGIN, reference = ref_vals, ref_vals = NA, ...){
# Input check
# Check reference
if( length(reference) != 1 ){
stop("'reference' must be a single value specifying the ",
"values of the reference sample.", call. = FALSE)
}
# Input check end
# Ensure that the matrix has proper dimnames
if (is.null(rownames(mat))) {
rownames(mat) <- paste0("feature", seq_len(nrow(mat)))
}
if (is.null(colnames(mat))) {
colnames(mat) <- paste0("sample", seq_len(ncol(mat)))
}
# Adjust method if mia-specific alias was used
method <- ifelse(method == "relabundance", "total", method)
if (method == "z") {
.Deprecated(old="z", new="standardize")
}
method <- ifelse(method == "z", "standardize", method)
# If method is ALR, vegan drops one column/sample, because it is used
# as a reference. To work with TreeSE, reference sample must be added back.
# Get the original order of samples/features
orig_dimnames <- dimnames(mat)
# Call vegan::decostand and apply transformation
transformed_table <- decostand(mat, method = method, MARGIN = MARGIN, ...)
# Add reference sample back if ALR
if( method %in% c("alr") ){
transformed_table <- .adjust_alr_table(
mat = transformed_table, orig_dimnames = orig_dimnames,
reference = reference)
}
# If table is transposed (like in chi.square), transpose back
if(identical(rownames(transformed_table), colnames(mat)) &&
identical(colnames(transformed_table), rownames(mat)) &&
ncol(transformed_table) != ncol(mat) &&
nrow(transformed_table != nrow(mat))){
transformed_table <- t(transformed_table)
}
return(transformed_table)
}
####################################.calc_log###################################
# This function applies log transformation to abundance table.
.calc_log <- function(mat, method, ...){
# If abundance table contains zeros or negative values, gives an error,
# because it is not possible to calculate log from zeros. Otherwise,
# calculates log.
if ( any(mat < 0, na.rm = TRUE) ){
stop("The assay contains negative values and ", method,
" transformation is being applied without pseudocount.",
"`pseudocount` must be specified manually.", call. = FALSE)
} else if ( any(mat == 0, na.rm = TRUE) ){
stop("The assay contains zeroes and ", method,
" transformation is being applied without pseudocount.",
"`pseudocount` must be set to TRUE.", call. = FALSE)
}
# Calculate log2 or log10 abundances
if(method == "log2"){
mat <- log2(mat)
} else{
mat <- log10(mat)
}
# Add parameter to attributes
attr(mat, "parameters") <- list()
return(mat)
}
################################### .calc_css ##################################
# This function applies cumulative sum scaling (CSS) to the abundance table.
.calc_css <- function(mat, percentile = NULL, scaling = 1000, ...) {
# Input check
if( !(is.null(percentile) || .is_a_numeric(percentile)) ){
stop("'percentile' must be a numeric value or NULL.", call. = FALSE)
}
if( !.is_a_numeric(scaling) ){
stop("'scaling' must be an integer value.", call. = FALSE)
}
#
# If 'percentile' is not provided, calculate it dynamically based on the
# abundance matrix. This allows flexibility in the function by determining
# where the low-abundance features start to diverge,
# which is crucial for CSS normalization.
if( is.null(percentile) ){
percentile <- .calc_css_percentile(mat, ...)
message("'percentile' set to: ", percentile)
}
# Calculate the scaling factors for each sample based on the provided or
# calculated percentile. These factors determine how much of the data
# (below the percentile threshold) is used to normalize each sample.
scaling_factors <- .calc_scaling_factors(mat, percentile)
# Normalize the count data by dividing each sample's counts by its
# corresponding scaling factor. This step adjusts for varying distributions
# of low-abundance features across samples,
# making the counts more comparable by removing biases from varying library
# sizes or sample depth.
scaled_factors <- scaling_factors / scaling
normalized_data <- sweep(mat, 2, scaled_factors, "/")
# Add information on used parameters to normalized matrix
attr(normalized_data, "parameters") <- list(
percentile = percentile,
scaling_factors = scaling_factors,
scaling = scaling
)
return(normalized_data)
}
############################# .calc_css_percentile #############################
# Calculates the cumulative sum scaling (CSS) percentile from the input data.
# The percentile determines the point at which a sample's abundance values start
# to deviate significantly from the average abundance profile of all samples.
#' @importFrom DelayedMatrixStats colSums2 colQuantiles rowMeans2 rowMedians
.calc_css_percentile <- function(mat, threshold = 0.1, ...) {
# Input check
if( !(.is_a_numeric(threshold) && threshold > 0 && threshold < 1) ){
stop("'threshold' must be a numeric value between 0 and 1.",
call. = FALSE)
}
#
# Replace zero values to NA, i.e. not detected. That ensures that they are
# not included in the percentile calculation; only non-zero values are.
mat[ mat == 0 ] <- NA
# Calculate number of features detected
found_features <- colSums2(!is.na(mat))
# Stop if there are columns with only 1 or 0 found features. The percentile
# estimation is not reliable with so few features.
if( any(found_features <= 1) ){
stop(
"There are samples that contain 1 or less found features.",
"'percentile' cannot be estimated from the data. Specify it ",
"manually.", call. = FALSE)
}
# Calculate quantiles for the features present in each sample.
# Use a 'probs' sequence from 0 to 1, with increments based on the maximum
# number of features found. This allows quantile calculation for every
# feature increment.
# The quantiles gives us a detailed view of the abundance distribution in
# each sample, helping to locate low-abundance regions.
quantiles <- colQuantiles(
mat, probs = seq(0, 1, length.out = max(found_features)), na.rm = TRUE)
quantiles <- t(quantiles)
# Sort the columns based on abundance. This means that each sample is sorted
# so that smallest values come first. Ultimately, this means that values
# should increase from first to row to last. Rows do not represent single
# taxa anymore but abundance level.
# This helps compare the accumulation of low-abundance features across
# samples.
mat <- as.matrix(mat)
mat <- apply(mat, 2, function(col){
col[ is.na(col) ] <- 0
col <- sort(col)
return(col)
})
# Compute a reference profile as the average abundance at each abundance
# level across samples. This creates a baseline for comparison, representing
# the "average" behavior of abundance in the dataset.
ref <- rowMeans2(mat)
ref <- ref[ ref > 0 ]
# Calculate the differences between the reference profile and each sample's
# quantiles. This allows us to detect how much a sample's abundance deviates
# from the average abundance profile.
difference <- ref - quantiles
# Calculate the median absolute differences between the reference and sample
# profiles. This helps to quantify the typical deviation in abundance
# accumulation across samples.
difference <- rowMedians(abs(difference))
# Compute the relative change in absolute differences between the reference
# profile and each sample's quantiles. Identify the first index where this
# relative change exceeds the specified threshold. This index represents
# the count value at which the deviation between sample abundance levels and
# the reference profile becomes significant, highlighting a notable
# difference in abundance between samples.
res <- abs(diff(difference)) / difference[-1]
res <- which(res > threshold)
res <- res[[1]]
# Convert the count value to a relative proportion by dividing by the
# maximum number of features. This scales the count to a proportion of the
# total feature count, normalizing it against the library size.
res <- res/max(found_features)
return(res)
}
############################### .calc_scaling_factors ##########################
# Calculates the cumulative sum scaling (CSS) factors for each sample based on
# the given percentile. The scaling factors determine the cumulative sum of
# counts below the percentile for each sample, which is used for normalizing
# the count data.
#' @importFrom DelayedMatrixStats colQuantiles
.calc_scaling_factors <- function(mat, percentile) {
# Replace zero values in the matrix with NA to indicate missing or not
# detected values.
mat_tmp <- mat
mat_tmp[mat_tmp == 0] <- NA
# Calculate the abundance threshold for each sample at the specified
# percentile. This threshold indicates the abundance value below which the
# given proportion of counts falls in the sample.
quantiles <- colQuantiles(mat_tmp, probs = percentile, na.rm = TRUE)
# For each sample, sum the counts that are less than or equal to the
# percentile threshold. This cumulative sum is used as the scaling factor,
# adjusting the sample's abundance to account for low-abundance features.
# Improve precision by subtracting the smallest positive floating-point
# number from each value before comparison.
scaling_factors <- vapply(seq_len(ncol(mat_tmp)), function(i){
col_values <- mat[, i] - .Machine$double.eps
sum(col_values[col_values <= quantiles[i]], na.rm = TRUE)
}, numeric(1))
# Give scaling_factors names corresponding to the original matrix columns
names(scaling_factors) <- colnames(mat)
return(scaling_factors)
}
#################################.calc_rel_abund################################
# This function is for other functions to use internally.
.calc_rel_abund <- function(mat, ...){
mat <- .apply_transformation_from_vegan(
mat, method = "relabundance", MARGIN = 2)
return(mat)
}
###############################.adjust_alr_table################################
# vegan::decostand returns ALR transformed abundance table without reference
# sample. Because in TreeSE all assays must have same row and column names,
# the reference sample is assigned back to transformed abundance table.
.adjust_alr_table <- function(mat, orig_dimnames, reference){
# Store attributes
attributes <- attributes(mat)
# Get original and current sample/feature names and dimensions of reference
# based on what dimensions misses names
MARGIN <- ifelse(length(orig_dimnames[[1]]) == nrow(mat), 2, 1)
orig_names <- if(MARGIN == 1){orig_dimnames[[1]]} else {orig_dimnames[[2]]}
current_names <- if(MARGIN == 1){rownames(mat)} else {colnames(mat)}
nrow <- ifelse(MARGIN == 1, 1, nrow(mat))
ncol <- ifelse(MARGIN == 1, ncol(mat), 1)
# Get the name of reference sample/feature and names of other dimension
reference_name <- setdiff(orig_names, current_names)
var_names <- if(MARGIN == 1){colnames(mat)} else {rownames(mat)}
if(MARGIN == 1){
ref_dimnames <- list(reference_name, var_names)
} else {
ref_dimnames <- list(var_names, reference_name)
}
# Reference sample as NAs or with symbols that are specified by user
reference_sample <- matrix(
reference, nrow = nrow, ncol = ncol, dimnames = ref_dimnames)
# Add reference sample/feature
if(MARGIN == 1){
mat <- rbind(mat, reference_sample)
# Preserve the original order
mat <- mat[orig_names, ]
} else {
mat <- cbind(mat, reference_sample)
# Preserve the original order
mat <- mat[, orig_names]
}
# Add those attributes that were related to calculation
attributes(mat) <- c(
attributes(mat),
attributes[ !names(attributes) %in% c("dim", "dimnames") ])
return(mat)
}
###############################.apply_pseudocount###############################
# This function applies pseudocount to abundance table.
.apply_pseudocount <- function(mat, pseudocount, na.rm = TRUE, ...){
if( .is_a_bool(pseudocount) ){
# If pseudocount TRUE and some NAs, a warning is issued
if ( pseudocount && any(is.na(mat)) ){
warning("The assay contains missing values (NAs). These will be ",
"ignored in pseudocount calculation.", call. = FALSE)
}
# If pseudocount TRUE but some negative values, numerical pseudocount
# needed
if ( pseudocount && any(mat < 0, na.rm = TRUE) ){
stop("The assay contains negative values. ",
"'pseudocount' must be specified manually.", call. = FALSE)
}
# If pseudocount TRUE, set it to half of non-zero minimum value
# else set it to zero.
# Get min value
value <- min(mat[mat > 0], na.rm = na.rm)
value <- value/2
pseudocount <- ifelse(pseudocount, value, 0)
# Report pseudocount if positive value
if ( pseudocount > 0 ){
message("A pseudocount of ", pseudocount, " was applied.")
}
}
# Give warning if pseudocount should not be added
# Case 1: only positive values
if( pseudocount != 0 && all(mat > 0, na.rm = TRUE) ){
warning(
"The assay contains only positive values. ",
"Applying a pseudocount may be unnecessary.", call. = FALSE)
}
# Case 2: some negative values
if( pseudocount != 0 && any(mat < 0, na.rm = TRUE) ){
warning(
"The assay contains some negative values. ",
"Applying a pseudocount may produce meaningless data.",
call. = FALSE)
}
# Add pseudocount
mat <- mat + pseudocount
# Set attr equal to pseudocount
attr(mat, "pseudocount") <- pseudocount
return(mat)
}
###################### .apply_transformation_from_philr ########################
# This function works as a wrapper for philr::philr
#' @importFrom ape is.rooted is.binary
.apply_transformation_from_philr <- function(
mat, method, MARGIN, x, tree.name = "phylo", tree = NULL,
node.label = NULL, ...){
# We have "soft dependency" for philr package, i.e., it is only required
# in this function.
.require_package("philr")
# Get functions based on MARGIN
tree_check_FUN <- switch(
MARGIN, .check_rowTree_present, .check_colTree_present)
tree_FUN <- switch(MARGIN, rowTree, colTree)
links_FUN <- switch(MARGIN, rowLinks, colLinks)
names_FUN <- switch(MARGIN, rownames, colnames)
names_ass_FUN <- switch(MARGIN, `rownames<-`, `colnames<-`)
n_FUN <- switch(MARGIN, nrow, ncol)
# Do data validity checks
# The original object must be TreeSE or tree must be provided
if( is.null(tree) &&
!(is(x, "TreeSummarizedExperiment") && !is.null(rowTree(x))) ){
stop("'tree' must be provided.", call. = FALSE)
}
# If tree is not specified, then we get rowTree
if( is.null(tree) ){
tree_check_FUN(tree.name, x)
tree <- tree_FUN(x, tree.name)
node.label <- links_FUN(x)[ , "nodeLab" ]
node.label[ links_FUN(x)[, "whichTree"] != tree.name ] <- NA
}
# Check that tree is in correct format
if( !(is.null(tree) || (is(tree, "phylo") &&
!is.null(tree$edge.length)) ) ){
stop("'tree' is NULL or it does not have any branches. The PhILR ",
"transformation is not possible to apply.", call. = FALSE)
}
# Check that node.label is NULL or it specifies links between rownames and
# node labs
if( !( is.null(node.label) ||
is.character(node.label) && length(node.label) == n_FUN(x) ) ){
stop("'node.label' must be NULL or a vector specifying links between ",
"features and node labs of 'tree'.", call. = FALSE)
}
# Subset rows of the assay to correspond node_labs (if there are any NAs
# in node labels)
if( !is.null(node.label) && any(is.na(node.label)) ){
warning("The tree named does not include all the features. 'x' is ",
"subsetted.", call. = FALSE)
if( MARGIN == 1L ){
mat <- mat[ , !is.na(node.label), drop = FALSE]
} else{
mat <- mat[ !is.na(node.label), , drop = FALSE]
}
node.label <- node.label[ !is.na(node.label) ]
}
# If there are node labels, rename the features in matrix to match with
# labels found in tree.
if( !is.null(node.label) ){
mat <- names_ass_FUN(mat, node.label)
}
if( is.null(names_FUN(mat)) ){
stop("'x' must have ", switch(MARGIN, "row", "col"), "names.",
call. = FALSE)
}
# The tree must be rooted and binary
if( !(is.rooted(tree) && is.binary(tree)) ){
warning("The tree should be rooted and binary.", call. = FALSE)
}
# Final check that each row can be found from the tree
if( !all(names_FUN(mat) %in% c(tree$node.label, tree$tip.label)) ){
stop("The feature names in abundance matrix must be found from the ",
"tree.", call. = FALSE)
}
# Transpose if MARGIN is row
if( MARGIN == 1L ){
mat <- t(mat)
}
# Check that the tree is phylo object and rows can be found from it.
mat <- philr::philr(mat, tree, ...)
# Transpose back to original orientation
if( MARGIN == 1L ){
mat <- t(mat)
}
# Add method and margin to attributes
attr(mat, "philr") <- "philr"
attr(mat, "parameters")$margin <- MARGIN
return(mat)
}
# This function is used to add transformed table back to TreeSE. With most of
# the methods it is simple: it is added to assay. However, with philr, the
# features do not match with original ones, so we add philr-transformed data
# to altExp. If philr was, applied to columns, we cannot use altExp so
# we return only the transformed data.
#' @importFrom stats setNames
.add_transformed_data <- function(x, mat, name){
rnames_ok <- nrow(x) == nrow(mat)
cnames_ok <- ncol(x) == ncol(mat)
if( rnames_ok && cnames_ok ){
assay(x, name, withDimnames = FALSE) <- mat
} else if( cnames_ok ){
if( !is(x, "SingleCellExperiment") ){
x <- as(x, "SingleCellExperiment")
}
x_new <- TreeSummarizedExperiment(
assays = setNames(SimpleList(mat), name),
colData = colData(x)
)
altExp(x, name) <- x_new
message("The rows of the transformed data do not match the original ",
"data. The transformed data has been added to altExp(x, name).")
} else{
x <- TreeSummarizedExperiment(
assays = setNames(SimpleList(mat), name)
)
warning("The columns of the transformed data do not match the ",
"original data. The transformed data is returned without the ",
"original data.", call. = FALSE)
}
return(x)
}