Skip to content

Commit c93b493

Browse files
mergeSEs: simplify (microbiome#344)
* up * up * up * up * up * up * up * Version bump, news * up --------- Co-authored-by: Leo Lahti <[email protected]>
1 parent e4bbff2 commit c93b493

File tree

5 files changed

+99
-81
lines changed

5 files changed

+99
-81
lines changed

DESCRIPTION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: mia
22
Type: Package
3-
Version: 1.7.6
3+
Version: 1.7.7
44
Authors@R:
55
c(person(given = "Felix G.M.", family = "Ernst", role = c("aut"),
66
email = "[email protected]",

NEWS

+4-1
Original file line numberDiff line numberDiff line change
@@ -66,5 +66,8 @@ Changes in version 1.7.x
6666
+ mergeSEs: option for merging multiple assays
6767
+ calculateUnifrac: option for specifying the tree from TreeSE
6868
+ transformCounts: utilize vegan package
69+
+ calculateUnifrac: subset tree based on data
70+
+ agglomerateByRank: take into account multiple trees
71+
+ loadFromBiom: name columns of rowData based on prefixes
6972
+ Deprecate transformSamples, *Features, relabundance
70-
73+
+ mergeSEs: faster tree merging

R/mergeSEs.R

+80-75
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,15 @@
2020
#' @param collapse_samples A boolean value for selecting whether to collapse identically
2121
#' named samples to one. (By default: \code{collapse_samples = FALSE})
2222
#'
23+
#' @param collapse_features A boolean value for selecting whether to collapse identically
24+
#' named features to one. Since all taxonomy information is taken into account,
25+
#' this concerns rownames-level (usually strain level) comparison. Often
26+
#' OTU or ASV level is just an arbitrary number series from sequencing machine
27+
#' meaning that the OTU information is not comparable between studies. With this
28+
#' option, it is possible to specify whether these strains are combined if their
29+
#' taxonomy information along with OTU number matches.
30+
#' (By default: \code{collapse_features = TRUE})
31+
#'
2332
#' @param verbose A single boolean value to choose whether to show messages.
2433
#' (By default: \code{verbose = TRUE})
2534
#'
@@ -140,7 +149,8 @@ setGeneric("mergeSEs", signature = c("x"),
140149
#' @export
141150
setMethod("mergeSEs", signature = c(x = "SimpleList"),
142151
function(x, assay_name = "counts", join = "full",
143-
missing_values = NA, collapse_samples = FALSE, verbose = TRUE,
152+
missing_values = NA, collapse_samples = FALSE,
153+
collapse_features = TRUE, verbose = TRUE,
144154
... ){
145155
################## Input check ##################
146156
# Check the objects
@@ -179,6 +189,11 @@ setMethod("mergeSEs", signature = c(x = "SimpleList"),
179189
stop("'collapse_samples' must be TRUE or FALSE.",
180190
call. = FALSE)
181191
}
192+
# Check collapse_samples
193+
if( !.is_a_bool(collapse_features) ){
194+
stop("'collapse_features' must be TRUE or FALSE.",
195+
call. = FALSE)
196+
}
182197
# Check verbose
183198
if( !.is_a_bool(verbose) ){
184199
stop("'verbose' must be TRUE or FALSE.",
@@ -191,8 +206,9 @@ setMethod("mergeSEs", signature = c(x = "SimpleList"),
191206
message("1/", length(x), appendLF = FALSE)
192207
}
193208
# Merge objects
194-
tse <- .merge_SEs(x, class, join, assay_name,
195-
missing_values, collapse_samples, verbose)
209+
tse <- .merge_SEs(
210+
x, class, join, assay_name, missing_values, collapse_samples,
211+
collapse_features, verbose)
196212
return(tse)
197213
}
198214
)
@@ -303,10 +319,13 @@ setMethod("right_join", signature = c(x = "ANY"),
303319
# Output: SE
304320

305321
#' @importFrom SingleCellExperiment SingleCellExperiment
306-
.merge_SEs <- function(x, class, join, assay_name,
307-
missing_values, collapse_samples, verbose){
322+
.merge_SEs <- function(
323+
x, class, join, assay_name, missing_values, collapse_samples,
324+
collapse_features, verbose){
308325
# Add rowData info to rownames
309-
x <- lapply(x, FUN = .add_rowdata_to_rownames)
326+
rownames_name <- "rownames_that_will_be_used_to_adjust_names"
327+
x <- lapply(x, FUN = .add_rowdata_to_rownames,
328+
rownames_name = rownames_name)
310329
# Take first element and remove it from the list
311330
tse <- x[[1]]
312331
x[[1]] <- NULL
@@ -347,7 +366,10 @@ setMethod("right_join", signature = c(x = "ANY"),
347366

348367
# Modify names if specified
349368
if( !collapse_samples ){
350-
temp <- .get_unique_sample_names(tse, temp, i+1)
369+
temp <- .get_unique_names(tse, temp, "col")
370+
}
371+
if( !collapse_features ){
372+
temp <- .get_unique_names(tse, temp, "row")
351373
}
352374
# Merge data
353375
args <- .merge_SummarizedExperiments(
@@ -387,7 +409,6 @@ setMethod("right_join", signature = c(x = "ANY"),
387409
tse <- .check_and_add_refSeqs(tse, refSeqs, verbose)
388410
}
389411
# Adjust rownames
390-
rownames_name <- "rownames_that_will_be_used_to_adjust_names"
391412
rownames(tse) <- rowData(tse)[[rownames_name]]
392413
rowData(tse)[[rownames_name]] <- NULL
393414
return(tse)
@@ -397,11 +418,10 @@ setMethod("right_join", signature = c(x = "ANY"),
397418
# This function adds taxonomy information to rownames to enable more specific match
398419
# between rows
399420

400-
# Input: (Tree)SE
421+
# Input: (Tree)SE, name of the column that is being added to rowData
401422
# Output: (Tree)SE with rownames that include all taxonomy information
402-
.add_rowdata_to_rownames <- function(x){
423+
.add_rowdata_to_rownames <- function(x, rownames_name, ...){
403424
# Add rownames to rowData
404-
rownames_name <- "rownames_that_will_be_used_to_adjust_names"
405425
rowData(x)[[rownames_name]] <- rownames(x)
406426
# Get rowData
407427
rd <- rowData(x)
@@ -461,7 +481,7 @@ setMethod("right_join", signature = c(x = "ANY"),
461481
temp_seqs <- do.call(c, temp_seqs)
462482
# Get only those taxa that are included in TreeSE
463483
temp_seqs <- temp_seqs[ match(rownames(tse), names(temp_seqs)), ]
464-
# Add combined ssequences into a list
484+
# Add combined sequences into a list
465485
result_list <- c(result_list, temp_seqs)
466486
}
467487
# Create a DNAStrinSetList if there are more than one element
@@ -499,63 +519,33 @@ setMethod("right_join", signature = c(x = "ANY"),
499519
}
500520
# All rownames/colnames should be included in trees/links
501521
if( !all(names %in% links[["names"]]) || is.null(names) ){
502-
warning(MARGIN, "Tree(s) does not match with the data so it is discarded.",
503-
call. = FALSE)
522+
warning(MARGIN, "Tree(s) does not match with the data so it ",
523+
"is discarded.", call. = FALSE)
504524
return(tse)
505525
}
506526

507-
# If there are multiple trees, select non-duplicated trees, best fitting
508-
# combination of trees. Get minimum number of trees that represent the data
509-
# based on link data.
527+
# If there are multiple trees, select non-duplicated trees; the largest
528+
# take the precedence, remove duplicated rowlinks --> each row is presented
529+
# in the set only once --> remove trees that do not have any values anymore.
510530
if( length(trees) > 1 ){
511-
# From the links, for each tree, get row/cols that are linked with tree
512-
tree_labs <- split(links[["nodeLab"]], f = links$whichTree)
513-
514-
# Loop thorugh tree labs, check which trees include which node labs
515-
result <- lapply(tree_labs, FUN = function(x){
516-
c( links[["nodeLab"]] %in% x )
517-
})
518-
# Create a data.frame
519-
result <- as.data.frame(result)
520-
521-
# Loop from 1 to number of trees
522-
for( i in seq_len(ncol(result)) ){
523-
# Create all possible combinations from trees, each combination has i trees.
524-
combinations <- combn(result, i, simplify = FALSE)
525-
# Does this combination have all the node labels (rows or columns)
526-
res <- lapply(combinations, FUN = function(x){
527-
all( rowSums(x) > 0 )
528-
})
529-
# Unlist the list of boolean values
530-
res <- unlist(res)
531-
# If combination that includes all the rows/cols was found
532-
if( any(res) ){
533-
# Take the first combination that have all the rows/cols
534-
combinations <- combinations[[which(res)[[1]]]]
535-
# Take the names of trees
536-
tree_names <- colnames(combinations)
537-
# Break so that for loop is not continued anymore
538-
break
539-
}
540-
}
541-
# Get the trees that are included in the final combination
542-
trees <- trees[tree_names]
543-
# Subset result by taking only those trees that are included in final object
544-
result <- result[ , tree_names, drop = FALSE]
545-
# In which tree this node label is found (each row represent each node label)
546-
whichTree <- apply(result, 1, FUN = function(x){
547-
names(result)[x == TRUE][[1]]
548-
}
549-
)
550-
whichTree <- unlist(whichTree)
551-
# Update links
552-
links[["whichTree"]] <- whichTree
553-
# Remove duplicates
554-
links <- links[ !duplicated(links[["names"]]), ]
555-
# Ensure that links are in correct order
556-
links <- links[ match(names, links[["names"]]), ]
531+
# Sort trees --> trees with highest number of taxa first
532+
max_trees <- table(links$whichTree)
533+
max_trees <- names(max_trees)[order(max_trees, decreasing = TRUE)]
534+
# Order the link data frame, take largest trees first
535+
links$whichTree <- factor(links$whichTree, levels = max_trees)
536+
links <- links[order(links$whichTree), ]
537+
# Remove factorization
538+
links$whichTree <- unfactor(links$whichTree)
539+
# Remove duplicated links
540+
links <- links[!duplicated(links$names), ]
541+
# Subset trees
542+
trees <- trees[unique(links$whichTree)]
557543
}
558544

545+
# Order the data to match created TreeSE
546+
links <- links[rownames(tse), ]
547+
trees <- trees[unique(links$whichTree)]
548+
559549
# Create a LinkDataFrame based on the link data
560550
links <- LinkDataFrame(
561551
nodeLab = links[["nodeLab"]],
@@ -838,21 +828,36 @@ setMethod("right_join", signature = c(x = "ANY"),
838828
)
839829
}
840830

841-
########################### .get_unique_sample_names ###########################
831+
########################### ..get_unique_names ###########################
842832
# This function convert colnames unique
843833

844-
# Input: TreeSEs
834+
# Input: TreeSEs and MARGIN
845835
# Output: One TreeSE with unique sample names compared to other TreeSE
846-
.get_unique_sample_names <- function(tse1, tse2, iteration){
847-
# Get indices of those sample names that match
848-
ind <- colnames(tse2) %in% colnames(tse1)
849-
# Get duplicated sample names
850-
duplicated_colnames <- colnames(tse2)[ind]
851-
if( length(duplicated_colnames) > 0 ) {
852-
# Add the number of object to duplicated sample names
853-
duplicated_colnames <- paste0(duplicated_colnames, "_", iteration)
854-
# Add new sample names to the tse object
855-
colnames(tse2)[ind] <- duplicated_colnames
836+
.get_unique_names <- function(tse1, tse2, MARGIN, suffix=2){
837+
# Based on MARGIN, get right names
838+
if( MARGIN == "row" ){
839+
names1 <- rownames(tse1)
840+
names2 <- rownames(tse2)
841+
} else{
842+
names1 <- colnames(tse1)
843+
names2 <- colnames(tse2)
844+
}
845+
# If there are duplicated names
846+
if( any(names2 %in% names1) ){
847+
# Get duplicated names
848+
ind <- names2 %in% names1
849+
temp_names2 <- names2[ind]
850+
# Get unique suffix
851+
while( any(paste0(names2, ".", suffix) %in% names1) ){
852+
suffix <- suffix + 1
853+
}
854+
temp_names2 <- paste0(temp_names2, ".", suffix)
855+
# Assign names back
856+
if( MARGIN == "row" ){
857+
rownames(tse2)[ind] <- temp_names2
858+
} else{
859+
colnames(tse2)[ind] <- temp_names2
860+
}
856861
}
857862
return(tse2)
858863
}

man/mergeSEs.Rd

+10
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-2mergeSEs.R

+4-4
Original file line numberDiff line numberDiff line change
@@ -331,13 +331,13 @@ test_that("mergeSEs", {
331331
tse_test <- mergeSEs(x = list(tse[1:28, 1:3], tse[23, 1:5], tse[1, 1:10]),
332332
join = "full")
333333
expect_equal( dim(tse_test), c(28, 18))
334-
expect_true( (all( c( paste0(rep(colnames(tse[, 1:3]), each=3), c("", "_2", "_3")),
335-
paste0(rep(colnames(tse[, 4:5]), each=2), c("", "_3")),
334+
expect_true( (all( c( paste0(rep(colnames(tse[, 1:3]), each=3), c("", ".2", ".3")),
335+
paste0(rep(colnames(tse[, 4:5]), each=2), c("", ".3")),
336336
colnames(tse[, 6:10]) ) %in%
337337
colnames(tse_test) ) &&
338338
all( colnames(tse_test) %in%
339-
c( paste0(rep(colnames(tse[, 1:3]), each=3), c("", "_2", "_3")),
340-
paste0(rep(colnames(tse[, 4:5]), each=2), c("", "_3")),
339+
c( paste0(rep(colnames(tse[, 1:3]), each=3), c("", ".2", ".3")),
340+
paste0(rep(colnames(tse[, 4:5]), each=2), c("", ".3")),
341341
colnames(tse[, 6:10]) ) ) )
342342
)
343343
# Test that tree is added after agglomeration

0 commit comments

Comments
 (0)