Skip to content

Commit 5fe8e0d

Browse files
authored
mergeSEs fixes (microbiome#327)
* do not combine variables with different class * Merge muötiple assays * up * up * up
1 parent 14783bf commit 5fe8e0d

File tree

5 files changed

+142
-31
lines changed

5 files changed

+142
-31
lines changed

DESCRIPTION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: mia
22
Type: Package
3-
Version: 1.7.2
3+
Version: 1.7.3
44
Authors@R:
55
c(person(given = "Felix G.M.", family = "Ernst", role = c("aut"),
66
email = "[email protected]",

NEWS

+2
Original file line numberDiff line numberDiff line change
@@ -61,3 +61,5 @@ Changes in version 1.5.x
6161
Changes in version 1.7.x
6262
+ makePhyloseqFromTreeSE: added option for choosing a tree from multiple rowTrees
6363
+ mergeSEs: match rows based on all available taxonomy level data on rowData
64+
+ mergeSEs: fix bug related to equally named variables that are different class
65+
+ mergeSEs: option for merging multiple assays

R/mergeSEs.R

+88-29
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#' @param y a \code{\link{SummarizedExperiment}} object when \code{x} is a
77
#' \code{\link{SummarizedExperiment}} object. Disabled when \code{x} is a list.
88
#'
9-
#' @param assay_name A single character value for selecting the
9+
#' @param assay_name A character value for selecting the
1010
#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{assay}}
1111
#' to be merged. (By default: \code{assay_name = "counts"})
1212
#'
@@ -119,6 +119,11 @@
119119
#' collapse_samples = TRUE)
120120
#' tse_temp
121121
#'
122+
#' # Merge all available assays
123+
#' tse <- relAbundanceCounts(tse)
124+
#' ts1 <- relAbundanceCounts(tse1)
125+
#' tse_temp <- mergeSEs(tse, tse1, assay_name = assayNames(tse))
126+
#'
122127
NULL
123128

124129
################################### Generic ####################################
@@ -140,9 +145,9 @@ setMethod("mergeSEs", signature = c(x = "SimpleList"),
140145
################## Input check ##################
141146
# Check the objects
142147
class <- .check_objects_and_give_class(x)
143-
# Can the assay_name the found form all the objects
144-
assay_name_bool <- .assays_cannot_be_found(assay_name = assay_name, x)
145-
if( any(assay_name_bool) ){
148+
# CHeck which assays can be found, and if any --> FALSE
149+
assay_name <- .assays_cannot_be_found(assay_name = assay_name, x)
150+
if( .is_a_bool(assay_name) && assay_name == FALSE ){
146151
stop("'assay_name' must specify an assay from assays. 'assay_name' ",
147152
"cannot be found at least in one SE object.",
148153
call. = FALSE)
@@ -704,9 +709,7 @@ setMethod("right_join", signature = c(x = "ANY"),
704709
# Remove all information but rowData, colData, metadata and assay
705710
row_data <- rowData(tse)
706711
col_data <- colData(tse)
707-
assay <- assay(tse, assay_name)
708-
assays <- SimpleList(name = assay)
709-
names(assays) <- assay_name
712+
assays <- assays(tse)[ assay_name ]
710713
metadata <- metadata(tse)
711714
# Create a list of arguments
712715
args <- list(assays = assays,
@@ -784,34 +787,53 @@ setMethod("right_join", signature = c(x = "ANY"),
784787

785788
}
786789
########################### .assays_cannot_be_found ############################
787-
# This function checks that the assay can be found from TreeSE objects of a list.
790+
# This function checks that the assay(s) can be found from TreeSE objects of a list.
788791

789792
# Input: the name of the assay and a list of TreeSE objects
790-
# Output: A list of boolean values
793+
# Output: A list of assay_names that can be found or FALSE if any
791794
.assays_cannot_be_found <- function(assay_name, x){
792-
# Check if the assay_name can be found. If yes, then FALSE. If not, then TRUE
793-
list <- lapply(x, .assay_cannot_be_found, assay_name = assay_name)
794-
# Unlist the list
795-
result <- unlist(list)
796-
return(result)
795+
# Loop through objects
796+
assays <- lapply(x, FUN = function(tse){
797+
# Check if the assay_names can be found. If yes, then TRUE. If not, then FALSE
798+
temp <- lapply(assay_name, .assay_cannot_be_found, tse = tse)
799+
# Unlist and return
800+
return( unlist(temp) )
801+
})
802+
# Create a data.frame from the result
803+
assays <- as.data.frame(assays, row.names = assay_name)
804+
colnames(assays) <- paste0("tse", seq_len(length(assays)))
805+
# Which assays can be found from all the objects?
806+
assays <- rownames(assays)[ rowSums(assays) == ncol(assays) ]
807+
# If none of assays were found, return FALSE
808+
if( length(assays) == 0 ){
809+
assays <- FALSE
810+
}
811+
# Give warning if assays were dropped
812+
if( length(assays) < length(assay_name) ){
813+
warning("The following assay(s) was not found from all the objects ",
814+
"so it is dropped from the output: ",
815+
paste0("'", setdiff(assay_name, assays), sep = "'", collapse = ", "),
816+
call. = FALSE)
817+
}
818+
return(assays)
797819
}
798820

799821
############################ .assay_cannot_be_found #############################
800-
# This function checks that the assay can be found from TreeSE. If it cannot be found
801-
# --> TRUE, if it can be found --> FALSE
822+
# This function checks that the assay can be found from TreeSE. If it can be found
823+
# --> TRUE, if it cannot be found --> FALSE
802824

803825
# Input: the name of the assay and TreSE object
804826
# Output: TRUE or FALSE
805827
.assay_cannot_be_found <- function(assay_name, tse){
806-
# Check if the assay_name can be found. If yes, then FALSE. If not, then TRUE
828+
# Check if the assay_name can be found. If yes, then TRUE. If not, then FALSE
807829
tryCatch(
808830
{
809831
.check_assay_present(assay_name, tse)
810-
return(FALSE)
832+
return(TRUE)
811833

812834
},
813835
error = function(cond) {
814-
return(TRUE)
836+
return(FALSE)
815837
}
816838
)
817839
}
@@ -850,9 +872,12 @@ setMethod("right_join", signature = c(x = "ANY"),
850872
rowdata <- .merge_rowdata(tse1, tse2, join)
851873
# Merge colData
852874
coldata <- .merge_coldata(tse1, tse2, join)
853-
# Merge assay
854-
assay <- .merge_assay(tse1, tse2, assay_name, join, missing_values, rowdata, coldata)
855-
assays <- SimpleList(name = assay)
875+
# Merge assays
876+
assays <- lapply(assay_name, .merge_assay,
877+
tse1 = tse1, tse2 = tse2,
878+
join = join, missing_values = missing_values,
879+
rd = rowdata, cd = coldata)
880+
assays <- SimpleList(assays)
856881
names(assays) <- assay_name
857882
# Combine metadata
858883
metadata <- c( metadata(tse1), metadata(tse2) )
@@ -997,12 +1022,16 @@ setMethod("right_join", signature = c(x = "ANY"),
9971022
matching_variables2 <- matching_variables2[ !is.na(matching_variables2) ]
9981023

9991024
# Make the matching variables unique
1000-
matching_variables_mod1 <- paste0(matching_variables1, "_X")
1001-
matching_variables_ids1 <- matching_variables_ids1[ !is.na(matching_variables_ids1) ]
1002-
colnames(df1)[ matching_variables_ids1 ] <- matching_variables_mod1
1003-
matching_variables_mod2 <- paste0(matching_variables2, "_Y")
1004-
matching_variables_ids2 <- matching_variables_ids2[ !is.na(matching_variables_ids2) ]
1005-
colnames(df2)[ matching_variables_ids2 ] <- matching_variables_mod2
1025+
if( length(matching_variables1) > 0 ){
1026+
matching_variables_mod1 <- paste0(matching_variables1, "_X")
1027+
matching_variables_ids1 <-
1028+
matching_variables_ids1[ !is.na(matching_variables_ids1) ]
1029+
colnames(df1)[ matching_variables_ids1 ] <- matching_variables_mod1
1030+
matching_variables_mod2 <- paste0(matching_variables2, "_Y")
1031+
matching_variables_ids2 <-
1032+
matching_variables_ids2[ !is.na(matching_variables_ids2) ]
1033+
colnames(df2)[ matching_variables_ids2 ] <- matching_variables_mod2
1034+
}
10061035

10071036
# Add rownames to one of the columns
10081037
df1$rownames_merge_ID <- rownames(df1)
@@ -1012,9 +1041,39 @@ setMethod("right_join", signature = c(x = "ANY"),
10121041
# Add rownames and remove additional column
10131042
rownames(df) <- df$rownames_merge_ID
10141043
df$rownames_merge_ID <- NULL
1015-
1044+
10161045
# Combine matching variables if found
10171046
if( length(matching_variables1) > 0 ){
1047+
# Get the class of each variable
1048+
class1 <- unlist(lapply(matching_variables_mod1, FUN = function(x){class(df[,x])}))
1049+
class2 <- unlist(lapply(matching_variables_mod2, FUN = function(x){class(df[,x])}))
1050+
# If there are mismatches in classes, variables are not matching
1051+
mismatch <- class1!=class2
1052+
if( any( mismatch) ){
1053+
# Loop through mismatches
1054+
for( i in which(mismatch) ){
1055+
# Givve warning that variables are renamed
1056+
warning("Datasets include equally named variables called '",
1057+
matching_variables1[i], "' but their class differ. \n",
1058+
"In the output, variables are not combined and they are ",
1059+
"renamed based on their class.",
1060+
call. = FALSE)
1061+
# Name variables based on their class
1062+
colnames(df)[ colnames(df) == matching_variables_mod1[i] ] <-
1063+
paste0(matching_variables1[i], "_", class1[i])
1064+
colnames(df)[ colnames(df) == matching_variables_mod2[i] ] <-
1065+
paste0(matching_variables2[i], "_", class2[i])
1066+
# Remove variable from matching list
1067+
matching_variables1 <- matching_variables1[-i]
1068+
matching_variables2 <- matching_variables2[-i]
1069+
matching_variables_mod1 <- matching_variables_mod1[-i]
1070+
matching_variables_mod2 <- matching_variables_mod2[-i]
1071+
}
1072+
}
1073+
}
1074+
# If there are still matching variables
1075+
if( length(matching_variables1) > 0 ){
1076+
# Loop over matching variables
10181077
for(i in 1:length(matching_variables1) ){
10191078
# Get columns
10201079
x <- matching_variables_mod1[i]

man/mergeSEs.Rd

+6-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-2mergeSEs.R

+45
Original file line numberDiff line numberDiff line change
@@ -362,4 +362,49 @@ test_that("mergeSEs", {
362362
rd_gb <- rowData(tse)[rownames(GlobalPatterns), ]
363363
expect_equal(rowData(esophagus), rd_esophagus[, colnames(rowData(esophagus))])
364364
expect_equal(rowData(GlobalPatterns), rd_gb[, colnames(rowData(GlobalPatterns))])
365+
366+
# Check that variables with different class are not combined
367+
tse1 <- esophagus
368+
tse2 <- GlobalPatterns
369+
tse3 <- GlobalPatterns[1:50, 1:10]
370+
# Create variables with different class
371+
colData(tse1)$group <- sample(c(1, 2, 3), ncol(tse1), replace = TRUE)
372+
colData(tse2)$group <- sample(c("Group1", "Group2", "Group3"), ncol(tse2),
373+
replace = TRUE)
374+
colData(tse3)$group <- as.factor(sample(c("Group1", "Group2", "Group3"),
375+
ncol(tse3), replace = TRUE))
376+
tse <- expect_warning(mergeSEs(list(tse1, tse2, tse3)))
377+
expect_true(ncol(colData(tse)) == length(unique(c( colnames(colData(tse1)),
378+
colnames(colData(tse2)),
379+
colnames(colData(tse3)))
380+
))+2)
381+
382+
# Check that multiple assays are supported
383+
tse1 <- relAbundanceCounts(tse1)
384+
tse2 <- relAbundanceCounts(tse2)
385+
tse3 <- relAbundanceCounts(tse3)
386+
387+
tse_temp <- expect_warning( mergeSEs(list(tse1, tse2, tse3),
388+
assay_name = c("counts",
389+
"relabundance"),
390+
join = "inner"))
391+
expect_equal(assayNames(tse_temp), c("counts", "relabundance"))
392+
tse_temp <- expect_warning(mergeSEs(list(tse1, tse2),
393+
assay_name = c("counts", "relabundance", "test"),
394+
join = "left"))
395+
expect_equal(assayNames(tse_temp), c("counts", "relabundance"))
396+
397+
# Test that reference sequences stay the same
398+
# Load data from miaTime package
399+
skip_if_not(require("miaTime", quietly = TRUE))
400+
data("SilvermanAGutData")
401+
tse <- SilvermanAGutData
402+
tse1 <- tse
403+
rownames(tse1) <- paste0("Taxon", 1:nrow(tse))
404+
# Merge
405+
tse2 <- mergeSEs(tse1, tse)
406+
# Test refseqs
407+
ref1 <- referenceSeq(tse)
408+
ref2 <- referenceSeq(tse2)[rownames(tse), ]
409+
expect_equal(ref1, ref2)
365410
})

0 commit comments

Comments
 (0)