20
20
# ' @param collapse_samples A boolean value for selecting whether to collapse identically
21
21
# ' named samples to one. (By default: \code{collapse_samples = FALSE})
22
22
# '
23
+ # ' @param collapse_features A boolean value for selecting whether to collapse identically
24
+ # ' named features to one. Since all taxonomy information is taken into account,
25
+ # ' this concerns rownames-level (usually strain level) comparison. Often
26
+ # ' OTU or ASV level is just an arbitrary number series from sequencing machine
27
+ # ' meaning that the OTU information is not comparable between studies. With this
28
+ # ' option, it is possible to specify whether these strains are combined if their
29
+ # ' taxonomy information along with OTU number matches.
30
+ # ' (By default: \code{collapse_features = TRUE})
31
+ # '
23
32
# ' @param verbose A single boolean value to choose whether to show messages.
24
33
# ' (By default: \code{verbose = TRUE})
25
34
# '
@@ -140,7 +149,8 @@ setGeneric("mergeSEs", signature = c("x"),
140
149
# ' @export
141
150
setMethod ("mergeSEs ", signature = c(x = "SimpleList"),
142
151
function (x , assay_name = " counts" , join = " full" ,
143
- missing_values = NA , collapse_samples = FALSE , verbose = TRUE ,
152
+ missing_values = NA , collapse_samples = FALSE ,
153
+ collapse_features = TRUE , verbose = TRUE ,
144
154
... ){
145
155
# ################# Input check ##################
146
156
# Check the objects
@@ -179,6 +189,11 @@ setMethod("mergeSEs", signature = c(x = "SimpleList"),
179
189
stop(" 'collapse_samples' must be TRUE or FALSE." ,
180
190
call. = FALSE )
181
191
}
192
+ # Check collapse_samples
193
+ if ( ! .is_a_bool(collapse_features ) ){
194
+ stop(" 'collapse_features' must be TRUE or FALSE." ,
195
+ call. = FALSE )
196
+ }
182
197
# Check verbose
183
198
if ( ! .is_a_bool(verbose ) ){
184
199
stop(" 'verbose' must be TRUE or FALSE." ,
@@ -191,8 +206,9 @@ setMethod("mergeSEs", signature = c(x = "SimpleList"),
191
206
message(" 1/" , length(x ), appendLF = FALSE )
192
207
}
193
208
# Merge objects
194
- tse <- .merge_SEs(x , class , join , assay_name ,
195
- missing_values , collapse_samples , verbose )
209
+ tse <- .merge_SEs(
210
+ x , class , join , assay_name , missing_values , collapse_samples ,
211
+ collapse_features , verbose )
196
212
return (tse )
197
213
}
198
214
)
@@ -303,10 +319,13 @@ setMethod("right_join", signature = c(x = "ANY"),
303
319
# Output: SE
304
320
305
321
# ' @importFrom SingleCellExperiment SingleCellExperiment
306
- .merge_SEs <- function (x , class , join , assay_name ,
307
- missing_values , collapse_samples , verbose ){
322
+ .merge_SEs <- function (
323
+ x , class , join , assay_name , missing_values , collapse_samples ,
324
+ collapse_features , verbose ){
308
325
# Add rowData info to rownames
309
- x <- lapply(x , FUN = .add_rowdata_to_rownames )
326
+ rownames_name <- " rownames_that_will_be_used_to_adjust_names"
327
+ x <- lapply(x , FUN = .add_rowdata_to_rownames ,
328
+ rownames_name = rownames_name )
310
329
# Take first element and remove it from the list
311
330
tse <- x [[1 ]]
312
331
x [[1 ]] <- NULL
@@ -347,7 +366,10 @@ setMethod("right_join", signature = c(x = "ANY"),
347
366
348
367
# Modify names if specified
349
368
if ( ! collapse_samples ){
350
- temp <- .get_unique_sample_names(tse , temp , i + 1 )
369
+ temp <- .get_unique_names(tse , temp , " col" )
370
+ }
371
+ if ( ! collapse_features ){
372
+ temp <- .get_unique_names(tse , temp , " row" )
351
373
}
352
374
# Merge data
353
375
args <- .merge_SummarizedExperiments(
@@ -387,7 +409,6 @@ setMethod("right_join", signature = c(x = "ANY"),
387
409
tse <- .check_and_add_refSeqs(tse , refSeqs , verbose )
388
410
}
389
411
# Adjust rownames
390
- rownames_name <- " rownames_that_will_be_used_to_adjust_names"
391
412
rownames(tse ) <- rowData(tse )[[rownames_name ]]
392
413
rowData(tse )[[rownames_name ]] <- NULL
393
414
return (tse )
@@ -397,11 +418,10 @@ setMethod("right_join", signature = c(x = "ANY"),
397
418
# This function adds taxonomy information to rownames to enable more specific match
398
419
# between rows
399
420
400
- # Input: (Tree)SE
421
+ # Input: (Tree)SE, name of the column that is being added to rowData
401
422
# Output: (Tree)SE with rownames that include all taxonomy information
402
- .add_rowdata_to_rownames <- function (x ){
423
+ .add_rowdata_to_rownames <- function (x , rownames_name , ... ){
403
424
# Add rownames to rowData
404
- rownames_name <- " rownames_that_will_be_used_to_adjust_names"
405
425
rowData(x )[[rownames_name ]] <- rownames(x )
406
426
# Get rowData
407
427
rd <- rowData(x )
@@ -461,7 +481,7 @@ setMethod("right_join", signature = c(x = "ANY"),
461
481
temp_seqs <- do.call(c , temp_seqs )
462
482
# Get only those taxa that are included in TreeSE
463
483
temp_seqs <- temp_seqs [ match(rownames(tse ), names(temp_seqs )), ]
464
- # Add combined ssequences into a list
484
+ # Add combined sequences into a list
465
485
result_list <- c(result_list , temp_seqs )
466
486
}
467
487
# Create a DNAStrinSetList if there are more than one element
@@ -499,63 +519,33 @@ setMethod("right_join", signature = c(x = "ANY"),
499
519
}
500
520
# All rownames/colnames should be included in trees/links
501
521
if ( ! all(names %in% links [[" names" ]]) || is.null(names ) ){
502
- warning(MARGIN , " Tree(s) does not match with the data so it is discarded. " ,
503
- call. = FALSE )
522
+ warning(MARGIN , " Tree(s) does not match with the data so it " ,
523
+ " is discarded. " , call. = FALSE )
504
524
return (tse )
505
525
}
506
526
507
- # If there are multiple trees, select non-duplicated trees, best fitting
508
- # combination of trees. Get minimum number of trees that represent the data
509
- # based on link data .
527
+ # If there are multiple trees, select non-duplicated trees; the largest
528
+ # take the precedence, remove duplicated rowlinks --> each row is presented
529
+ # in the set only once --> remove trees that do not have any values anymore .
510
530
if ( length(trees ) > 1 ){
511
- # From the links, for each tree, get row/cols that are linked with tree
512
- tree_labs <- split(links [[" nodeLab" ]], f = links $ whichTree )
513
-
514
- # Loop thorugh tree labs, check which trees include which node labs
515
- result <- lapply(tree_labs , FUN = function (x ){
516
- c( links [[" nodeLab" ]] %in% x )
517
- })
518
- # Create a data.frame
519
- result <- as.data.frame(result )
520
-
521
- # Loop from 1 to number of trees
522
- for ( i in seq_len(ncol(result )) ){
523
- # Create all possible combinations from trees, each combination has i trees.
524
- combinations <- combn(result , i , simplify = FALSE )
525
- # Does this combination have all the node labels (rows or columns)
526
- res <- lapply(combinations , FUN = function (x ){
527
- all( rowSums(x ) > 0 )
528
- })
529
- # Unlist the list of boolean values
530
- res <- unlist(res )
531
- # If combination that includes all the rows/cols was found
532
- if ( any(res ) ){
533
- # Take the first combination that have all the rows/cols
534
- combinations <- combinations [[which(res )[[1 ]]]]
535
- # Take the names of trees
536
- tree_names <- colnames(combinations )
537
- # Break so that for loop is not continued anymore
538
- break
539
- }
540
- }
541
- # Get the trees that are included in the final combination
542
- trees <- trees [tree_names ]
543
- # Subset result by taking only those trees that are included in final object
544
- result <- result [ , tree_names , drop = FALSE ]
545
- # In which tree this node label is found (each row represent each node label)
546
- whichTree <- apply(result , 1 , FUN = function (x ){
547
- names(result )[x == TRUE ][[1 ]]
548
- }
549
- )
550
- whichTree <- unlist(whichTree )
551
- # Update links
552
- links [[" whichTree" ]] <- whichTree
553
- # Remove duplicates
554
- links <- links [ ! duplicated(links [[" names" ]]), ]
555
- # Ensure that links are in correct order
556
- links <- links [ match(names , links [[" names" ]]), ]
531
+ # Sort trees --> trees with highest number of taxa first
532
+ max_trees <- table(links $ whichTree )
533
+ max_trees <- names(max_trees )[order(max_trees , decreasing = TRUE )]
534
+ # Order the link data frame, take largest trees first
535
+ links $ whichTree <- factor (links $ whichTree , levels = max_trees )
536
+ links <- links [order(links $ whichTree ), ]
537
+ # Remove factorization
538
+ links $ whichTree <- unfactor(links $ whichTree )
539
+ # Remove duplicated links
540
+ links <- links [! duplicated(links $ names ), ]
541
+ # Subset trees
542
+ trees <- trees [unique(links $ whichTree )]
557
543
}
558
544
545
+ # Order the data to match created TreeSE
546
+ links <- links [rownames(tse ), ]
547
+ trees <- trees [unique(links $ whichTree )]
548
+
559
549
# Create a LinkDataFrame based on the link data
560
550
links <- LinkDataFrame(
561
551
nodeLab = links [[" nodeLab" ]],
@@ -838,21 +828,36 @@ setMethod("right_join", signature = c(x = "ANY"),
838
828
)
839
829
}
840
830
841
- # ########################## .get_unique_sample_names ###########################
831
+ # ########################## ..get_unique_names ###########################
842
832
# This function convert colnames unique
843
833
844
- # Input: TreeSEs
834
+ # Input: TreeSEs and MARGIN
845
835
# Output: One TreeSE with unique sample names compared to other TreeSE
846
- .get_unique_sample_names <- function (tse1 , tse2 , iteration ){
847
- # Get indices of those sample names that match
848
- ind <- colnames(tse2 ) %in% colnames(tse1 )
849
- # Get duplicated sample names
850
- duplicated_colnames <- colnames(tse2 )[ind ]
851
- if ( length(duplicated_colnames ) > 0 ) {
852
- # Add the number of object to duplicated sample names
853
- duplicated_colnames <- paste0(duplicated_colnames , " _" , iteration )
854
- # Add new sample names to the tse object
855
- colnames(tse2 )[ind ] <- duplicated_colnames
836
+ .get_unique_names <- function (tse1 , tse2 , MARGIN , suffix = 2 ){
837
+ # Based on MARGIN, get right names
838
+ if ( MARGIN == " row" ){
839
+ names1 <- rownames(tse1 )
840
+ names2 <- rownames(tse2 )
841
+ } else {
842
+ names1 <- colnames(tse1 )
843
+ names2 <- colnames(tse2 )
844
+ }
845
+ # If there are duplicated names
846
+ if ( any(names2 %in% names1 ) ){
847
+ # Get duplicated names
848
+ ind <- names2 %in% names1
849
+ temp_names2 <- names2 [ind ]
850
+ # Get unique suffix
851
+ while ( any(paste0(names2 , " ." , suffix ) %in% names1 ) ){
852
+ suffix <- suffix + 1
853
+ }
854
+ temp_names2 <- paste0(temp_names2 , " ." , suffix )
855
+ # Assign names back
856
+ if ( MARGIN == " row" ){
857
+ rownames(tse2 )[ind ] <- temp_names2
858
+ } else {
859
+ colnames(tse2 )[ind ] <- temp_names2
860
+ }
856
861
}
857
862
return (tse2 )
858
863
}
0 commit comments