#' Import HUMAnN results to \code{TreeSummarizedExperiment}
#'
#' @param file \code{Character scalar}. Defines the file
#' path of the HUMAnN file. The file must be in merged HUMAnN format.
#'
#' @param col.data A DataFrame-like object whose rownames are the sample
#' names, or a single \code{character} value defining the file path of the
#' sample metadata file. The file must be in \code{tsv} format.
#' (Default: \code{NULL})
#'
#' @param colData Deprecated. Use \code{col.data} instead.
#'
#' @param ... additional arguments:
#' \itemize{
#' \item \code{assay.type}: \code{Character scalar}. Specifies the name of
#' the assay in the returned object. (Default: \code{"counts"})
#' \item \code{prefix.rm}: \code{Logical scalar}. Should
#' taxonomic prefixes be removed? (Default: \code{FALSE})
#' \item \code{remove.suffix}: \code{Logical scalar}. Should
#' suffixes of sample names be removed? The HUMAnN pipeline adds suffixes,
#' derived from file names, to sample names. With
#' \code{remove.suffix = TRUE}, the pattern shared by the end of all sample
#' names is removed. (Default: \code{FALSE})
#' }
#'
#' @details
#' Imports HUMAnN results (currently version 3.0 is supported) of functional
#' predictions based on metagenome composition, e.g., pathways or gene
#' families. The input must be in merged HUMAnN format. (See the
#' \href{https://github.com/biobakery/humann#humann_join_tables}{HUMAnN
#' documentation on \code{humann_join_tables}}.)
#'
#' The function parses gene/pathway information along with taxonomy
#' information from the input file. This information is stored in
#' \code{rowData}. Abundances are stored in \code{assays}.
#'
#' Usually the workflow also includes taxonomy data from MetaPhlAn. See
#' \link[=importMetaPhlAn]{importMetaPhlAn} to load that data into a
#' \code{TreeSE}.
#'
#' @return A
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
#' object
#'
#' @name importHUMAnN
#'
#' @seealso
#' \code{\link[=importMetaPhlAn]{importMetaPhlAn}}
#' \code{\link[=convertFromPhyloseq]{convertFromPhyloseq}}
#' \code{\link[=convertFromBIOM]{convertFromBIOM}}
#' \code{\link[=convertFromDADA2]{convertFromDADA2}}
#' \code{\link[=importQIIME2]{importQIIME2}}
#' \code{\link[=importMothur]{importMothur}}
#'
#' @export
#'
#' @references
#' Beghini F, McIver LJ, Blanco-Míguez A, Dubois L, Asnicar F, Maharjan S,
#' Mailyan A, Manghi P, Scholz M, Thomas AM, Valles-Colomer M, Weingart G,
#' Zhang Y, Zolfo M, Huttenhower C, Franzosa EA, & Segata N (2021)
#' Integrating taxonomic, functional, and strain-level profiling of diverse
#' microbial communities with bioBakery 3. \emph{eLife}. 10:e65088.
#'
#' @examples
#' # File path
#' file_path <- system.file("extdata", "humann_output.tsv", package = "mia")
#' # Import data
#' tse <- importHUMAnN(file_path)
#' tse
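#'
#' # The parsed gene/pathway identifiers and taxonomy end up in rowData, and
#' # the abundances in the assay named by 'assay.type' ("counts" by default).
#' # A quick look, assuming only the standard SummarizedExperiment accessors:
#' head(rowData(tse))
#' assayNames(tse)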
#'
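#' # Sample metadata can be attached at import time via 'col.data'. A minimal
#' # sketch that builds a placeholder data.frame keyed by the sample names of
#' # the object imported above (the 'group' column is purely illustrative):
#' cd <- data.frame(group = rep("A", ncol(tse)), row.names = colnames(tse))
#' tse_cd <- importHUMAnN(file_path, col.data = cd)
#' colData(tse_cd)
#'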
NULL
importHUMAnN <- function(file, col.data = colData, colData = NULL, ...){
    ################################ Input check ###############################
    if(!.is_non_empty_string(file)){
        stop("'file' must be a single character value.",
            call. = FALSE)
    }
    if (!file.exists(file)) {
        stop(file, " does not exist", call. = FALSE)
    }
    if(!is.null(col.data) &&
        !(.is_non_empty_string(col.data) || is.data.frame(col.data) ||
            is.matrix(col.data) || is(col.data, "DataFrame")) ){
        stop("'col.data' must be a single character value, DataFrame or NULL.",
            call. = FALSE)
    }
    ############################## Input check end #############################
    # HUMAnN files have these columns, which go to rowData
    rowdata_col <- c("Pathway", "Gene_Family")
    # Read the HUMAnN data
    data <- .read_humann(file, rowdata_col, ...)
    # Create TreeSE from the data
    tse <- .create_tse_from_humann(data, rowdata_col, ...)
    # Add col.data if provided
    if( !is.null(col.data) ){
        tse <- .add_coldata(tse, col.data)
    }
    return(tse)
}
################################ HELP FUNCTIONS ################################
# Read the HUMAnN file; catch errors and report them informatively
#' @importFrom utils read.delim
.read_humann <- function(file, rowdata_col, remove.suffix = FALSE, ...){
    ################################ Input check ###############################
    if(!.is_a_bool(remove.suffix)){
        stop("'remove.suffix' must be TRUE or FALSE.", call. = FALSE)
    }
    ############################## Input check end #############################
    # Read the table. Catch errors and give a more informative message
    table <- tryCatch(
        {
            read.delim(file, check.names = FALSE)
        },
        error = function(condition){
            stop("Cannot read the file: ", file,
                "\nPlease check that the file is in merged HUMAnN file ",
                "format.", call. = FALSE)
        }
    )
    # The first column name carries a "# " prefix; remove it
    colnames(table)[1] <- gsub("# ", "", colnames(table)[1])
    # Replace spaces with underscores
    colnames(table)[1] <- gsub(" ", "_", colnames(table)[1])
    # Remove a possible suffix from the colnames if the user has specified
    if( remove.suffix ){
        table <- .remove_suffix(table, rowdata_col)
    }
    # Add rownames
    rownames(table) <- table[, 1]
    # Check that the file is in the right format
    if( .check_metaphlan(table, rowdata_col) ){
        stop("Cannot read the file: ", file,
            "\nPlease check that the file is in merged HUMAnN file format.",
            call. = FALSE)
    }
    return(table)
}
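# In a merged HUMAnN table, the first column holds the feature identifier,
# optionally stratified by taxon after a "|" separator, e.g.
# "PWY-XXXX: some pathway|g__SomeGenus.s__SomeGenus_species" (identifiers
# here are illustrative). Unstratified rows have no "|" part. The parser
# below splits on "|", and the taxonomy string is later split on "." to
# separate the taxonomic levels.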
# This function parses the HUMAnN data and creates a TreeSE from it.
.create_tse_from_humann <- function(
        data, rowdata_col, assay.type = "counts", ...){
    # Get rowData columns
    rowdata_id <- unlist(lapply(rowdata_col, grep, colnames(data)))
    rowdata <- data[ , rowdata_id, drop = FALSE]
    # Get columns that go to assay
    assay <- data[ , -rowdata_id, drop = FALSE]
    # Parse rowData. The data includes gene/pathway info along with taxonomy
    # info. Separate the information into its own columns.
    rowdata_id <- unlist(lapply(rowdata_col, grep, colnames(rowdata)))
    # Get the column and split gene/pathway and taxonomy info
    rowdata_temp <- strsplit(rowdata[[1]], "\\|")
    # Are some rows missing taxonomy info? Get their indices.
    missing_taxa <- lengths(rowdata_temp) == 1
    # Create a data.frame from the list of split data.
    rowdata_temp <- data.frame(t(data.frame(rowdata_temp)))
    # Replace missing taxonomy info with NA
    rowdata_temp[missing_taxa, 2] <- NA
    # Replace column names
    colnames(rowdata_temp) <- c(colnames(rowdata), "Taxonomy")
    # Now the rowData includes gene/pathway info in one column and taxonomy
    # info in the other. Parse the taxonomy info so that species, genus, etc.
    # levels end up in their own columns.
    taxonomy <- .parse_taxonomy(
        rowdata_temp, column_name = "Taxonomy", sep = "\\.", ...)
    # Convert all data to DataFrame
    rowdata <- DataFrame(rowdata)
    rowdata_temp <- DataFrame(rowdata_temp)
    # Create rowData from the parsed information
    colnames(rowdata) <- paste0(colnames(rowdata), "_long")
    rowdata <- cbind(rowdata, rowdata_temp[, 1, drop = FALSE])
    rowdata <- cbind(rowdata, taxonomy)
    # Create assays
    assay <- as.matrix(assay)
    assays <- SimpleList(counts = assay)
    names(assays) <- assay.type
    # Create TreeSE
    tse <- TreeSummarizedExperiment(assays = assays, rowData = rowdata)
    return(tse)
}