forked from microbiome/mia
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaddNMF.R
171 lines (168 loc) · 5.97 KB
/
addNMF.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#' Non-negative Matrix Factorization
#'
#' These functions perform Non-negative Matrix Factorization on data stored in a
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
#' object.
#'
#' @param x a
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
#' object.
#'
#' @param k \code{numeric vector}. A number of latent vectors/topics.
#' (Default: \code{2})
#'
#' @param name \code{Character scalar}. The name to be used to store the result
#' in the reducedDims of the output. (Default: \code{"NMF"})
#'
#' @param assay.type \code{Character scalar}. Specifies which assay to use for
#' NMF ordination. (Default: \code{"counts"})
#'
#' @param eval.metric \code{Character scalar}. Specifies the evaluation metric
#' that will be used to select the model with the best fit. Must be one of the
#' following options: \code{"evar"} (explained variance; maximized),
#' \code{"sparseness.basis"} (degree of sparsity in the basis matrix;
#' maximized), \code{"sparseness.coef"} (degree of sparsity in the coefficient
#' matrix; maximized), \code{"rss"} (residual sum of squares; minimized),
#' \code{"silhouette.coef"} (quality of clustering based on the coefficient
#' matrix; maximized), \code{"silhouette.basis"} (quality of clustering based
#' on the basis matrix; maximized), \code{"cophenetic"} (correlation between
#' cophenetic distances and original distances; maximized), \code{"dispersion"}
#' (spread of data points within clusters; minimized). (Default: \code{"evar"})
#'
#' @param ... optional arguments passed to \code{nmf::NMF}.
#'
#' @return
#' For \code{getNMF}, the ordination matrix with feature loadings matrix
#' as attribute \code{"loadings"}.
#'
#' For \code{addNMF}, a
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
#' object is returned containing the ordination matrix in
#' \code{reducedDims(x, name)} with the following attributes:
#' \itemize{
#' \item "loadings" which is a matrix containing the feature loadings
#' \item "NMF_output" which is the output of function \code{nmf::NMF}
#' \item "best_fit" which is the result of the best fit if k is a vector of
#' integers
#' }
#'
#' @details
#' The functions \code{getNMF} and \code{addNMF} internally use \code{nmf::NMF}
#' compute the ordination matrix and
#' feature loadings.
#'
#' If k is a vector of integers, NMF output is calculated for all the rank
#' values contained in k, and the best fit is selected based on
#' \code{eval.metric} value.
#'
#' @name addNMF
#'
#' @examples
#' data(GlobalPatterns)
#' tse <- GlobalPatterns
#'
#' # Reduce the number of features
#' tse <- agglomerateByPrevalence(tse, rank = "Phylum")
#'
#' # Run NMF and add the result to reducedDim(tse, "NMF").
#' tse <- addNMF(tse, k = 2, name = "NMF")
#'
#' # Extract feature loadings
#' loadings_NMF <- attr(reducedDim(tse, "NMF"), "loadings")
#' head(loadings_NMF)
#'
#' # Estimate models with number of topics from 2 to 4. Perform 2 runs.
#' tse <- addNMF(tse, k = c(2, 3, 4), name = "NMF_4", nrun = 2)
#'
#' # Extract feature loadings
#' loadings_NMF_4 <- attr(reducedDim(tse, "NMF_4"), "loadings")
#' head(loadings_NMF_4)
#'
NULL
#' @export
#' @rdname addNMF
setMethod("getNMF", "SummarizedExperiment",
function(
x, k = 2, assay.type = "counts", eval.metric = "evar", ...){
.require_package("NMF")
# Both NMF and DelayedArray have method seed(). When running
# NMF::nmf() an error occurs due to wrong method. That is why NMF
# is first loaded into the session.
if( "NMF" %in% (.packages()) ){
detach("package:NMF", unload = TRUE)
}
library("NMF")
.check_assay_present(assay.type, x)
# Calculate NMF ordination
mat <- t(assay(x, assay.type))
res <- NMF::nmf(mat, k, ...)
# Check oif the output includes multiple ordination with different k values.
# If it includes multiple, get the best fit based on certain evaluation
# metric.
if( is(res, "NMF.rank") ){
best_fit <- .get_best_nmf_fit(res, eval.metric)
} else{
best_fit <- res
}
# Get scores and loadings, add loadings and NMF output to attributes of
# scores
scores <- best_fit@fit@W
loadings <- best_fit@fit@H
attr(scores, "loadings") <- t(loadings)
attr(scores, "NMF_output") <- res
# Add best fit if multiple k values
if( is(res, "NMF.rank") ){
attr(scores, "best_fit") <- best_fit
}
# The NMF package is unloaded
detach("package:NMF", unload = TRUE)
# Return scores with loadings, metrics and model as attribute
return(scores)
}
)
#' @export
#' @rdname addNMF
setMethod("addNMF", "SummarizedExperiment",
function(
x, k = 2, assay.type = "counts", eval.metric = "evar", name = "NMF",
...){
# Input checks
if( !.is_a_string(name) ){
stop("'name' must be a non-empty single character value.",
call. = FALSE)
}
# Fit the model
nmf <- getNMF(x, k = k, assay.type = assay.type, ...)
# Add scores matrix with loadings as attribute to reducedDims
x <- .add_values_to_reducedDims(x, name = name, values = nmf)
return(x)
}
)
################################ HELP FUNCTIONS ################################
# This function is for evaluating a fit of NMF models
.get_best_nmf_fit <- function(res, eval.metric){
# Get whether the metric is maximized or minimized
maximize <- c(
"sparseness.basis" = TRUE,
"sparseness.coef" = TRUE,
"rss" = FALSE,
"evar" = TRUE,
"silhouette.coef" = TRUE,
"silhouette.basis" = TRUE,
"cophenetic" = TRUE,
"dispersion" = FALSE
)
maximize <- maximize[ eval.metric ]
if( maximize ){
FUN <- which.max
} else{
FUN <- which.min
}
# Get the index of best fit
measures <- res[["measures"]]
values <- measures[[eval.metric]]
ind <- FUN(values)
# Get the model of best fit
model <- res[["fit"]][[ind]]
return(model)
}