-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathmethods_matrix_lookup.R
348 lines (309 loc) · 11.3 KB
/
methods_matrix_lookup.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
# User facing -------------------------------------------------------------
#' Repeated Hi-C matrix lookup
#'
#' Analyses like aggregrate peak analysis (APA) and paired-end spatial chromatin
#' analysis (PE-SCAn) require to look up regions of a Hi-C matrix repeatedly.
#' This function acts as a common method for the lookup and summary of such
#' repeated lookups for these analyses.
#'
#' @inheritParams PESCAn
#' @param anchors A \code{matrix} with two columns containing pre-computed
#' anchor indices. See the \code{\link[GENOVA]{anchors}} documentation.
#' @param rel_pos An \code{integer} vector indicating relative positions in
#' bins, indicating ranges around anchors to lookup.
#'
#' @return A \code{list} of the same length as \code{explist} wherein list
#' elements contain the results of the repeated lookup per experiment.
#'
#' @details For each row in the \code{anchors} argument a region of the Hi-C
#' matrix is looked up corresponding to that anchor. This data is then
#' summarised by taking the mean of each position relative to the anchor
#' across all the anchors.
#'
#' Anchors are subject to a filtering step wherein anchors are discarded when
#' they are within \code{rel_pos} range of a chromosome start or end. This
#' ensures the anchors all report data from the same chromosome in the x- or
#' y-direction.
#'
#' For shifted anchors, an attempt is made to shift the anchors in the
#' opposite direction before they are discarded.
#'
#' When a region corresponding to a non-shifted anchor is looked up and is
#' found to have no contacts within that region, it is discarded. Shifted
#' regions are only discarded when the corresponding non-shifted anchor is
#' discarded.
#'
#' Anchors typically contain a '\code{type}' attribute which informs
#' \code{rep_mat_lookup} how the lookup should occur. The \code{APA},
#' \code{PESCAn} and \code{ARA} anchors look up regions of dimensions
#' \code{length(rel_pos)} x \code{length(rel_pos)}. The \code{ARA} anchors
#' transpose these square regions when given a '\code{-}' direction before
#' summary occurs. The \code{ATA} anchors look up \code{anchors[, 2] -
#' anchors[, 1]} sized square regions and resizes these to a
#' \code{max(rel_pos)} square region through bilinear interpolation before
#' summary.
#'
#' @section Resolution recommendation: 10kb-40kb
#'
#' @export
#'
#' @family aggregate repeated matrix lookup analyses
rep_mat_lookup <- function(explist, anchors, rel_pos, shift = 0,
outlier_filter = c(0, 1), raw = FALSE) {
# Finish off anchors
anchors <- anchors_finish(explist[[1]]$IDX, anchors, rel_pos, shift)
anch_id <- eval(attr(anchors, "anch_id"))
shft_id <- eval(attr(anchors, "shft_id"))
dnames <- format(rel_pos * attr(explist[[1]], "res"), trim = TRUE)
rawnames <- if (is.null(rownames(anchors))) {
paste0(anchors[anch_id, 1], ";", anchors[anch_id, 2])
} else {
rownames(anchors)
}
group <- if ("group" %in% names(attributes(anchors))) {
inverse.rle(attr(anchors, "group"))
} else {
NULL
}
# Set data.table core usage
dt.cores <- data.table::getDTthreads()
on.exit(data.table::setDTthreads(dt.cores))
data.table::setDTthreads(1)
# Decide on engine
run_engine <- switch(attr(anchors, "type"),
"TADs" = "engine_resizing",
"ARA" = "engine_flipping",
"engine_default")
run_engine <- getFromNamespace(run_engine, "GENOVA")
# Loop over experiments, perform matrix lookup
results <- lapply(seq_along(explist), function(i) {
# Lookup matrices
master <- run_engine(explist[[i]]$MAT, anchors, rel_pos)
arr <- master[anch_id,,, drop = FALSE]
mat_mu <- summarise_lookup(arr, outlier_filter,
group = group[anch_id])
dimnames(mat_mu$mat) <- list(
rev(dnames), dnames, unique(group[anch_id])
)[seq_along(dim(mat_mu$mat))]
if (raw) {
dimnames(arr) <- list(rawnames[anch_id], rev(dnames), dnames)
arr <- arr[mat_mu$keep, , , drop = FALSE]
attr(arr, "group") <- group[anch_id][mat_mu$keep]
} else {
arr <- NULL
}
# Calculate shifted values
if (shift > 0) {
# shifted_arr <- run_engine(explist[[i]]$ICE, shift_anchors, rel_pos)
shifted_arr <- master[shft_id,,, drop = FALSE]
shifted_mu <- summarise_lookup(shifted_arr,
outlier_filter,
keep = mat_mu$keep,
group = group[shft_id]
)$mat
dimnames(shifted_mu) <- dimnames(mat_mu$mat)
if (raw) {
shifted_arr <- shifted_arr[mat_mu$keep, , , drop = FALSE]
dimnames(shifted_arr) <- dimnames(arr)
attr(shifted_arr, "group") <- group[shft_id][mat_mu$keep]
} else {
shifted_arr <- NULL
}
} else {
shifted_arr <- NULL
shifted_mu <- NULL
}
# Calculate observed over expected
obsexp <- if (!is.null(shifted_mu)) {
mat_mu$mat / median(shifted_mu, na.rm = TRUE)
} else {
NULL
}
results <- list(
obsexp = obsexp,
signal = mat_mu$mat,
signal_raw = arr,
shifted = shifted_mu,
shifted_raw = shifted_arr
)
null_results <- vapply(results, is.null, logical(1))
return(
results[!null_results]
)
})
# Format experiment names
names(results) <- if (is.null(names(explist))) {
vapply(explist, function(exp) {
attr(exp, "samplename")
}, character(1L))
} else {
names(explist)
}
merge_res <- names(results[[1]])
results <- lapply(merge_res, function(res) {
resname <- res
objects <- lapply(results, `[[`, res)
# Return array if rawdata
if (resname %in% c("shifted_raw", "signal_raw")) {
return(objects)
}
# Simplify matrix to array
newobject <- do.call(c, objects)
dim(newobject) <- c(dim(objects[[1]]),
length(results))
dimnames(newobject) <- c(dimnames(objects[[1]]),
list(names(results)))
newobject
})
names(results) <- merge_res
results
}
# Internals --------------------------------------------------------
#' Engine for repeated matrix lookups
#'
#' It looks up square parts of equal size in the Hi-C matrix of a single experiment.
#'
#' @param MAT The Hi-C matrix slot of a GENOVA experiment object.
#' @inheritParams rep_mat_lookup
#'
#' @return A threedimensional \code{array} wherein the first dimension is
#' parallel to the rows in \code{anchors}, and the second and third dimensions
#' are of length \code{m}.
#'
#' @seealso \code{\link[GENOVA]{rep_mat_lookup}}
#'
#' @keywords internal
engine_default <- function(MAT, anchors, rel_pos) {
class(anchors) <- "matrix"
# Setup indices
grid <- expand.grid(rel_pos, rel_pos)
idx <- expand.grid(seq_len(nrow(anchors)), seq_len(nrow(grid)))
idx <- list(grid[idx$Var2, 1] + anchors[idx$Var1, 1],
grid[idx$Var2, 2] + anchors[idx$Var1, 2])
# Get data
mats <- MAT[idx, V3]
dim(mats) <- c(nrow(anchors), length(rel_pos)[c(1, 1)])
mats
}
#' @keywords internal
engine_flipping <- function(MAT, anchors, rel_pos) {
mats <- engine_default(MAT = MAT, anchors = anchors, rel_pos = rel_pos)
# Split off reverse orientation
dir <- inverse.rle(attr(anchors, "dir"))
dir <- which(dir == "-")
seq <- seq_along(rel_pos)
mats[dir, seq, seq] <- aperm(mats[dir, rev(seq), rev(seq)], c(1,3,2))
attr(mats, "dir") <- attr(anchors, "dir")
mats
}
#' Engine for repeated matrix lookup and resizing
#'
#' It looks up square parts of unequal size around the diagonal of a single Hi-C
#' experiment and resizes these to a single size.
#'
#' @param ICE The Hi-C matrix slot of a GENOVA experiment object.
#' @param anchors A \code{matrix} with two columns containing pre-computed
#' indices between which a square is looked up.
#' @param rel_pos A \code{integer} sequence ranging from \code{[0-n]} wherein
#' \code{n} is the target size to which individual squares are resized.
#'
#' \code{anchors} specify two points on the diagonal between which the region
#' should be looked up.
#'
#' @return A threedimensional \code{array} wherein the first dimension is
#' parallel to the rows in \code{anchors}, and the second and third dimensions
#' are of length \code{m}.
#'
#' @seealso \code{\link[GENOVA]{rep_mat_lookup}}
#'
#' @keywords internal
engine_resizing <- function(MAT, anchors, rel_pos) {
template <- matrix(0, tail(rel_pos, 1), tail(rel_pos, 1))
coords <- as.matrix(expand.grid(rel_pos, rel_pos))
max_pos <- max(rel_pos)
# Array transpose to match rep_mat_lookup expectation
base::aperm(vapply(seq_len(nrow(anchors)), function(j) {
# Prep parameters
i <- seq.int(anchors[j, 1], anchors[j, 2])
len <- length(i)
seqs <- as.double(seq.int(len))
min <- i[1] - 1
# Lookup matrix
mat <- MAT[CJ(V1 = i, V2 = i),
dt_matrix(V3, V1, V2, len, min),
nomatch = NULL]
# Interpolate coordinates
xy <- (coords - 1) * (len - 1) / (max_pos - 1) + 1
x <- xy[, 1]
y <- xy[, 2]
# Match to current coordinates
dx <- x - {flx <- floor(x)}
dy <- y - {fly <- floor(y)}
# Mellow out the extremes
dx[flx == len] <- 1
dy[fly == len] <- 1
flx[flx == len] <- len - 1
fly[fly == len] <- len - 1
# Interpolate values
template[coords] <-
mat[cbind(flx, fly)] * (1 - dx) * (1 - dy) +
mat[cbind(flx + 1, fly)] * dx * (1 - dy) +
mat[cbind(flx, fly + 1)] * (1 - dx) * dy +
mat[cbind(flx + 1, fly + 1)] * dx * dy
template
}, template), c(3, 1, 2))
}
#' Summarise the results of a repeated matrix lookup
#'
#' This function either filters out all the array slices that are solely
#' composed of \code{NA}s or those specified. It sets all other \code{NA}s to
#' \code{0} and then takes the mean along the first dimension of a
#' threedimensional array.
#'
#' @inheritParams APA
#' @param array A threedimensional \code{array}.
#' @param keep A \code{logical} vector of length \code{dim(array)[1]} indicated
#' which slices to keep. If \code{NULL}, independently filters out
#' all-\code{NA} slices.
#'
#' @return A \code{list} of length 2 with a \code{matrix} giving mean values of
#' \code{array} along the first dimension and the \code{keep} as used in the
#' calculation of the means.
#'
#' @seealso \code{\link[GENOVA]{rep_mat_lookup}}
#'
#' @keywords internal
summarise_lookup <- function(array, outlier_filter = c(0, 1),
keep = NULL, group = NULL) {
if (is.null(keep)) {
keep <- rowSums(array, na.rm = TRUE) != 0
}
if (is.null(group)) {
group <- rep.int(1, dim(array)[1])
}
group <- group[keep]
grouplvls <- unique(group)
# Remove all-NA slices, set other NAs to 0
array <- array[keep, , , drop = FALSE]
array[is.na(array)] <- 0
# Do outlier filtering
if (!identical(outlier_filter, c(0, 1))) {
outlier_filter <- sort(outlier_filter)
thres <- quantile(array, outlier_filter)
if (outlier_filter[2] != 1) {
array <- pmin(array, thres[2])
}
if (outlier_filter[1] != 0) {
array <- pmax(array, thres[1])
}
}
# Summarise
mat <- vapply(grouplvls, function(lvl) {
colMeans(array[group == lvl, , , drop = FALSE])
}, matrix(NA_real_, dim(array)[2], dim(array)[3]))
# Drop last dimension if empty
if (tail(dim(mat), 1) == 1L) {
dim(mat) <- head(dim(mat), -1)
}
return(list(mat = mat, keep = keep))
}