-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathsync_indices.R
103 lines (91 loc) · 3.41 KB
/
sync_indices.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#' Synchronising indices across experiments.
#'
#' Particularly when Hi-C data is loaded from different sources, it occurs that
#' the assigned bin indices don't match between datasets. This function
#' re-indexes a list of experiments to a shared set of bin indices.
#'
#' @param explist A \code{list} of GENOVA \code{contacts} objects of the same
#' resolution.
#'
#' @details Chromosome naming conventions are expected to be equal among input,
#' e.g. all \code{contacts} objects use \code{"chr1"}, \code{"chr2"} etc. or
#' all \code{contacts} objects use \code{"1"}, \code{"2"} etc.
#'
#' The first \code{contacts} object is used as a template. This should only
#' matter when the different experiments have slightly different end-positions
#' at e.g. chromosome ends, in which case the ends of the first
#' \code{contacts} object is used..
#'
#' @return A \code{list} of GENOVA \code{contacts} objects
#' @export
#'
#' @examples
#' \dontrun{
#' # Data loaded from Hi-C Pro
#' exp1 <- load_contacts(signal_path = "exp1_10kb_iced.matrix",
#' indices_path = "exp1_10kb_abs.bed",
#' sample_name = "exp1")
#' # Data loaded from Juicer
#' exp2 <- load_contacts("exp2_10kb.cooler",
#' balancing = TRUE,
#' sample_name = "exp2")
#' synched <- sync_indices(list(exp1, exp2))
#' }
sync_indices <- function(explist) {
nexp <- length(explist)
if (nexp < 1L) {
return(NULL)
}
valid_exps <- vapply(explist, inherits, logical(1), "contacts")
if (!all(valid_exps)) {
stop("Can only synchronise indices of GENOVA contacts objects.")
}
if (nexp == 1L) {
message(paste("Single experiment provided to 'sync_indices()'.",
"No indices to synchronise. Returning input."))
return(explist)
}
res <- resolution(explist)
if (!all(tail(res, -1) == res[[1]])) {
stop("Can only synchronise indices of a single resolution.")
}
idxs <- copy(lapply(explist, `[[`, "IDX"))
# Initialise template from first exp
template <- idxs[[1]]
setnames(template, 4, "exp1")
# For-loop because we update template with subsequent exps
for (i in 2:nexp) {
jointhis <- idxs[[i]]
setnames(jointhis, 4, paste0("exp", i))
# Join on chrom/start
template <- merge.data.table(template, idxs[[i]],
on = c("V1", "V2"), all = TRUE)
# Adopt missing ends from next exp
template$V3 <- ifelse(is.na(template$V3.x), template$V3.y, template$V3.x)
template[, V3.x := NULL]
template[, V3.y := NULL]
}
template[, newbin := seq_len(NROW(template))]
final <- template[, list(V1, V2, V3, V4 = newbin)]
target <- template[["newbin"]]
expcols <- paste0("exp", seq_len(nexp))
new_explist <- lapply(seq_len(nexp), function(i) {
source <- template[, eval(as.symbol(expcols[i]))]
keep <- !is.na(source)
# Make match vectors
match <- vector("integer", max(source, na.rm = TRUE))
match[source[keep]] <- target[keep]
exp <- copy(explist[[i]])
# Set index
exp$IDX <- final
# Lift over old indices to new
exp$MAT[, c("V1", "V2") := list(match[V1], match[V2])]
setkeyv(exp$MAT, c("V1", "V2"))
# Also for centromere entries
exp$CENTROMERES[, c("start", "end") := list(match[start], match[end])]
setkeyv(exp$CENTROMERES, "chrom")
return(exp)
})
names(new_explist) <- names(explist)
return(new_explist)
}