forked from Al-Murphy/MungeSumstats
-
Notifications
You must be signed in to change notification settings - Fork 0
/
check_two_step_col.R
66 lines (65 loc) · 2.63 KB
/
check_two_step_col.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#' Ensure that CHR:BP aren't merged into 1 column
#'
#' Inspects the first row of \code{sumstats_dt} for values containing ":".
#' If exactly one column matches, it is split on ":" into two new columns
#' (only the first two pieces of each value are kept, so "16:23609681:ID"
#' yields 16 and 23609681). If several columns match, the one whose name
#' sorts last (decreasing order) is kept and the others are removed, with a
#' message listing them. If no column matches, the table is returned as is.
#'
#' The two new columns are named by splitting the combined column's name on
#' ":" and, failing that, on "_"; if neither gives exactly two parts they are
#' named "CHR" and "BP". Columns are added and removed with data.table's
#' \code{:=}.
#'
#' @param sumstats_dt data table obj of the summary statistics
#' file for the GWAS
#' @param path Filepath for the summary statistics file to be formatted
#' (currently not used by this function)
#' @returns list containing sumstats_dt, the modified summary
#' statistics data table object
#' @keywords internal
#' @importFrom data.table tstrsplit
#' @importFrom data.table :=
check_two_step_col <- function(sumstats_dt, path) {
    # get col headers
    col_headers <- names(sumstats_dt)
    # Obtain a row of the actual data; only this first row is used to
    # detect ":"-separated values
    row_of_data <- as.character(sumstats_dt[1, ])
    two_step_idx <- grep(":", row_of_data, fixed = TRUE)
    # nothing looks like CHR:BP, return the input untouched
    if (length(two_step_idx) == 0) {
        return(list("sumstats_dt" = sumstats_dt))
    }
    keep_col <- col_headers[two_step_idx]
    # in case there are more than one column with ":", just take one
    if (length(keep_col) > 1) {
        # sort to get most recent genome build by
        # default (cols: SNP_hg19, SNP_hg18)
        candidates <- sort(keep_col, decreasing = TRUE)
        keep_col <- candidates[1]
        drop_cols <- candidates[-1]
        # collapse drop_cols so that paste0() yields a single string;
        # otherwise the whole message is repeated once per dropped column
        msg <- paste0(
            "Warning: Multiple columns in the sumstats file seem to ",
            "relate to Chromosome:Base Pair position.\nThe column ",
            keep_col, " will be kept whereas the column(s) ",
            paste(drop_cols, collapse = ", "),
            " will be removed.\nIf this is not the correct ",
            "column to keep, please remove all incorrect columns from ",
            "those listed here before \nrunning `format_sumstats()`."
        )
        message(msg)
        # Get data without dropped
        sumstats_dt[, (drop_cols) := NULL]
    }
    # split out col name into separate values to name the new columns
    format <- strsplit(keep_col, ":", fixed = TRUE)[[1]]
    if (length(format) != 2) { # check : and underscore in name
        # NOTE(review): a name such as "SNP_hg19" also splits in two here,
        # giving new columns "SNP" and "hg19" - confirm this is intended
        format <- strsplit(keep_col, "_", fixed = TRUE)[[1]]
    }
    if (length(format) != 2) { # If neither found assign name
        format <- c("CHR", "BP")
    }
    # keep ensures that even if certain rows have 3 values
    # e.g.16:23609681:ID
    # only first two taken
    sumstats_dt[, (format) := data.table::tstrsplit(get(keep_col),
        split = ":", fixed = TRUE,
        keep = c(1, 2),
        type.convert = TRUE
    )]
    # remove combined column, unless its name was reused for one of the
    # new columns (e.g. a column called "CHR" holding "1:100"), in which
    # case deleting it would throw away the freshly split data
    if (!(keep_col %in% format)) {
        sumstats_dt[, (keep_col) := NULL]
    }
    msg <- paste0(
        "Column ", keep_col, " has been separated into the columns ",
        paste(format, collapse = ", ")
    )
    message(msg)
    list("sumstats_dt" = sumstats_dt)
}