forked from Al-Murphy/MungeSumstats
-
Notifications
You must be signed in to change notification settings - Fork 0
/
read_sumstats.R
105 lines (103 loc) · 4.18 KB
/
read_sumstats.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#' Determine summary statistics file type and read them into memory
#'
#' Accepts either summary statistics that are already in memory
#' (any \code{data.frame}) or a path to a file, works out how the input
#' should be read, and returns it as a \code{data.table}.
#'
#' @details
#' Input is handled in the following order:
#' \enumerate{
#'   \item If \code{path} is a \code{data.frame}, it is converted with
#'   \code{data.table::as.data.table} and, when \code{nrows} is finite,
#'   truncated to at most \code{nrows} rows.
#'   \item If \code{path} ends with one of the VCF suffixes returned by
#'   \code{supported_suffixes}, it is read with \code{read_vcf}.
#'   \code{nrows} is not passed on to \code{read_vcf}.
#'   \item Otherwise, if \code{path} ends with one of the tabular suffixes,
#'   or \code{check_tabular} accepts the header returned by
#'   \code{read_header}, it is read with \code{data.table::fread}
#'   (files ending in \code{.bgz} are first read with \code{readLines}).
#'   \code{remove_empty_cols} is then called on the result.
#'   \item Anything else raises an "Unrecognized file format" error.
#' }
#'
#' @return \code{data.table} of formatted summary statistics
#'
#' @param nrows integer. The (maximal) number of lines to read.
#' If \code{Inf}, will read in all rows. When \code{path} is an in-memory
#' object with fewer than \code{nrows} rows, all of its rows are returned.
#' @param standardise_headers Standardise headers first. If \code{TRUE},
#' column names are standardised using \code{mapping_file}, a \code{CHR}
#' column (when present) is converted to character, and a \code{SNP} column
#' (when present) is set as the key of the returned \code{data.table}.
#' @inheritParams format_sumstats
#' @inheritParams standardise_header
#' @inheritParams vcf2df
#' @inheritParams read_vcf
#' @inheritParams check_empty_cols
#'
#' @export
#' @importFrom data.table fread as.data.table setkeyv
#' @examples
#' path <- system.file("extdata", "eduAttainOkbay.txt",
#'     package = "MungeSumstats"
#' )
#' eduAttainOkbay <- read_sumstats(path = path)
read_sumstats <- function(path,
                          nrows = Inf,
                          standardise_headers = FALSE,
                          samples = 1,
                          sampled_rows = 1e4L,
                          nThread = 1,
                          mapping_file = sumstatsColHeaders) {
    if (is.data.frame(path)) {
        message("Summary statistics passed as R object.")
        sumstats_file <- data.table::as.data.table(path)
        if (!is.infinite(nrows)) {
            #### Keep at most nrows rows ####
            # Cap at the number of available rows: indexing a data.table
            # past its end yields all-NA rows, and seq(1, 0) would count
            # down to c(1, 0) instead of selecting nothing.
            n_keep <- min(nrows, nrow(sumstats_file))
            sumstats_file <- sumstats_file[seq_len(n_keep), ]
        }
    } else {
        # Build a single end-anchored regex matching any of the given
        # literal suffixes. Dots are escaped as "[.]" using a fixed
        # (non-regex) substitution; a "\\." replacement in a regex gsub()
        # collapses back to a bare "." and escapes nothing.
        suffix_regex <- function(suffixes) {
            escaped <- gsub(".", "[.]", suffixes, fixed = TRUE)
            paste0(escaped, "$", collapse = "|")
        }
        #### Check if VCF: infer from file name ####
        vcf_suffixes <- supported_suffixes(tabular = FALSE,
                                           tabular_compressed = FALSE)
        is_vcf <- grepl(suffix_regex(vcf_suffixes), path)
        if (isTRUE(is_vcf)) {
            sumstats_file <- read_vcf(path = path,
                                      use_params = TRUE,
                                      samples = samples,
                                      sampled_rows = sampled_rows,
                                      as_datatable = TRUE,
                                      nThread = nThread)
        } else {
            #### Check if tabular 1: infer from file name ####
            tab_suffixes <- supported_suffixes(vcf = FALSE,
                                               vcf_compressed = FALSE)
            is_tabular <- grepl(suffix_regex(tab_suffixes), path)
            #### Check if tabular 2: infer from data ####
            if (isFALSE(is_tabular)) {
                header <- read_header(path = path)
                is_tabular <- check_tabular(header = header)
            }
            #### Process tabular ####
            if (isTRUE(is_tabular)) {
                if (endsWith(path, ".bgz")) {
                    message("Importing tabular bgz file: ", path)
                    # The whole file is read as text first and then parsed;
                    # nrows is only applied at the fread() step.
                    # NOTE(review): presumably fread() cannot open .bgz
                    # directly -- confirm.
                    sumstats_file <- data.table::fread(
                        text = readLines(con = path),
                        nThread = nThread,
                        nrows = nrows)
                } else {
                    message("Importing tabular file: ", path)
                    sumstats_file <- data.table::fread(
                        path,
                        nThread = nThread,
                        nrows = nrows)
                }
            } else {
                suffixes <- supported_suffixes()
                stop(
                    "Unrecognized file format.\n",
                    "Must be one of: \n ",
                    paste(suffixes, collapse = "\n ")
                )
            }
            #### Drop empty cols ####
            # NOTE(review): the return value is not captured, so this
            # relies on remove_empty_cols() modifying sumstats_file by
            # reference -- confirm against its definition.
            remove_empty_cols(sumstats_dt = sumstats_file,
                              sampled_rows = sampled_rows)
        }
    }
    #### Standardise colnames ####
    if (isTRUE(standardise_headers)) {
        # Bind CHR locally so the non-standard evaluation in `:=` below
        # does not trigger an undefined-global note in R CMD check.
        CHR <- NULL
        sumstats_file <-
            standardise_sumstats_column_headers_crossplatform(
                sumstats_dt = sumstats_file,
                mapping_file = mapping_file
            )[["sumstats_dt"]]
        #### Ensure CHR is character ####
        if ("CHR" %in% names(sumstats_file)) {
            sumstats_file[, CHR := as.character(CHR)]
        }
        #### Ensure SNP is the key ####
        if ("SNP" %in% names(sumstats_file)) {
            data.table::setkeyv(sumstats_file, cols = "SNP")
        }
    }
    return(sumstats_file)
}