Skip to content

Commit

Permalink
Small bug fixes, add tibble output for Dataframe functions
Browse files Browse the repository at this point in the history
  • Loading branch information
ainefairbrother committed May 13, 2023
1 parent 5d1f5d9 commit e50dfff
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 18 deletions.
8 changes: 4 additions & 4 deletions R/ensemblQueryLDpairEndpoint.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ ensemblQueryLDwithSNPpair = function(rsid1, rsid2, pop="1000GENOMES:phase_3:EUR"

# error handling, if 400 error, set res.temp as NA
if(r$status_code == 400){
print(paste0("Error 400 thrown by httr::GET. One or both of rsid1 (",rsid1,") or rsid2 (", rsid2,")", " may be invalid variant rsID(s). You can check using dbSNP: https://www.ncbi.nlm.nih.gov/snp/."))
print(paste0("Error 400 thrown by httr::GET. One or both of rsid1 (",rsid1,") or rsid2 (", rsid2,")",
" may be invalid variant rsID(s). You can check using dbSNP: https://www.ncbi.nlm.nih.gov/snp/."))
res.temp = NA
} else{

Expand Down Expand Up @@ -108,16 +109,14 @@ ensemblQueryLDwithSNPpair = function(rsid1, rsid2, pop="1000GENOMES:phase_3:EUR"
#' @param in.table data.frame containing SNP pairs. Columns must include `rsid1` for the first member of the pair and `rsid2` for the second member of the pair.
#' @param pop String. Population for which to compute LD. Use `ensemblQueryGetPops()` to retrieve a list of all populations with LD data. Default is 1000GENOMES:phase_3:EUR.
#' @param cores Integer. A value between 1 and 10 is accepted, as this prevents the server returning overload-related errors.
#' @param keep.original.table.row.n Boolean. Set this to TRUE to keep all original rows even if they are NULL in the output (meaning that no data has been found for the rsID pair). Set to FALSE to filter these out and report how many were filtered. Default is FALSE.
#'
#' @return A tibble (data frame).
#' @export
#'
#' @examples
#'ensemblQueryLDwithSNPpairDataframe(
#' in.table=data.frame(rsid1=rep("rs6792369", 10), rsid2=rep("rs1042779", 10)),
#' pop="1000GENOMES:phase_3:EUR",
#' keep.original.table.row.n=FALSE)
#' pop="1000GENOMES:phase_3:EUR")
#'
ensemblQueryLDwithSNPpairDataframe = function(in.table, pop="1000GENOMES:phase_3:EUR", cores=1){ #keep.original.table.row.n=FALSE

Expand Down Expand Up @@ -173,6 +172,7 @@ ensemblQueryLDwithSNPpairDataframe = function(in.table, pop="1000GENOMES:phase_3

}) %>%
do.call("rbind", .) %>%
tibble::tibble() %>%
return()

# # either filter null rows, or keep depending on arg - this can clean up rows where no data was found for the snp pair
Expand Down
32 changes: 18 additions & 14 deletions R/ensemblQueryLDregionEndpoint.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#' ensemblQueryLDwithSNPregion(
#' chr="6",
#' start="25837556",
#' end="25843455",
#' end="25883455",
#' pop="1000GENOMES:phase_3:EUR"
#' )
#'
Expand All @@ -40,10 +40,10 @@ ensemblQueryLDwithSNPregion = function(chr, start, end, pop="1000GENOMES:phase_3
# require(vroom)
# require(magrittr)

chr=6
start=25837556
end=25843455 #25843455
pop="1000GENOMES:phase_3:EUR"
# chr=6
# start=25837556
# end=25843455 #25843455
# pop="1000GENOMES:phase_3:EUR"

#------------------------------ check inputs -------------------------------

Expand Down Expand Up @@ -91,15 +91,17 @@ ensemblQueryLDwithSNPregion = function(chr, start, end, pop="1000GENOMES:phase_3
`rownames<-`(NULL) %>%
as.data.frame() %>%
dplyr::rename(rsid1=variation1, rsid2=variation2) %>%
dplyr::relocate(rsid1, rsid2, r2, d_prime, population_name) %>%
dplyr::mutate(query_chr=chr, query_start=start, query_end=end) %>%
dplyr::relocate(query_chr, query_start, query_end, rsid1, rsid2, r2, d_prime, population_name) %>%
return()
} else{
# if not 0-row (empty) df, then deal with it normally, format and prepare for return
res.temp %>%
data.frame() %>%
dplyr::arrange(r2) %>%
dplyr::rename(rsid1=variation1, rsid2=variation2) %>%
dplyr::relocate(rsid1, rsid2, r2, d_prime, population_name) %>%
dplyr::mutate(query_chr=chr, query_start=start, query_end=end) %>%
dplyr::relocate(query_chr, query_start, query_end, rsid1, rsid2, r2, d_prime, population_name) %>%
return()
}
# deal with NA search result (result of 400 error) by testing if res.temp is NA
Expand All @@ -114,7 +116,8 @@ ensemblQueryLDwithSNPregion = function(chr, start, end, pop="1000GENOMES:phase_3
`rownames<-`(NULL) %>%
as.data.frame() %>%
dplyr::rename(rsid1=variation1, rsid2=variation2) %>%
dplyr::relocate(rsid1, rsid2, r2, d_prime, population_name) %>%
dplyr::mutate(query_chr=chr, query_start=start, query_end=end) %>%
dplyr::relocate(query_chr, query_start, query_end, rsid1, rsid2, r2, d_prime, population_name) %>%
return()
}
}
Expand All @@ -135,7 +138,7 @@ ensemblQueryLDwithSNPregion = function(chr, start, end, pop="1000GENOMES:phase_3
#' data.frame(
#' chr=rep(c("6"), 10),
#' start=rep(c("25837556"), 10),
#' end=rep(c("25843455"), 10)
#' end=rep(c("25943455"), 10)
#' ) %>%
#' ensemblQueryLDwithSNPregionDataframe(
#' in.table=.,
Expand Down Expand Up @@ -179,15 +182,16 @@ ensemblQueryLDwithSNPregionDataframe = function(in.table, pop="1000GENOMES:phase
start=in.table$start[x],
end=in.table$end[x],
pop=pop) %>%
tidyr::unnest(cols = c(rsid1, rsid2, r2, d_prime, population_name)) %>%
dplyr::mutate(query_chr = in.table$chr[x],
query_start = in.table$start[x],
query_end = in.table$chr[x]) %>%
dplyr::relocate(query_chr, query_start, query_end) %>%
tidyr::unnest(cols = c(query_chr, query_start, query_end, rsid1, rsid2, r2, d_prime, population_name)) %>%
# dplyr::mutate(query_chr = in.table$chr[x],
# query_start = in.table$start[x],
# query_end = in.table$chr[x]) %>%
dplyr::relocate(query_chr, query_start, query_end, rsid1, rsid2, r2, d_prime, population_name) %>%
as.data.frame()

}) %>%
do.call("rbind", .) %>%
tibble::tibble() %>%
return(.)

} else{
Expand Down
1 change: 1 addition & 0 deletions R/ensemblQueryLDwindowEndpoint.R
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ ensemblQueryLDwithSNPwindowDataframe = function(in.table, r2=0.8, d.prime=0.8, w

}) %>%
do.call("rbind", .) %>%
tibble::tibble() %>%
return(.)

} else{
Expand Down
31 changes: 31 additions & 0 deletions R/helperFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,34 @@ pingEnsembl = function(){

return(response)
}

# estimateQueriesPerHour = function(cores, n.queries){
#
# # Based on a single-core run of 54000 queries (the maximum Ensembl API query rate per hour) to ensemblQueryLDwithSNPpairDataframe,
# # the time taken to run 54000 queries was estimated to be 1.937771 hours (116.266269091765 minutes).
# # Based on this, the function takes the number of cores and the number of queries input by the user and outputs the predicted number of queries per hour that the run is likely to generate.
# # This assumes a linear effect of additional cores.
# cores=20
# n.queries=1000
# api.limit.hourly = 54000
# time_to_run_54000_in_hours = (116.266269091765/60)
# per_hour_query_rate = 54000/time_to_run_54000_in_hours
# per_minute_query_rate = per_hour_query_rate/60
# per_second_query_rate = per_minute_query_rate/60
# time_for_one_query_seconds = (time_to_run_54000_in_hours/54000)*60*60
#
#
#
# if(time_for_user_queries < 15){
# print(paste(
# "Warning: your query of size",n.queries,"using",cores,"cores may exceed the Ensembl REST API hourly query limit. Consider using fewer cores or splitting your query into smaller chunks."
# ))
# }
#
#
# predicted_requests_per_hour = n.queries/(time_to_run_54000_in_hours/cores)
#
#
#
# }

0 comments on commit e50dfff

Please sign in to comment.