Small bug fixes, add tibble output for Dataframe functions

ainefairbrother · May 13, 2023 · e50dfff · e50dfff
1 parent 5d1f5d9
commit e50dfff
Show file tree

Hide file tree

Showing 4 changed files with 54 additions and 18 deletions.
diff --git a/R/ensemblQueryLDpairEndpoint.R b/R/ensemblQueryLDpairEndpoint.R
@@ -58,7 +58,8 @@ ensemblQueryLDwithSNPpair = function(rsid1, rsid2, pop="1000GENOMES:phase_3:EUR"
 
   # error handling, if 400 error, set res.temp as NA
   if(r$status_code == 400){
-    print(paste0("Error 400 thrown by httr::GET. One or both of rsid1 (",rsid1,") or rsid2 (", rsid2,")", " may be invalid variant rsID(s). You can check using dbSNP: https://www.ncbi.nlm.nih.gov/snp/."))
+    print(paste0("Error 400 thrown by httr::GET. One or both of rsid1 (",rsid1,") or rsid2 (", rsid2,")",
+                 " may be invalid variant rsID(s). You can check using dbSNP: https://www.ncbi.nlm.nih.gov/snp/."))
     res.temp = NA
   } else{
 
@@ -108,16 +109,14 @@ ensemblQueryLDwithSNPpair = function(rsid1, rsid2, pop="1000GENOMES:phase_3:EUR"
 #' @param in.table data.frame containing SNP pairs. Columns must include `rsid1` for the first member of the pair and `rsid2` for the second member of the pair.
 #' @param pop String. Population for which to compute LD. Use `ensemblQueryGetPops()` to retrieve a list of all populations with LD data. Default is 1000GENOMES:phase_3:EUR.
 #' @param cores Integer. A value between 1 and 10 is accepted, as this prevents the server returning overload-related errors.
-#' @param keep.original.table.row.n Boolean. Set this to TRUE to keep all original rows even if they are NULL in the output (meaning that no data has been found for the rsID pair). Set to FALSE to filter these out and report how many were filtered. Default is FALSE.
 #'
 #' @return A dataframe.
 #' @export
 #'
 #' @examples
 #'ensemblQueryLDwithSNPpairDataframe(
 #'  in.table=data.frame(rsid1=rep("rs6792369", 10), rsid2=rep("rs1042779", 10)),
-#'  pop="1000GENOMES:phase_3:EUR",
-#'  keep.original.table.row.n=FALSE)
+#'  pop="1000GENOMES:phase_3:EUR")
 #'
 ensemblQueryLDwithSNPpairDataframe = function(in.table, pop="1000GENOMES:phase_3:EUR", cores=1){ #keep.original.table.row.n=FALSE
 
@@ -173,6 +172,7 @@ ensemblQueryLDwithSNPpairDataframe = function(in.table, pop="1000GENOMES:phase_3
 
       }) %>%
         do.call("rbind", .) %>%
+        tibble::tibble() %>%
         return()
 
       # # either filter null rows, or keep depending on arg - this can clean up rows where no data was found for the snp pair

diff --git a/R/ensemblQueryLDregionEndpoint.R b/R/ensemblQueryLDregionEndpoint.R
@@ -21,7 +21,7 @@
 #' ensemblQueryLDwithSNPregion(
 #' chr="6",
 #' start="25837556",
-#' end="25843455",
+#' end="25883455",
 #' pop="1000GENOMES:phase_3:EUR"
 #' )
 #'
@@ -40,10 +40,10 @@ ensemblQueryLDwithSNPregion = function(chr, start, end, pop="1000GENOMES:phase_3
   # require(vroom)
   # require(magrittr)
 
-  chr=6
-  start=25837556
-  end=25843455 #25843455
-  pop="1000GENOMES:phase_3:EUR"
+  # chr=6
+  # start=25837556
+  # end=25843455 #25843455
+  # pop="1000GENOMES:phase_3:EUR"
 
   #------------------------------ check inputs -------------------------------
 
@@ -91,15 +91,17 @@ ensemblQueryLDwithSNPregion = function(chr, start, end, pop="1000GENOMES:phase_3
         `rownames<-`(NULL) %>%
         as.data.frame() %>%
         dplyr::rename(rsid1=variation1, rsid2=variation2) %>%
-        dplyr::relocate(rsid1, rsid2, r2, d_prime, population_name) %>%
+        dplyr::mutate(query_chr=chr, query_start=start, query_end=end) %>%
+        dplyr::relocate(query_chr, query_start, query_end, rsid1, rsid2, r2, d_prime, population_name) %>%
         return()
     } else{
       # if not 0-row (empty) df, then deal with it normally, format and prepare for return
       res.temp %>%
         data.frame() %>%
         dplyr::arrange(r2) %>%
         dplyr::rename(rsid1=variation1, rsid2=variation2) %>%
-        dplyr::relocate(rsid1, rsid2, r2, d_prime, population_name) %>%
+        dplyr::mutate(query_chr=chr, query_start=start, query_end=end) %>%
+        dplyr::relocate(query_chr, query_start, query_end, rsid1, rsid2, r2, d_prime, population_name) %>%
         return()
     }
     # deal with NA search result (result of 400 error) by testing if res.temp is NA
@@ -114,7 +116,8 @@ ensemblQueryLDwithSNPregion = function(chr, start, end, pop="1000GENOMES:phase_3
         `rownames<-`(NULL) %>%
         as.data.frame() %>%
         dplyr::rename(rsid1=variation1, rsid2=variation2) %>%
-        dplyr::relocate(rsid1, rsid2, r2, d_prime, population_name) %>%
+        dplyr::mutate(query_chr=chr, query_start=start, query_end=end) %>%
+        dplyr::relocate(query_chr, query_start, query_end, rsid1, rsid2, r2, d_prime, population_name) %>%
         return()
     }
   }
@@ -135,7 +138,7 @@ ensemblQueryLDwithSNPregion = function(chr, start, end, pop="1000GENOMES:phase_3
 #' data.frame(
 #'   chr=rep(c("6"), 10),
 #'   start=rep(c("25837556"), 10),
-#'   end=rep(c("25843455"), 10)
+#'   end=rep(c("25943455"), 10)
 #' ) %>%
 #'   ensemblQueryLDwithSNPregionDataframe(
 #'     in.table=.,
@@ -179,15 +182,16 @@ ensemblQueryLDwithSNPregionDataframe = function(in.table, pop="1000GENOMES:phase
                                     start=in.table$start[x],
                                     end=in.table$end[x],
                                     pop=pop) %>%
-          tidyr::unnest(cols = c(rsid1, rsid2, r2, d_prime, population_name)) %>%
-          dplyr::mutate(query_chr = in.table$chr[x],
-                        query_start = in.table$start[x],
-                        query_end = in.table$chr[x]) %>%
-          dplyr::relocate(query_chr, query_start, query_end) %>%
+          tidyr::unnest(cols = c(query_chr, query_start, query_end, rsid1, rsid2, r2, d_prime, population_name)) %>%
+          # dplyr::mutate(query_chr = in.table$chr[x],
+          #               query_start = in.table$start[x],
+          #               query_end = in.table$chr[x]) %>%
+          dplyr::relocate(query_chr, query_start, query_end, rsid1, rsid2, r2, d_prime, population_name) %>%
           as.data.frame()
 
       }) %>%
         do.call("rbind", .) %>%
+        tibble::tibble() %>%
         return(.)
 
     } else{

diff --git a/R/ensemblQueryLDwindowEndpoint.R b/R/ensemblQueryLDwindowEndpoint.R
@@ -191,6 +191,7 @@ ensemblQueryLDwithSNPwindowDataframe = function(in.table, r2=0.8, d.prime=0.8, w
 
       }) %>%
         do.call("rbind", .) %>%
+        tibble::tibble() %>%
         return(.)
 
     } else{

diff --git a/R/helperFunctions.R b/R/helperFunctions.R
@@ -53,3 +53,34 @@ pingEnsembl = function(){
 
   return(response)
 }
+
+# estimateQueriesPerHour = function(cores, n.queries){
+#
+#   # Based on a single-core run of 54000 queries (the maximum Ensembl API query rate per hour) to ensemblQueryLDwithSNPpairDataframe,
+#   # the time taken to run 54000 queries was estimated to be 1.937771 hours (116.266269091765 minutes).
+#   # Based on this, the function takes the number of cores and the number of queries supplied by the user and outputs the predicted number of queries per hour that the run is likely to generate.
+#   # This assumes a linear effect of additional cores.
+#   cores=20
+#   n.queries=1000
+#   api.limit.hourly = 54000
+#   time_to_run_54000_in_hours = (116.266269091765/60)
+#   per_hour_query_rate = 54000/time_to_run_54000_in_hours
+#   per_minute_query_rate = per_hour_query_rate/60
+#   per_second_query_rate = per_minute_query_rate/60
+#   time_for_one_query_seconds = (time_to_run_54000_in_hours/54000)*60*60
+#
+#
+#
+#   if(time_for_user_queries < 15){
+#     print(paste(
+#       "Warning: your query of size",n.queries,"using",cores,"cores may exceed the Ensembl REST API hourly query limit. Consider using fewer cores or splitting your query into smaller chunks."
+#     ))
+#   }
+#
+#
+#   predicted_requests_per_hour = n.queries/(time_to_run_54000_in_hours/cores)
+#
+#
+#
+# }
+
-Original file line number
+Diff line change
@@ Expand Up @@
           }) %>%
             do.call("rbind", .) %>%
+            tibble::tibble() %>%
             return(.)
         } else{
@@ Expand Down @@