reorg technical stuff into directories

Spencerx · Nov 29, 2019 · 5bc2251 · 5bc2251
1 parent 0ac3820
commit 5bc2251
Show file tree

Hide file tree

Showing 17 changed files with 20 additions and 20 deletions.
diff --git a/README.md b/README.md
@@ -21,7 +21,7 @@ Contribution and feedback are very welcome!
   - [x] [(py)datatable](https://github.com/h2oai/datatable)
   - [x] [spark](https://github.com/apache/spark)
   - [x] [cuDF](https://github.com/rapidsai/cudf)
-  - [x] [ClickHouse](https://github.com/yandex/ClickHouse)
+  - [x] [ClickHouse](https://github.com/yandex/ClickHouse) (`join` not yet added)
 
 More solutions has been proposed. Some of them are not yet mature enough to address benchmark questions well enough (e.g. [modin](https://github.com/h2oai/db-benchmark/issues/38)). Others haven't been yet evaluated or implemented. Status of all can be tracked in dedicated [issues labelled as _new solution_](https://github.com/h2oai/db-benchmark/issues?q=is%3Aissue+is%3Aopen+label%3A%22new+solution%22) in project repository.
 
@@ -33,7 +33,7 @@ More solutions has been proposed. Some of them are not yet mature enough to addr
 - if solution uses python create new `virtualenv` as `$solution/py-$solution`, example for `pandas` use `virtualenv pandas/py-pandas --python=/usr/bin/python3.6`
 - install every solution (if needed activate each `virtualenv`)
 - edit `run.conf` to define solutions and tasks to benchmark
-- generate data, for `groupby` use `Rscript groupby-datagen.R 1e7 1e2 0 0` to create `G1_1e7_1e2_0_0.csv`, re-save to binary data where needed, create `data` directory and keep all data files there
+- generate data, for `groupby` use `Rscript _data/groupby-datagen.R 1e7 1e2 0 0` to create `G1_1e7_1e2_0_0.csv`, re-save to binary data where needed, create `data` directory and keep all data files there
 - edit `data.csv` to define data sizes to benchmark using `active` flag
 - start benchmark with `./run.sh`
 
@@ -55,7 +55,7 @@ More solutions has been proposed. Some of them are not yet mature enough to addr
 # Example environment
 
 - setting up r3-8xlarge: 244GB RAM, 32 cores: [Amazon EC2 for beginners](https://github.com/Rdatatable/data.table/wiki/Amazon-EC2-for-beginners)  
-- (slightly outdated) full reproduce script on clean Ubuntu 16.04: [repro.sh](https://github.com/h2oai/db-benchmark/blob/master/repro.sh)
+- (slightly outdated) full reproduce script on clean Ubuntu 16.04: [_utils/repro.sh](https://github.com/h2oai/db-benchmark/blob/master/_utils/repro.sh)
 
 # Acknowledgment
 

diff --git a/benchplot-dict.R → _benchplot/benchplot-dict.R b/benchplot-dict.R → _benchplot/benchplot-dict.R
diff --git a/benchplot.R → _benchplot/benchplot.R b/benchplot.R → _benchplot/benchplot.R
diff --git a/groupby-datagen.R → _data/groupby-datagen.R b/groupby-datagen.R → _data/groupby-datagen.R
diff --git a/join-datagen.R → _data/join-datagen.R b/join-datagen.R → _data/join-datagen.R
diff --git a/ga.html → _report/ga.html b/ga.html → _report/ga.html
diff --git a/publish.sh → _report/publish.sh b/publish.sh → _report/publish.sh
diff --git a/report.R → _report/report.R b/report.R → _report/report.R
diff --git a/answers-validation.R → _utils/answers-validation.R b/answers-validation.R → _utils/answers-validation.R
diff --git a/repro.sh → _utils/repro.sh b/repro.sh → _utils/repro.sh
diff --git a/ch.sh → clickhouse/ch.sh b/ch.sh → clickhouse/ch.sh
diff --git a/clickhouse-exec.sh → clickhouse/exec.sh b/clickhouse-exec.sh → clickhouse/exec.sh
@@ -2,11 +2,11 @@
 set -e
 
 if [ "$#" -ne 2 ]; then
-    echo "usage: ./clickhouse-exec.sh groupby G1_1e7_1e2_0_0";
+    echo "usage: ./clickhouse/exec.sh groupby G1_1e7_1e2_0_0";
     exit 1
 fi;
 
-source ch.sh
+source ./clickhouse/ch.sh
 
 # start server
 ch_start

diff --git a/history.Rmd b/history.Rmd
@@ -5,7 +5,7 @@ output:
     self_contained: no
     toc: false
     includes:
-      in_header: ga.html
+      in_header: _report/ga.html
 ---
 ```{r render, include=FALSE}
 # Rscript -e 'rmarkdown::render("history.Rmd", output_dir="public")' # has to be output_dir='public' as there is hardcode in benchplot for that path
@@ -17,7 +17,7 @@ knitr::opts_chunk$set(echo=FALSE, cache=FALSE)
 
 ```{r init}
 library(lattice)
-source("report.R")
+source("./_report/report.R")
 ld = time_logs()[task=="groupby" & substr(data,1,2)=="G1"] # substr for clickhouse on-disk reported as G2
 ld[, "question" := droplevels(question)]
 recent_nodename = ld[script_recent==TRUE, unique(nodename)]

diff --git a/index.Rmd b/index.Rmd
@@ -4,7 +4,7 @@ output:
   html_document:
     self_contained: no
     includes:
-      in_header: ga.html
+      in_header: _report/ga.html
 ---
 ```{r render, include=FALSE}
 # Rscript -e 'rmarkdown::render("index.Rmd", output_dir="public")' && xdg-open public/index.html
@@ -31,10 +31,10 @@ make_sorters <- function(data) { ## till not on cran https://github.com/smartins
   sprintf("function(attr) {\nvar sortAs = $.pivotUtilities.sortAs;\n%s\n}", paste(sorter, collapse="\n"))
 }
 
-source("report.R", chdir=TRUE)
-source("helpers.R", chdir=TRUE)
-source("benchplot.R", chdir=TRUE)
-source("benchplot-dict.R", chdir=TRUE)
+source("./_report/report.R", chdir=TRUE)
+source("./helpers.R", chdir=TRUE)
+source("./_benchplot/benchplot.R", chdir=TRUE)
+source("./_benchplot/benchplot-dict.R", chdir=TRUE)
 ld = time_logs()
 lld = ld[script_recent==TRUE]
 lld_nodename = as.character(unique(lld$nodename))
@@ -94,7 +94,7 @@ loop_benchplot(dt_groupby, report_name="groupby", syntax.dict=groupby.syntax.dic
 in_rows = c("1e7","1e8","1e9")
 k_na_sort = c("NA_0_0")
 data_name = paste("J1", paste(rep(in_rows, each=length(k_na_sort)), k_na_sort, sep="_"), sep="_")
-loop_benchplot(dt_join, report_name="join", syntax.dict=join.syntax.dict, exceptions=join.exceptions, solution.dict=solution.dict, data_namev=data_name, q_groupv=c("basic"), title.txt.fun = header_title_fun, question.txt.fun = join_q_title_fun, cutoff = "spark", pending = "Modin")
+loop_benchplot(dt_join, report_name="join", syntax.dict=join.syntax.dict, exceptions=join.exceptions, solution.dict=solution.dict, data_namev=data_name, q_groupv=c("basic"), title.txt.fun = header_title_fun, question.txt.fun = join_q_title_fun, cutoff = "spark", pending = c("ClickHouse","Modin"))
 ```
 
 ## Task {.tabset .tabset-fade .tabset-pills}
@@ -218,9 +218,9 @@ rpivotTable::rpivotTable(
 ## Notes
 
 - You are welcome to run this benchmark yourself! all scripts related to setting up environment, data and benchmark are in [repository](https://github.com/h2oai/db-benchmark).  
-- Data used to generate plots on this website can be obtained from [time.csv](./time.csv) (together with [logs.csv](./logs.csv)). See [report.R](https://github.com/h2oai/db-benchmark/blob/master/report.R) for quick introduction how to work with those.  
+- Data used to generate plots on this website can be obtained from [time.csv](./time.csv) (together with [logs.csv](./logs.csv)). See [_report/report.R](https://github.com/h2oai/db-benchmark/blob/master/_report/report.R) for a quick introduction on how to work with those.
 - We ensure that calculations are not deferred by solution.  
-- We also tested that answers produced from different solutions match each others, for details see [answers-validation.R](https://github.com/h2oai/db-benchmark/blob/master/answers-validation.R).  
+- We also tested that answers produced from different solutions match each other; for details see [_utils/answers-validation.R](https://github.com/h2oai/db-benchmark/blob/master/_utils/answers-validation.R).
 - ClickHouse queries were made against `mergetree` table engine, see [#91](https://github.com/h2oai/db-benchmark/issues/91) for details.  
 
 ## Environment configuration

diff --git a/launcher.R b/launcher.R
@@ -134,7 +134,7 @@ for (s in solutions) { #s = solutions[1]
       ext = file.ext(s)
       if (!length(ext)) stop(sprintf("solution %s does not have file extension defined in file.ext helper function", ns))
       cmd = if (ext=="sql") { # only clickhouse for now
-        sprintf("./%s-exec.sh %s %s > %s 2> %s", ns, t, d, out_file, err_file)
+        sprintf("./%s/exec.sh %s %s > %s 2> %s", ns, t, d, out_file, err_file)
       } else sprintf("./%s/%s-%s.%s > %s 2> %s", ns, t, ns, ext, out_file, err_file)
       venv = if (ext=="py") {
         # https://stackoverflow.com/questions/52779016/conda-command-working-in-command-prompt-but-not-in-bash-script

diff --git a/run.sh b/run.sh
@@ -11,7 +11,7 @@ export BATCH=$(date +%s)
 if [[ -f ./stop ]]; then echo "# Benchmark run $BATCH aborted. 'stop' file exists, should be removed before calling 'run.sh'" && exit; fi;
 
 # confirm clickhouse is not running
-source ./ch.sh
+source ./clickhouse/ch.sh
 ch_installed && ch_active && echo "# Benchmark run $BATCH aborted. clickhouse-server is running, shut it down before calling 'run.sh'" && exit;
 
 # confirm swap disabled
@@ -75,7 +75,7 @@ $DO_REPORT && $DO_PUBLISH \
   && [ $(wc -l report-done | awk '{print $1}') -eq 3 ] \
   && [ -f ./token ] \
   && echo "# Publishing report" \
-  && ((./publish.sh && echo "# Benchmark results has been published") || echo "# Benchmark publish script failed")
+  && ((./_report/publish.sh && echo "# Benchmark results has been published") || echo "# Benchmark publish script failed")
 
 # remove run lock file
 rm -f ./run.lock

diff --git a/tech.Rmd b/tech.Rmd
@@ -5,7 +5,7 @@ output:
     self_contained: no
     toc: true
     includes:
-      in_header: ga.html
+      in_header: _report/ga.html
 ---
 ```{r render, include=FALSE}
 # Rscript -e 'rmarkdown::render("tech.Rmd", output_dir="public")' # has to be output_dir='public' as there is hardcode in benchplot for that path
@@ -17,7 +17,7 @@ knitr::opts_chunk$set(echo=FALSE, cache=FALSE)
 
 ```{r init}
 library(lattice)
-source("report.R")
+source("./_report/report.R")
 ld = time_logs()
 recent_nodename = ld[script_recent==TRUE, unique(nodename)]
 stopifnot(length(recent_nodename)==1L)