Skip to content

Commit

Permalink
Add steps for setup data for performance testing (zero-one-group#247)
Browse files Browse the repository at this point in the history
Co-authored-by: Anthony Khong <[email protected]>
  • Loading branch information
agilecreativity and anthony-khong authored Oct 10, 2020
1 parent 770a24f commit 23f4292
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 1 deletion.
11 changes: 10 additions & 1 deletion docs/cookbook/cookbook-10.clj
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
(ns geni.cookbook-10
(:require
[clojure.java.io]
[clojure.java.io :as io]
[clojure.java.shell]
[zero-one.geni.core :as g]
[zero-one.geni.ml :as ml]))
Expand All @@ -11,6 +11,15 @@

;; Relative path of the benchmark data; its existence is checked before
;; generating data, and it is the location passed to g/read-parquet!.
(def dummy-data-path "data/performance-benchmark-data")

;; Pre-populate the test data before reading it, unless this has already been done.
;; NOTE: this step will take some time to complete, so please be patient!
;; Generate the benchmark dataset only when nothing exists yet at
;; dummy-data-path; otherwise just report where the data lives.
(let [data-dir (io/file dummy-data-path)]
  (if (.exists data-dir)
    (println (format "Test data exists at %s" dummy-data-path))
    (do
      (println "Creating sample data for performance testing.")
      ;; Evaluating this script is what actually creates the data.
      (load-file "docs/cookbook/cookbook-performance-data.clj")
      (println "Test data for performance testing is ready."))))

;; Load the benchmark transactions from the Parquet data at dummy-data-path.
(def transactions (g/read-parquet! dummy-data-path))

;; Count the loaded dataset.
(g/count transactions)
Expand Down
46 changes: 46 additions & 0 deletions docs/cookbook/cookbook-performance-data.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
(require '[zero-one.geni.core :as g])

;; Output location handed to g/write-parquet! when the generated data is written.
(def data-path "data/performance-benchmark-data")

;; Cached dataset with a single :dummy column, built from (int 2e6) copies of
;; the row [1]. It serves as the row skeleton that generated columns are
;; selected on top of.
(def skeleton-df
  (-> (repeat (int 2e6) [1])
      (g/table->dataset [:dummy])
      g/cache))

(defn transaction-id-col
  "Returns a column expression that joins three stringified `g/random-int`
  columns with \"-\" separators."
  []
  (let [dash (g/lit "-")
        part #(g/str (g/random-int))]
    (g/concat (part) dash (part) dash (part))))

;; Column expression concatenating :year, :month and :day with "-" separators.
(def date-col
  (let [dash (g/lit "-")]
    (g/concat :year dash :month dash :day)))

;; Number of days in each month (1-12) of a non-leap year; the generated data
;; uses year 2019, so February has 28 days.
;; December has 31 days (it was previously listed as 30, which meant
;; December 31st could never be generated).
(def max-days
  {1  31
   2  28
   3  31
   4  30
   5  31
   6  30
   7  31
   8  31
   9  30
   10 31
   11 30
   12 31})

;; Generate synthetic transactions for each month of 2019 and append them to
;; the Parquet data at `data-path`.
;; `doseq` is used instead of `(doall (for ...))` because the loop exists
;; purely for its side effects (the writes); no result sequence is needed.
(doseq [month (range 1 13)]
  (-> skeleton-df
      (g/select
        {:trx-id    (transaction-id-col)
         :member-id (g/int (g/rexp 5e-6))
         :quantity  (g/int (g/inc (g/rexp)))
         :price     (g/pow 2 (g/random-int 16 20))
         :style-id  (g/int (g/rexp 1e-2))
         :brand-id  (g/int (g/rexp 1e-2))
         :year      2019
         :month     month
         ;; NOTE(review): the `inc` suggests g/random-int's upper bound is
         ;; exclusive, so days span 1..(max-days month) - confirm.
         :day       (g/random-int 1 (inc (max-days month)))})
      (g/with-column :date (g/to-date date-col))
      ;; NOTE(review): presumably collapses each month to a single partition
      ;; before writing - confirm.
      (g/coalesce 1)
      ;; Append mode, so every month adds to the same dataset.
      (g/write-parquet! data-path {:mode "append"})))

0 comments on commit 23f4292

Please sign in to comment.