Use compressed bytes for caching size comparison
Previously we would serialize each row as a string and use the total
string length as an estimate of how big a cache entry would be. This
commit changes that to measure the actual compressed bytes that get
stored.
iethree committed Nov 6, 2018
1 parent c6b71f8 commit b66a905
Showing 7 changed files with 71 additions and 55 deletions.
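In outline: compress first, then compare the compressed byte count against the limit. The sketch below paraphrases the diffs that follow; `save!` is a hypothetical placeholder for the update-or-insert logic in the `:db` backend.

```clojure
;; Hedged paraphrase of the new caching gate, not the literal diff:
(let [compressed (models/compress results)                        ; nippy + snappy
      max-bytes  (* (public-settings/query-caching-max-kb) 1024)] ; setting is in KB
  (if (< (alength compressed) max-bytes)
    (save! compressed)                                            ; hypothetical helper
    (log/info "Results are too large to cache.")))
```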
15 changes: 15 additions & 0 deletions resources/migrations/000_migrations.yaml
@@ -5079,3 +5079,18 @@ databaseChangeLog:
                   name: settings
                   type: text
                   remarks: 'Serialized JSON FE-specific settings like formatting, etc. Scope of what is stored here may increase in future.'
+#
+# Change MySQL/Maria's blob type to LONGBLOB to more closely match what H2 and PostgreSQL support for size limits
+#
+  - changeSet:
+      id: 97
+      author: senior
+      comment: 'Added 0.32.0'
+      preconditions:
+        - onFail: MARK_RAN
+        - dbms: mysql, mariadb
+      changes:
+        - modifyDataType:
+            tableName: query_cache
+            columnName: results
+            newDataType: longblob
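For context: MySQL's plain BLOB tops out at 2^16 - 1 bytes (~64 KB) and MEDIUMBLOB at 2^24 - 1 (~16 MB), while LONGBLOB allows up to 2^32 - 1 (~4 GB), so after this migration the column can hold anything the 200 MB global cap introduced below permits.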
4 changes: 3 additions & 1 deletion src/metabase/models/interface.clj
@@ -109,7 +109,9 @@
 (defn compress
   "Compress OBJ, returning a byte array."
   [obj]
-  (nippy/freeze obj {:compressor nippy/snappy-compressor}))
+  (if (bytes? obj)
+    obj
+    (nippy/freeze obj {:compressor nippy/snappy-compressor})))
 
 (defn decompress
   "Decompress COMPRESSED-BYTES."
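Worth noting: the `bytes?` guard makes `compress` a no-op on data that is already a byte array, which is what lets the `:db` backend below compress explicitly and then hand the same bytes back through Toucan's normal insert path without double compression. A quick REPL sketch, assuming this namespace is loadable as `models`:

```clojure
(require '[metabase.models.interface :as models])

(let [compressed (models/compress {:rows [[1 "ABCDEF"]]})]
  ;; already-compressed bytes pass straight through, same array back
  (identical? compressed (models/compress compressed))) ;=> true
```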
16 changes: 15 additions & 1 deletion src/metabase/public_settings.clj
@@ -92,13 +92,27 @@
   :type    :boolean
   :default false)
 
+(def ^:private ^:const global-max-caching-kb
+  "Although, depending on the database, we can support much larger cached values (1GB for PG, 2GB for H2 and 4GB for
+  MySQL), we are not currently set up to deal with data of that size. The datatypes we are using will hold this data
+  in memory and will not truly be streaming. This is a global max in order to prevent our users from setting the
+  caching value so high it becomes a performance issue. The value below represents 200MB"
+  (* 200 1024))
+
 (defsetting query-caching-max-kb
   (tru "The maximum size of the cache, per saved question, in kilobytes:")
   ;; (This size is a measurement of the length of *uncompressed* serialized result *rows*. The actual size of
   ;; the results as stored will vary somewhat, since this measurement doesn't include metadata returned with the
   ;; results, and doesn't consider whether the results are compressed, as the `:db` backend does.)
   :type    :integer
-  :default 1000)
+  :default 1000
+  :setter  (fn [new-value]
+             (when (> new-value global-max-caching-kb)
+               (throw (IllegalArgumentException.
+                       (str
+                        (tru "Failed setting `query-caching-max-kb` to {0}. " new-value)
+                        (tru "Values greater than {0} are not allowed." global-max-caching-kb)))))
+             (setting/set-integer! :query-caching-max-kb new-value)))
 
 (defsetting query-caching-max-ttl
   (tru "The absolute maximum time to keep any cached query results, in seconds.")
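Assuming `defsetting` generates a fn that acts as a getter with no args and a setter with one (which is how the rest of `public-settings` reads), the new `:setter` guard behaves roughly like this at a REPL; the over-limit value is just illustrative:

```clojure
(public-settings/query-caching-max-kb)              ;=> 1000 (the default)
(public-settings/query-caching-max-kb 2000)         ; ok, under the 204800 KB global cap
(public-settings/query-caching-max-kb (* 300 1024)) ; throws IllegalArgumentException:
;; "Failed setting `query-caching-max-kb` to 307200. Values greater than 204800 are not allowed."
```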
21 changes: 1 addition & 20 deletions src/metabase/query_processor/middleware/cache.clj
@@ -89,26 +89,8 @@
   (boolean (and (public-settings/enable-query-caching)
                 cache-ttl)))
 
-(defn- results-are-below-max-byte-threshold?
-  "Measure the size of the `:rows` in QUERY-RESULTS and see whether they're smaller than `query-caching-max-kb`
-  *before* compression."
-  ^Boolean [{{rows :rows} :data}]
-  (let [max-bytes (* (public-settings/query-caching-max-kb) 1024)]
-    ;; We don't want to serialize the entire result set since that could explode if the query is one that returns a
-    ;; huge number of rows. (We also want to keep `:rows` lazy.)
-    ;; So we'll serialize one row at a time, and keep a running total of bytes; if we pass the `query-caching-max-kb`
-    ;; threshold, we'll fail right away.
-    (loop [total-bytes 0, [row & more] rows]
-      (cond
-        (> total-bytes max-bytes) false
-        (not row)                 true
-        :else                     (recur (+ total-bytes (count (str row)))
-                                         more)))))
-
 (defn- save-results-if-successful! [query-hash results]
-  (when (and (= (:status results) :completed)
-             (or (results-are-below-max-byte-threshold? results)
-                 (log/info "Results are too large to cache." (u/emoji "😫"))))
+  (when (= (:status results) :completed)
     (save-results! query-hash results)))
 
 (defn- run-query-and-save-results-if-successful! [query-hash qp query]
@@ -127,7 +109,6 @@
     (or (cached-results query-hash cache-ttl)
        (run-query-and-save-results-if-successful! query-hash qp query))))
 
-
 (defn maybe-return-cached-results
   "Middleware for caching results of a query if applicable.
   In order for a query to be eligible for caching:
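What's lost and gained by deleting `results-are-below-max-byte-threshold?`: the old loop could bail out early without realizing a huge lazy `:rows` seq (see the lazy-seq test removed further down), while the new backend check has to compress everything first but measures the bytes that are actually stored. A rough REPL comparison of the two measurements, reusing the `models` alias from the earlier sketch (exact compressed sizes will vary with nippy/snappy versions):

```clojure
(let [rows (vec (repeat 500 [1 "ABCDEF"]))]
  {:old-estimate-bytes (reduce + (map (comp count str) rows))     ; length of each row's string form
   :new-measured-bytes (alength (models/compress {:rows rows}))}) ; size of the compressed payload
```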
29 changes: 21 additions & 8 deletions src/metabase/query_processor/middleware/cache_backend/db.clj
@@ -1,9 +1,11 @@
 (ns metabase.query-processor.middleware.cache-backend.db
-  (:require [metabase.models
+  (:require [clojure.tools.logging :as log]
+            [metabase.models
              [interface :as models]
              [query-cache :refer [QueryCache]]]
             [metabase.public-settings :as public-settings]
             [metabase.query-processor.middleware.cache-backend.interface :as i]
+            [metabase.util :as u]
             [metabase.util.date :as du]
             [toucan.db :as db]))
 
@@ -23,17 +25,28 @@
      :updated_at [:<= (du/->Timestamp (- (System/currentTimeMillis)
                                          (* 1000 (public-settings/query-caching-max-ttl))))]))
 
+
+(defn- results-below-max-threshold? [compressed-result-bytes]
+  (let [max-bytes (* (public-settings/query-caching-max-kb) 1024)]
+    (> max-bytes (alength compressed-result-bytes))))
+
 (defn- save-results!
   "Save the RESULTS of query with QUERY-HASH, updating an existing QueryCache entry
   if one already exists, otherwise creating a new entry."
   [query-hash results]
-  (purge-old-cache-entries!)
-  (or (db/update-where! QueryCache {:query_hash query-hash}
-        :updated_at (du/new-sql-timestamp)
-        :results (models/compress results)) ; have to manually call these here since Toucan doesn't call type conversion fns for update-where! (yet)
-      (db/insert! QueryCache
-        :query_hash query-hash
-        :results results))
+  ;; Explicitly compressing the results here rather than having Toucan compress it automatically. This allows us to
+  ;; get the size of the compressed output to decide whether or not to store it.
+  (let [compressed-results (models/compress results)]
+    (if (results-below-max-threshold? compressed-results)
+      (do
+        (purge-old-cache-entries!)
+        (or (db/update-where! QueryCache {:query_hash query-hash}
+              :updated_at (du/new-sql-timestamp)
+              :results compressed-results)
+            (db/insert! QueryCache
+              :query_hash query-hash
+              :results compressed-results)))
+      (log/info "Results are too large to cache." (u/emoji "😫"))))
  :ok)
 
 (def instance
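One detail of `results-below-max-threshold?` worth calling out: the comparison is strict, so a payload exactly at the limit is rejected, and `alength` reads the Java array's length field directly (for byte arrays `count` returns the same number). A trivial REPL check:

```clojure
(let [ba (byte-array (* 200 1024))]
  [(alength ba) (count ba)]) ;=> [204800 204800]
```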
16 changes: 16 additions & 0 deletions test/metabase/query_processor/middleware/cache_backend/db_test.clj
@@ -0,0 +1,16 @@
+(ns metabase.query-processor.middleware.cache-backend.db-test
+  (:require [expectations :refer :all]
+            [metabase.query-processor.middleware.cache-backend.db :as cache-db]
+            [metabase.test.util :as tu]))
+
+(defn- in-kb [x]
+  (* 1024 x))
+
+(expect
+  (tu/with-temporary-setting-values [query-caching-max-kb 128]
+    (#'cache-db/results-below-max-threshold? (byte-array (in-kb 100)))))
+
+(expect
+  false
+  (tu/with-temporary-setting-values [query-caching-max-kb 1]
+    (#'cache-db/results-below-max-threshold? (byte-array (in-kb 2)))))
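A note on the `#'cache-db/results-below-max-threshold?` calls above: the fn is private, so the tests go through its var, which is invokable and delegates to the fn it holds, sidestepping the privacy check that a plain `cache-db/...` reference from another namespace would trip. The same mechanism in miniature:

```clojure
(defn- secret-add
  "A private fn, invisible to direct cross-namespace references."
  [a b]
  (+ a b))

;; calling through the var works from anywhere
(#'secret-add 1 2) ;=> 3
```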
25 changes: 0 additions & 25 deletions test/metabase/query_processor/middleware/cache_test.clj
@@ -54,31 +54,6 @@
   (tu/with-temporary-setting-values [enable-query-caching true]
     (#'cache/is-cacheable? {:cache-ttl nil})))
 
-
-;;; ------------------------------------- results-are-below-max-byte-threshold? --------------------------------------
-
-(expect
-  (tu/with-temporary-setting-values [query-caching-max-kb 128]
-    (#'cache/results-are-below-max-byte-threshold? {:data {:rows [[1 "ABCDEF"]
-                                                                  [3 "GHIJKL"]]}})))
-
-(expect
-  false
-  (tu/with-temporary-setting-values [query-caching-max-kb 1]
-    (#'cache/results-are-below-max-byte-threshold? {:data {:rows (repeat 500 [1 "ABCDEF"])}})))
-
-;; check that `#'cache/results-are-below-max-byte-threshold?` is lazy and fails fast if the query is over the
-;; threshold rather than serializing the entire thing
-(expect
-  false
-  (let [lazy-seq-realized? (atom false)]
-    (tu/with-temporary-setting-values [query-caching-max-kb 1]
-      (#'cache/results-are-below-max-byte-threshold? {:data {:rows (lazy-cat (repeat 500 [1 "ABCDEF"])
-                                                                             (do (reset! lazy-seq-realized? true)
-                                                                                 [2 "GHIJKL"]))}})
-      @lazy-seq-realized?)))
-
-
 ;;; ------------------------------------------ End-to-end middleware tests -------------------------------------------
 
 ;; if there's nothing in the cache, cached results should *not* be returned
