forked from metabase/metabase
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[QP, lib] Add new expression functions for host, domain, subdomain (metabase#41540)
These functions are implemented with hairy regular expressions, and it's more user-friendly and future-proof to name those functions in MBQL rather than baking the `regexextract` and regex into the user's query. It lets us evolve the regexes in the future if we detect a bug, and it improves the UX since the user sees a meaningful function instead of regexextract([My URL Column], "(?<=[@\.])(?!www\.)[^@\.]+(?=\.[^@\.]{1,3}\.[^@\.]+$|\.[^@\.]+$)") Also refactors the regexes somewhat so that they work for emails as well as URLs, and there's always just one layer of `:regex-match-first`. Previously this was separated into two steps: URL or email to host, and host to (sub)domain. Part of the follow-up for Extract Column epic metabase#38964.
- Loading branch information
1 parent
46c4805
commit e435824
Showing
9 changed files
with
309 additions
and
199 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,8 +11,7 @@ | |
[metabase.shared.util.i18n :as i18n] | ||
[metabase.util.log :as log] | ||
[metabase.util.malli :as mu] | ||
#?@(:clj | ||
[[metabase.models.dispatch :as models.dispatch]]))) | ||
#?@(:clj [[metabase.models.dispatch :as models.dispatch]]))) | ||
|
||
(defn qualified-name | ||
"Like `name`, but if `x` is a namespace-qualified keyword, returns that a string including the namespace." | ||
|
@@ -304,12 +303,98 @@ | |
[:/ x y z & more] | ||
(recur (into [:/ [:/ x y]] (cons z more))))) | ||
|
||
(def ^:private host-regex | ||
;; Extracts the "host" from a URL or an email. | ||
;; By host we mean the main domain name and the TLD, eg. metabase.com, amazon.co.jp, bbc.co.uk. | ||
;; For a URL, this is not the RFC3986 "host", which would include any subdomains and the optional `:3000` port number. | ||
;; | ||
;; For an email, this is generally the part after the @, but it will skip any subdomains: | ||
;; someone@email.mycompany.net -> mycompany.net | ||
;; | ||
;; Best-effort: a 1-3 character second-to-last segment is always treated as part of the TLD (see index 9), | ||
;; so a short domain name under a subdomain is not split off: abc.def.com -> abc.def.com | ||
;; | ||
;; Referencing the indexes below: | ||
;; 1. Positive lookbehind: | ||
;; Just past one of: | ||
;; 2. @ from an email or URL userinfo@ prefix | ||
;; 3. // from a URL scheme | ||
;; 4. . from a previous subdomain segment | ||
;; 5. Start of string | ||
;; 6. Negative lookahead: don't capture www as part of the domain | ||
;; 7. Main domain segment | ||
;; 8. Ending in a dot | ||
;; 9. Optional short final segment (eg. co in .co.uk) | ||
;; 10. Top-level domain | ||
;; 11. Optional :port, /path, ?query or #hash | ||
;; 12. Anchor to the end | ||
;;1   2 3  4  5 6        7          8 9                     10         11           12 | ||
#"(?<=@|//|\.|^)(?!www\.)[^@\.:/?#]+\.(?:[^@\.:/?#]{1,3}\.)?[^@\.:/?#]+(?=[:/?#].*$|$)") | ||
|
||
(def ^:private domain-regex | ||
;; Extracts the main domain name from a URL or an email, without subdomains or the TLD, eg. metabase.com -> metabase. | ||
;; Deliberately no ^ at the start; there might be several subdomains before this spot. | ||
;; By "short tail" below, I mean a pseudo-TLD nested under a proper TLD. For example, mycompany.co.uk. | ||
;; This can accidentally capture a short domain name, eg. "subdomain.aol.com" -> "subdomain", oops. | ||
;; But there's a load of these, not a short list we can include here, so it's either preprocess the (huge) master list | ||
;; from Mozilla or accept that this regex is a bit best-effort. | ||
;; Referencing the indexes below: | ||
;; 1. Positive lookbehind: | ||
;; Just past one of: | ||
;; 2. @ from an email or URL userinfo@ prefix | ||
;; 3. // from a URL scheme | ||
;; 4. . from a previous subdomain segment | ||
;; 5. Start of string | ||
;; 6. Negative lookahead: don't capture www as the domain | ||
;; 7. One domain segment | ||
;; 8. Positive lookahead: | ||
;; Either: | ||
;; 9. Short final segment (eg. .co.uk) | ||
;; 10. Top-level domain | ||
;; 11. Optional :port, /path, ?query or #hash | ||
;; 12. Anchor to end | ||
;; Or: | ||
;; 13. Top-level domain | ||
;; 14. Optional :port, /path, ?query or #hash | ||
;; 15. Anchor to end | ||
;;1   2 3  4  5 6        7          (8 9                10           11          12| 13          14          15) | ||
#"(?<=@|//|\.|^)(?!www\.)[^@\.:/?#]+(?=\.[^@\.:/?#]{1,3}\.[^@\.:/?#]+(?:[:/?#].*)?$|\.[^@\.:/?#]+(?:[:/?#].*)?$)") | ||
|
||
(def ^:private subdomain-regex
  ;; Extracts the first subdomain from a URL or an email.
  ;; This grabs the first segment that isn't "www", AND excludes the main domain name.
  ;; See [[domain-regex]] for more details about how those are matched.
  ;;
  ;; Every segment character class excludes @, exactly like [[host-regex]] and [[domain-regex]]. Without that, the
  ;; start-of-string alternative of the lookbehind lets a segment run straight across the @, swallowing the email
  ;; local part or URL userinfo:
  ;;   user@sub.example.com         -> sub   (not user@sub)
  ;;   https://user@sub.example.com -> sub   (not user@sub)
  ;;
  ;; Referencing the indexes below:
  ;; 1.  Positive lookbehind:
  ;;       Just past one of:
  ;; 2.      @  from an email or URL userinfo@ prefix
  ;; 3.      // from a URL scheme
  ;; 4.      .  from a previous subdomain segment
  ;; 5.      Start of string
  ;; 6.  Negative lookahead: don't capture www as the domain
  ;; 7.  Negative lookahead: don't capture the main domain name or part of the TLD
  ;;       That would look like:
  ;; 8.      The next segment we *would* capture as the subdomain
  ;; 9.      Optional short segment, like "co" in .co.uk
  ;; 10.     Top-level domain
  ;; 11.     Optionally more URL things: :port or /path or ?query or #fragment
  ;; 12.     End of string
  ;; 13. Match the actual subdomain
  ;; 14. Positive lookahead: the . after the subdomain, which we want to detect but not capture.
  ;;1   2 3  4  5 6        7  8            9                     10         11          12 13         14
  #"(?<=@|//|\.|^)(?!www\.)(?![^@\.:/?#]+\.(?:[^@\.:/?#]{1,3}\.)?[^@\.:/?#]+(?:[:/?#].*)?$)[^@\.:/?#]+(?=\.)")
|
||
;; Rewrites the `:host`, `:domain` and `:subdomain` sugar clauses found in `expression` into a | ||
;; `:regex-match-first` clause on the same column, using the string form of [[host-regex]], | ||
;; [[domain-regex]] or [[subdomain-regex]] respectively as the pattern argument. | ||
;; NOTE(review): each replacement is wrapped in `recur`, which presumably re-runs the replacement on the | ||
;; rewritten clause so that sugar nested inside `column` is desugared too -- confirm against `lib.util.match/replace`. | ||
(defn- desugar-host-and-domain [expression] | ||
(lib.util.match/replace expression | ||
[:host column] | ||
(recur [:regex-match-first column (str host-regex)]) | ||
[:domain column] | ||
(recur [:regex-match-first column (str domain-regex)]) | ||
[:subdomain column] | ||
(recur [:regex-match-first column (str subdomain-regex)]))) | ||
|
||
(mu/defn desugar-expression :- ::mbql.s/FieldOrExpressionDef | ||
"Rewrite various 'syntactic sugar' expressions like `:/` with more than two args into something simpler for drivers | ||
to compile." | ||
[expression :- ::mbql.s/FieldOrExpressionDef] | ||
;; The expression is threaded through each desugaring step in order; each step returns a rewritten expression. | ||
;; NOTE(review): this diff view shows both sides of the hunk. The first `desugar-divide-with-extra-args))` line | ||
;; appears to be the removed (pre-change) ending of the pipeline, and the two lines after it the added ending, | ||
;; which additionally applies `desugar-host-and-domain` -- confirm against the real file. | ||
(-> expression | ||
desugar-divide-with-extra-args)) | ||
desugar-divide-with-extra-args | ||
desugar-host-and-domain)) | ||
|
||
(defn- maybe-desugar-expression [clause] | ||
(cond-> clause | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.