Skip to content

Commit

Permalink
[QP, lib] Add new expression functions for host, domain, subdomain (m…
Browse files Browse the repository at this point in the history
…etabase#41540)

These functions are implemented with hairy regular expressions, and
it's more user-friendly and future-proof to name those functions in MBQL
rather than baking the `regexextract` and regex into the user's query.

It lets us evolve the regexes in the future if we detect a bug, and it
improves the UX since the user sees a meaningful function instead of

    regexextract([My URL Column], "(?<=[@\.])(?!www\.)[^@\.]+(?=\.[^@\.]{1,3}\.[^@\.]+$|\.[^@\.]+$)")

Also refactors the regexes somewhat so that they work for emails as well as URLs, and there's always just one layer of `:regex-match-first`.
Previously this was separated into two steps: URL or email to host, and host to (sub)domain.

Part of the follow-up for Extract Column epic metabase#38964.
  • Loading branch information
bshepherdson authored Apr 19, 2024
1 parent 46c4805 commit e435824
Show file tree
Hide file tree
Showing 9 changed files with 309 additions and 199 deletions.
21 changes: 21 additions & 0 deletions frontend/src/metabase-lib/v1/expressions/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,24 @@ export const MBQL_CLAUSES: MBQLClauseMap = {
trim: { displayName: `trim`, type: "string", args: ["string"] },
rtrim: { displayName: `rtrim`, type: "string", args: ["string"] },
ltrim: { displayName: `ltrim`, type: "string", args: ["string"] },
// URL and email extraction functions. Each takes a single string argument,
// returns a string, and is declared with `requiresFeature: "regex"`.
domain: {
displayName: `domain`,
type: "string",
args: ["string"],
requiresFeature: "regex",
},
subdomain: {
displayName: `subdomain`,
type: "string",
args: ["string"],
requiresFeature: "regex",
},
host: {
displayName: `host`,
type: "string",
args: ["string"],
requiresFeature: "regex",
},
// numeric functions
abs: {
displayName: `abs`,
Expand Down Expand Up @@ -460,6 +478,9 @@ export const EXPRESSION_FUNCTIONS = new Set([
"rtrim",
"ltrim",
"length",
"domain",
"subdomain",
"host",
// number
"abs",
"floor",
Expand Down
39 changes: 39 additions & 0 deletions frontend/src/metabase-lib/v1/expressions/helper-text-strings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,45 @@ const HELPER_TEXT_STRINGS: HelpTextConfig[] = [
},
],
},
// Help text for the URL/email extraction functions.
// `host` and `domain` are described as working on a URL or an email;
// `subdomain` is described for URLs only and documents that "www" is ignored.
{
name: "host",
structure: "host",
description: () =>
t`Extracts the host (domain name and TLD, eg. "metabase.com" from "status.metabase.com") from a URL or email`,
args: [
{
name: t`urlOrEmail`,
description: t`The URL or Email column to extract the host from.`,
example: formatIdentifier(t`Email`),
},
],
},
{
name: "domain",
structure: "domain",
description: () =>
t`Extracts the domain name (eg. "metabase") from a URL or email`,
args: [
{
name: t`urlOrEmail`,
description: t`The URL or Email column to extract domain names from.`,
example: formatIdentifier(t`Email`),
},
],
},
{
name: "subdomain",
structure: "subdomain",
description: () =>
t`Extracts the first subdomain (eg. "status" from "status.metabase.com", "" from "bbc.co.uk") from a URL. Ignores "www".`,
args: [
{
name: t`url`,
description: t`The URL column to extract the subdomain from.`,
example: formatIdentifier(t`ProfileImage`),
},
],
},
{
name: "abs",
structure: "abs",
Expand Down
14 changes: 12 additions & 2 deletions src/metabase/legacy_mbql/schema.cljc
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,8 @@

(def string-functions
"Functions that return string values. Should match [[StringExpression]]."
#{:substring :trim :rtrim :ltrim :upper :lower :replace :concat :regex-match-first :coalesce :case})
#{:substring :trim :rtrim :ltrim :upper :lower :replace :concat :regex-match-first :coalesce :case
:host :domain :subdomain})

(def ^:private StringExpression
"Schema for the definition of an string expression."
Expand Down Expand Up @@ -557,6 +558,15 @@
;; `:regex-match-first` takes a string expression `s` and a `pattern` string.
;; It is tagged as requiring both the :expressions and :regex features.
(defclause ^{:requires-features #{:expressions :regex}} regex-match-first
s StringExpressionArg, pattern :string)

;; URL/email helpers `:host`, `:domain` and `:subdomain`. Each takes a single string expression and carries the same
;; feature requirements as `regex-match-first`; `desugar-host-and-domain` in legacy_mbql/util.cljc rewrites each of
;; them into a `:regex-match-first` clause with a fixed pattern.
(defclause ^{:requires-features #{:expressions :regex}} host
s StringExpressionArg)

(defclause ^{:requires-features #{:expressions :regex}} domain
s StringExpressionArg)

(defclause ^{:requires-features #{:expressions :regex}} subdomain
s StringExpressionArg)

;; `:+` takes at least two `Addable` arguments (`x`, `y`) followed by any number of further `Addable`s (`more`).
(defclause ^{:requires-features #{:expressions}} +
x Addable, y Addable, more (rest Addable))

Expand Down Expand Up @@ -863,7 +873,7 @@
get-hour get-minute get-second))

(mr/def ::StringExpression
(one-of substring trim ltrim rtrim replace lower upper concat regex-match-first coalesce case))
(one-of substring trim ltrim rtrim replace lower upper concat regex-match-first coalesce case host domain subdomain))

(mr/def ::FieldOrExpressionDef
"Schema for anything that is accepted as a top-level expression definition, either an arithmetic expression such as a
Expand Down
91 changes: 88 additions & 3 deletions src/metabase/legacy_mbql/util.cljc
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@
[metabase.shared.util.i18n :as i18n]
[metabase.util.log :as log]
[metabase.util.malli :as mu]
#?@(:clj
[[metabase.models.dispatch :as models.dispatch]])))
#?@(:clj [[metabase.models.dispatch :as models.dispatch]])))

(defn qualified-name
"Like `name`, but if `x` is a namespace-qualified keyword, returns that a string including the namespace."
Expand Down Expand Up @@ -304,12 +303,98 @@
[:/ x y z & more]
(recur (into [:/ [:/ x y]] (cons z more)))))

(def ^:private host-regex
;; Extracts the "host" from a URL or an email.
;; By host we mean the main domain name and the TLD, eg. metabase.com, amazon.co.jp, bbc.co.uk.
;; For a URL, this is not the RFC3986 "host" (which would include any subdomains); the optional `:3000` port number
;; is left out as well.
;;
;; For an email, this is generally the part after the @, but it will skip any subdomains:
;; username@subdomain.mycompany.net -> mycompany.net
;;
;; The pattern has no capturing groups: the lookarounds delimit the match, so the whole match is the host.
;;
;; Referencing the indexes below:
;; 1. Positive lookbehind:
;; Just past one of:
;; 2. @ from an email or URL userinfo@ prefix
;; 3. // from a URL scheme
;; 4. . from a previous subdomain segment
;; 5. Start of string
;; 6. Negative lookahead: don't capture www as part of the domain
;; 7. Main domain segment
;; 8. Ending in a dot
;; 9. Optional short final segment (eg. co in .co.uk)
;; 10. Top-level domain
;; 11. Optional :port, /path, ?query or #hash
;; 12. Anchor to the end
;;1 2 3 4 5 6 7 8 9 10 11 12
#"(?<=@|//|\.|^)(?!www\.)[^@\.:/?#]+\.(?:[^@\.:/?#]{1,3}\.)?[^@\.:/?#]+(?=[:/?#].*$|$)")

(def ^:private domain-regex
;; Extracts the main domain name segment from a URL or an email, without the TLD: eg. "metabase" from
;; status.metabase.com, "mycompany" from mycompany.co.uk.
;; Deliberately no ^ at the start; there might be several subdomains before this spot.
;; By "short final segment" below, I mean a pseudo-TLD nested under a proper TLD. For example, the "co" in
;; mycompany.co.uk. It is recognised purely by length (1-3 characters).
;; This can accidentally capture a short domain name, eg. "subdomain.aol.com" -> "subdomain", oops.
;; But there's a load of these, not a short list we can include here, so it's either preprocess the (huge) master list
;; from Mozilla or accept that this regex is a bit best-effort.
;; Referencing the indexes below:
;; 1. Positive lookbehind:
;; Just past one of:
;; 2. @ from an email or URL userinfo@ prefix
;; 3. // from a URL scheme
;; 4. . from a previous subdomain segment
;; 5. Start of string
;; 6. Negative lookahead: don't capture www as the domain
;; 7. One domain segment
;; 8. Positive lookahead:
;; Either:
;; 9. Short final segment (eg. .co.uk)
;; 10. Top-level domain
;; 11. Optional :port, /path, ?query or #hash
;; 12. Anchor to end
;; Or:
;; 13. Top-level domain
;; 14. Optional :port, /path, ?query or #hash
;; 15. Anchor to end
;;1 2 3 4 5 6 7 (8 9 10 11 12| 13 14 15)
#"(?<=@|//|\.|^)(?!www\.)[^@\.:/?#]+(?=\.[^@\.:/?#]{1,3}\.[^@\.:/?#]+(?:[:/?#].*)?$|\.[^@\.:/?#]+(?:[:/?#].*)?$)")

(def ^:private subdomain-regex
  ;; Extracts the first subdomain segment that isn't "www" AND isn't the main domain name,
  ;; eg. "status" from https://status.metabase.com/path. There is no match for bbc.co.uk or metabase.com.
  ;; See [[domain-regex]] for more details about how the main domain name and short pseudo-TLD segments (the "co" in
  ;; .co.uk) are matched, and why that matching is best-effort.
  ;;
  ;; Every segment character class excludes `@` as well as `. : / ? #`, the same as [[host-regex]] and
  ;; [[domain-regex]]. Without the `@`, a URL with a userinfo@ prefix started matching right after `//` (or at the
  ;; start of the string) and swallowed the userinfo: https://user@sub.example.com gave "user@sub" instead of "sub".
  ;;
  ;; The pattern has no capturing groups; the whole match is the subdomain. Piece by piece:
  ;;   (?<=@|//|\.|^)      Positive lookbehind. Just past one of:
  ;;                         - @ from an email or URL userinfo@ prefix
  ;;                         - // from a URL scheme
  ;;                         - . from a previous subdomain segment
  ;;                         - start of string
  ;;   (?!www\.)           Negative lookahead: don't capture www as the subdomain
  ;;   (?! ... $)          Negative lookahead: don't capture the main domain name or part of the TLD.
  ;;                       That would look like:
  ;;                         [^@\.:/?#]+\.            the next segment we *would* capture, and its dot
  ;;                         (?:[^@\.:/?#]{1,3}\.)?   optional short segment, like "co" in .co.uk
  ;;                         [^@\.:/?#]+              top-level domain
  ;;                         (?:[:/?#].*)?            optionally more URL things: :port or /path or ?query or #fragment
  ;;                         $                        end of string
  ;;   [^@\.:/?#]+         Match the actual subdomain
  ;;   (?=\.)              Positive lookahead: the . after the subdomain, which we want to detect but not capture
  #"(?<=@|//|\.|^)(?!www\.)(?![^@\.:/?#]+\.(?:[^@\.:/?#]{1,3}\.)?[^@\.:/?#]+(?:[:/?#].*)?$)[^@\.:/?#]+(?=\.)")

;; Rewrites the `:host`, `:domain` and `:subdomain` sugar clauses found in `expression` into
;; `[:regex-match-first column <pattern>]`, where <pattern> is the string form of [[host-regex]], [[domain-regex]] or
;; [[subdomain-regex]] respectively. This is what lets the regexes change later without touching saved queries.
;;
;; Each branch uses `recur` on the rewritten clause -- presumably so that sugar clauses nested inside `column` are
;; rewritten too; verify against `lib.util.match/replace`.
;;
;; NOTE(review): `str` on a JVM regex Pattern yields the bare pattern source, but a ClojureScript RegExp stringifies
;; as `/.../`. This file is .cljc -- confirm this desugaring only ever runs on the JVM.
(defn- desugar-host-and-domain [expression]
(lib.util.match/replace expression
[:host column]
(recur [:regex-match-first column (str host-regex)])
[:domain column]
(recur [:regex-match-first column (str domain-regex)])
[:subdomain column]
(recur [:regex-match-first column (str subdomain-regex)])))

(mu/defn desugar-expression :- ::mbql.s/FieldOrExpressionDef
"Rewrite various 'syntactic sugar' expressions like `:/` with more than two args into something simpler for drivers
to compile."
[expression :- ::mbql.s/FieldOrExpressionDef]
(-> expression
desugar-divide-with-extra-args))
desugar-divide-with-extra-args
desugar-host-and-domain))

(defn- maybe-desugar-expression [clause]
(cond-> clause
Expand Down
71 changes: 11 additions & 60 deletions src/metabase/lib/drill_thru/column_extract.cljc
Original file line number Diff line number Diff line change
Expand Up @@ -39,54 +39,6 @@
{:key unit
:display-name (lib.temporal-bucket/describe-temporal-unit unit)}))))

(def ^:private url->host-regex
;; Matches a whole URL and captures (group 1) everything after an optional `scheme://` prefix up to the first
;; `/`, `?` or `#`.
;; protocol host etc.
#"^(?:[^:/?#]*:?//)?([^/?#]*).*$")

(def ^:private host->domain-regex
;; Captures (group 1) the main domain name segment at the end of a host string, skipping an optional `www.` prefix.
;; Deliberately no ^ at the start; there might be several subdomains before this spot.
;; By "short tail" below, I mean a pseudo-TLD nested under a proper TLD. For example, mycompany.co.uk.
;; This can accidentally capture a short domain name, eg. "subdomain.aol.com" -> "subdomain", oops.
;; But there's a load of these, not a short list we can include here, so it's either preprocess the (huge) master list
;; from Mozilla or accept that this regex is a bit best-effort.

;; Skip www domain maybe short tail TLD
#"(?:www\.)?([^\.]+)\.(?:[^\.]{1,3}\.)?[^\.]+$")

(def ^:private email->domain-regex
;; Matches the main domain name segment of an email address. There is no capturing group: lookarounds delimit the
;; match, so the whole match is the domain name.
;; See [[host->domain-regex]] on the challenges of parsing domains with regexes.
;; Referencing the indexes below:
;; 1. Positive lookbehind: Starting after @ or .
;; 2. Negative lookahead: Don't capture www as the domain
;; 3. One domain segment
;; 4. Positive lookahead:
;; Either:
;; 5. Short final segment (eg. .co.uk)
;; 6. Top-level domain
;; 7. Anchor to end
;; Or:
;; 8. Top-level domain
;; 9. Anchor to end
;;1 2 3 (4 5 6 7| 8 9)
#"(?<=[@\.])(?!www\.)[^@\.]+(?=\.[^@\.]{1,3}\.[^@\.]+$|\.[^@\.]+$)")

(def ^:private host->subdomain-regex
;; Captures (group 1) the leading subdomain segment of a host string.
;; This grabs the first segment that isn't "www", AND excludes the main domain name.
;; See [[host->domain-regex]] for more details about how those are matched.
;; Referencing the indexes below:
;; 1. Only at the start of the input
;; 2. Consume "www." if present
;; 3. Start capturing the subdomain we want
;; 4. Negative lookahead: That subdomain can't be "www"; we don't want to backtrack and find "www".
;; 5. Negative lookahead to make sure this isn't the proper domain:
;; 6. Main domain name
;; 7. Optional short tail (eg. co.uk)
;; 8. Top-level domain, ending the input
;; 9. Matching the actual subdomain
;; 10. And its dot, which is outside the capture.
;;12 34 5 6 7 8 9 10
#"^(?:www\.)?((?!www\.)(?![^\.]+\.(?:[^\.]{1,3}\.)?[^\.]+$)[^\.]+)\.")

(defn- regex-available?
  "Looks up `:regex` in the `:features` of the database behind `metadata-providerable` and returns the result of
  that lookup (truthy when the feature is present)."
  [metadata-providerable]
  (let [database (lib.metadata/database metadata-providerable)
        features (:features database)]
    (features :regex)))

Expand All @@ -99,13 +51,17 @@
;; If the target database doesn't support :regex feature, return nil.
(not (regex-available? query)) nil
(lib.types.isa/email? column) {:display-name (i18n/tru "Extract domain")
:extractions [{:key :email-domain
:display-name (i18n/tru "Domain")}]}
:extractions [{:key :domain
:display-name (i18n/tru "Domain")}
{:key :host
:display-name (i18n/tru "Host")}]}
(lib.types.isa/URL? column) {:display-name (i18n/tru "Extract domain, subdomain…")
:extractions [{:key :domain
:display-name (i18n/tru "Domain")}
{:key :subdomain
:display-name (i18n/tru "Subdomain")}]}))
:display-name (i18n/tru "Subdomain")}
{:key :host
:display-name (i18n/tru "Host")}]}))

(mu/defn column-extract-drill :- [:maybe ::lib.schema.drill-thru/drill-thru.column-extract]
"Column clicks on temporal columns only.
Expand Down Expand Up @@ -143,15 +99,10 @@
:month-of-year (case-expression #(lib.expression/get-month column) tag 12)
:quarter-of-year (case-expression #(lib.expression/get-quarter column) tag 4)
:year (lib.expression/get-year column)
;; URLs
:domain (-> column
(lib.expression/regex-match-first url->host-regex)
(lib.expression/regex-match-first host->domain-regex))
:subdomain (-> column
(lib.expression/regex-match-first url->host-regex)
(lib.expression/regex-match-first host->subdomain-regex))
;; Emails
:email-domain (lib.expression/regex-match-first column email->domain-regex)))
;; URLs and emails
:domain (lib.expression/domain column)
:subdomain (lib.expression/subdomain column)
:host (lib.expression/host column)))

(defmethod lib.drill-thru.common/drill-thru-method :drill-thru/column-extract
[_query _stage-number {:keys [query stage-number column extractions]} & [tag]]
Expand Down
5 changes: 4 additions & 1 deletion src/metabase/lib/expression.cljc
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@
(lib.common/defop - [x y & more])
(lib.common/defop * [x y & more])
;; Kondo gets confused
#_{:clj-kondo/ignore [:unresolved-namespace]}
#_{:clj-kondo/ignore [:unresolved-namespace :syntax]}
(lib.common/defop / [x y & more])
(lib.common/defop case [x y & more])
(lib.common/defop coalesce [x y & more])
Expand Down Expand Up @@ -285,6 +285,9 @@
;; Single-argument string operations.
(lib.common/defop rtrim [s])
(lib.common/defop upper [s])
(lib.common/defop lower [s])
;; URL/email extraction helpers, each taking one argument `s`. The column-extract drill calls these as
;; `(lib.expression/host column)`, `(lib.expression/domain column)` and `(lib.expression/subdomain column)`.
(lib.common/defop host [s])
(lib.common/defop domain [s])
(lib.common/defop subdomain [s])

(mu/defn ^:private expression-metadata :- lib.metadata/ColumnMetadata
[query :- ::lib.schema/query
Expand Down
4 changes: 4 additions & 0 deletions src/metabase/lib/schema/expression/string.cljc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
(mbql-clause/define-tuple-mbql-clause op :- :type/Text
[:schema [:ref ::expression/string]]))

;; URL/email extraction clauses: `:host`, `:domain` and `:subdomain` are each defined as a tuple clause over one
;; string expression, declared as returning :type/Text.
(doseq [op [:host :domain :subdomain]]
(mbql-clause/define-tuple-mbql-clause op :- :type/Text
[:schema [:ref ::expression/string]]))

;; `:length` is a tuple clause over one string expression, declared as returning :type/Integer.
(mbql-clause/define-tuple-mbql-clause :length :- :type/Integer
[:schema [:ref ::expression/string]])

Expand Down
Loading

0 comments on commit e435824

Please sign in to comment.