[QP, lib] Add new expression functions for host, domain, subdomain (metabase#41540)

These functions are implemented with hairy regular expressions, and it's more user-friendly and future-proof to name those functions in MBQL rather than baking the `regexextract` and regex into the user's query. It lets us evolve the regexes in the future if we detect a bug, and it improves the UX since the user sees a meaningful function instead of

regexextract([My URL Column], "(?<=[@\.])(?!www\.)[^@\.]+(?=\.[^@\.]{1,3}\.[^@\.]+$|\.[^@\.]+$)")

Also refactors the regexes somewhat so that they work for emails as well as URLs, and there's always just one layer of `:regex-match-first`. Previously this was separated into two steps: URL or email to host, and host to (sub)domain.

Part of the follow-up for Extract Column epic metabase#38964.
marregui · Apr 19, 2024 · e435824 · e435824
1 parent 46c4805
commit e435824
Show file tree

Hide file tree

Showing 9 changed files with 309 additions and 199 deletions.
diff --git a/frontend/src/metabase-lib/v1/expressions/config.ts b/frontend/src/metabase-lib/v1/expressions/config.ts
@@ -130,6 +130,24 @@ export const MBQL_CLAUSES: MBQLClauseMap = {
   trim: { displayName: `trim`, type: "string", args: ["string"] },
   rtrim: { displayName: `rtrim`, type: "string", args: ["string"] },
   ltrim: { displayName: `ltrim`, type: "string", args: ["string"] },
+  domain: {
+    displayName: `domain`,
+    type: "string",
+    args: ["string"],
+    requiresFeature: "regex",
+  },
+  subdomain: {
+    displayName: `subdomain`,
+    type: "string",
+    args: ["string"],
+    requiresFeature: "regex",
+  },
+  host: {
+    displayName: `host`,
+    type: "string",
+    args: ["string"],
+    requiresFeature: "regex",
+  },
   // numeric functions
   abs: {
     displayName: `abs`,
@@ -460,6 +478,9 @@ export const EXPRESSION_FUNCTIONS = new Set([
   "rtrim",
   "ltrim",
   "length",
+  "domain",
+  "subdomain",
+  "host",
   // number
   "abs",
   "floor",

diff --git a/frontend/src/metabase-lib/v1/expressions/helper-text-strings.ts b/frontend/src/metabase-lib/v1/expressions/helper-text-strings.ts
@@ -393,6 +393,45 @@ const HELPER_TEXT_STRINGS: HelpTextConfig[] = [
       },
     ],
   },
+  {
+    name: "host",
+    structure: "host",
+    description: () =>
+      t`Extracts the host (domain name and TLD, eg. "metabase.com" from "status.metabase.com") from a URL or email`,
+    args: [
+      {
+        name: t`urlOrEmail`,
+        description: t`The URL or Email column to extract the host from.`,
+        example: formatIdentifier(t`Email`),
+      },
+    ],
+  },
+  {
+    name: "domain",
+    structure: "domain",
+    description: () =>
+      t`Extracts the domain name (eg. "metabase") from a URL or email`,
+    args: [
+      {
+        name: t`urlOrEmail`,
+        description: t`The URL or Email column to extract domain names from.`,
+        example: formatIdentifier(t`Email`),
+      },
+    ],
+  },
+  {
+    name: "subdomain",
+    structure: "subdomain",
+    description: () =>
+      t`Extracts the first subdomain (eg. "status" from "status.metabase.com", "" from "bbc.co.uk") from a URL. Ignores "www".`,
+    args: [
+      {
+        name: t`url`,
+        description: t`The URL column to extract the subdomain from.`,
+        example: formatIdentifier(t`ProfileImage`),
+      },
+    ],
+  },
   {
     name: "abs",
     structure: "abs",

diff --git a/src/metabase/legacy_mbql/schema.cljc b/src/metabase/legacy_mbql/schema.cljc
@@ -380,7 +380,8 @@
 
 (def string-functions
   "Functions that return string values. Should match [[StringExpression]]."
-  #{:substring :trim :rtrim :ltrim :upper :lower :replace :concat :regex-match-first :coalesce :case})
+  #{:substring :trim :rtrim :ltrim :upper :lower :replace :concat :regex-match-first :coalesce :case
+    :host :domain :subdomain})
 
 (def ^:private StringExpression
   "Schema for the definition of an string expression."
@@ -557,6 +558,15 @@
 (defclause ^{:requires-features #{:expressions :regex}} regex-match-first
   s StringExpressionArg, pattern :string)
 
+(defclause ^{:requires-features #{:expressions :regex}} host
+  s StringExpressionArg)
+
+(defclause ^{:requires-features #{:expressions :regex}} domain
+  s StringExpressionArg)
+
+(defclause ^{:requires-features #{:expressions :regex}} subdomain
+  s StringExpressionArg)
+
 (defclause ^{:requires-features #{:expressions}} +
   x Addable, y Addable, more (rest Addable))
 
@@ -863,7 +873,7 @@
           get-hour get-minute get-second))
 
 (mr/def ::StringExpression
-  (one-of substring trim ltrim rtrim replace lower upper concat regex-match-first coalesce case))
+  (one-of substring trim ltrim rtrim replace lower upper concat regex-match-first coalesce case host domain subdomain))
 
 (mr/def ::FieldOrExpressionDef
   "Schema for anything that is accepted as a top-level expression definition, either an arithmetic expression such as a

diff --git a/src/metabase/legacy_mbql/util.cljc b/src/metabase/legacy_mbql/util.cljc
@@ -11,8 +11,7 @@
    [metabase.shared.util.i18n :as i18n]
    [metabase.util.log :as log]
    [metabase.util.malli :as mu]
-   #?@(:clj
-       [[metabase.models.dispatch :as models.dispatch]])))
+   #?@(:clj [[metabase.models.dispatch :as models.dispatch]])))
 
 (defn qualified-name
   "Like `name`, but if `x` is a namespace-qualified keyword, returns that a string including the namespace."
@@ -304,12 +303,98 @@
     [:/ x y z & more]
     (recur (into [:/ [:/ x y]] (cons z more)))))
 
+(def ^:private host-regex
+  ;; Extracts the "host" from a URL or an email.
+  ;; By host we mean the main domain name and the TLD, eg. metabase.com, amazon.co.jp, bbc.co.uk.
+  ;; For a URL, this is not the RFC3986 "host", which would include any subdomains and the optional `:3000` port number.
+  ;;
+  ;; For an email, this is generally the part after the @, but it will skip any subdomains:
+  ;;   someone@mail.mycompany.net -> mycompany.net
+  ;;
+  ;; Referencing the indexes below:
+  ;; 1.  Positive lookbehind:
+  ;;       Just past one of:
+  ;; 2.      @  from an email or URL userinfo@ prefix
+  ;; 3.      // from a URL scheme
+  ;; 4.      .  from a previous subdomain segment
+  ;; 5.      Start of string
+  ;; 6.  Negative lookahead: don't capture www as part of the domain
+  ;; 7.  Main domain segment
+  ;; 8.  Ending in a dot
+  ;; 9.  Optional short final segment (eg. co in .co.uk)
+  ;; 10. Top-level domain
+  ;; 11. Optional :port, /path, ?query or #hash
+  ;; 12. Anchor to the end
+  ;;1   2 3  4  5 6        7          8 9                     10         11           12
+  #"(?<=@|//|\.|^)(?!www\.)[^@\.:/?#]+\.(?:[^@\.:/?#]{1,3}\.)?[^@\.:/?#]+(?=[:/?#].*$|$)")
+
+(def ^:private domain-regex
+  ;; Deliberately no ^ at the start; there might be several subdomains before this spot.
+  ;; By "short final segment" below, I mean a pseudo-TLD nested under a proper TLD. For example, the "co" in mycompany.co.uk.
+  ;; This can accidentally capture a short domain name, eg. "subdomain.aol.com" -> "subdomain", oops.
+  ;; But there's a load of these, not a short list we can include here, so it's either preprocess the (huge) master list
+  ;; from Mozilla or accept that this regex is a bit best-effort.
+  ;; Referencing the indexes below:
+  ;; 1.  Positive lookbehind:
+  ;;       Just past one of:
+  ;; 2.      @  from an email or URL userinfo@ prefix
+  ;; 3.      // from a URL scheme
+  ;; 4.      .  from a previous subdomain segment
+  ;; 5.      Start of string
+  ;; 6.  Negative lookahead: don't capture www as the domain
+  ;; 7.  One domain segment
+  ;; 8.  Positive lookahead:
+  ;;       Either:
+  ;; 9.      Short final segment (eg. .co.uk)
+  ;; 10.     Top-level domain
+  ;; 11.     Optional :port, /path, ?query or #hash
+  ;; 12.     Anchor to end
+  ;;       Or:
+  ;; 13.     Top-level domain
+  ;; 14.     Optional :port, /path, ?query or #hash
+  ;; 15.     Anchor to end
+  ;;1   2 3  4  5 6        7          (8   9                10         11          12|  13         14           15)
+  #"(?<=@|//|\.|^)(?!www\.)[^@\.:/?#]+(?=\.[^@\.:/?#]{1,3}\.[^@\.:/?#]+(?:[:/?#].*)?$|\.[^@\.:/?#]+(?:[:/?#].*)?$)")
+
+(def ^:private subdomain-regex
+  ;; This grabs the first segment that isn't "www", AND excludes the main domain name.
+  ;; See [[domain-regex]] for more details about how those are matched.
+  ;; Referencing the indexes below:
+  ;; 1.  Positive lookbehind:
+  ;;       Just past one of:
+  ;; 2.      @  from an email or URL userinfo@ prefix
+  ;; 3.      // from a URL scheme
+  ;; 4.      .  from a previous subdomain segment
+  ;; 5.      Start of string
+  ;; 6.  Negative lookahead: don't capture www as the subdomain
+  ;; 7.  Negative lookahead: don't capture the main domain name or part of the TLD
+  ;;       That would look like:
+  ;; 8.      The next segment we *would* capture as the subdomain
+  ;; 9.      Optional short segment, like "co" in .co.uk
+  ;; 10.     Top-level domain
+  ;; 11.     Optionally more URL things: :port or /path or ?query or #fragment
+  ;; 12.     End of string
+  ;; 13. Match the actual subdomain
+  ;; 14. Positive lookahead: the . after the subdomain, which we want to detect but not capture.
+  ;;1   2 3  4  5 6        7  8           9                    10        11           12 13       14
+  #"(?<=@|//|\.|^)(?!www\.)(?![^\.:/?#]+\.(?:[^\.:/?#]{1,3}\.)?[^\.:/?#]+(?:[:/?#].*)?$)[^\.:/?#]+(?=\.)")
+
+(defn- desugar-host-and-domain [expression]
+  (lib.util.match/replace expression
+    [:host column]
+    (recur [:regex-match-first column (str host-regex)])
+    [:domain column]
+    (recur [:regex-match-first column (str domain-regex)])
+    [:subdomain column]
+    (recur [:regex-match-first column (str subdomain-regex)])))
+
 (mu/defn desugar-expression :- ::mbql.s/FieldOrExpressionDef
   "Rewrite various 'syntactic sugar' expressions like `:/` with more than two args into something simpler for drivers
   to compile."
   [expression :- ::mbql.s/FieldOrExpressionDef]
   (-> expression
-      desugar-divide-with-extra-args))
+      desugar-divide-with-extra-args
+      desugar-host-and-domain))
 
 (defn- maybe-desugar-expression [clause]
   (cond-> clause

diff --git a/src/metabase/lib/drill_thru/column_extract.cljc b/src/metabase/lib/drill_thru/column_extract.cljc
@@ -39,54 +39,6 @@
            {:key          unit
             :display-name (lib.temporal-bucket/describe-temporal-unit unit)}))))
 
-(def ^:private url->host-regex
-  ;;    protocol       host    etc.
-  #"^(?:[^:/?#]*:?//)?([^/?#]*).*$")
-
-(def ^:private host->domain-regex
-  ;; Deliberately no ^ at the start; there might be several subdomains before this spot.
-  ;; By "short tail" below, I mean a pseudo-TLD nested under a proper TLD. For example, mycompany.co.uk.
-  ;; This can accidentally capture a short domain name, eg. "subdomain.aol.com" -> "subdomain", oops.
-  ;; But there's a load of these, not a short list we can include here, so it's either preprocess the (huge) master list
-  ;; from Mozilla or accept that this regex is a bit best-effort.
-
-  ;; Skip www  domain   maybe short tail  TLD
-  #"(?:www\.)?([^\.]+)\.(?:[^\.]{1,3}\.)?[^\.]+$")
-
-(def ^:private email->domain-regex
-  ;; See [[host->domain-regex]] on the challenges of parsing domains with regexes.
-  ;; Referencing the indexes below:
-  ;; 1. Positive lookbehind: Starting after @ or .
-  ;; 2. Negative lookahead: Don't capture www as the domain
-  ;; 3. One domain segment
-  ;; 4. Positive lookahead:
-  ;;      Either:
-  ;; 5.     Short final segment (eg. .co.uk)
-  ;; 6.     Top-level domain
-  ;; 7.     Anchor to end
-  ;;      Or:
-  ;; 8.     Top-level domain
-  ;; 9.     Anchor to end
-  ;;1         2        3      (4   5            6      7|  8      9)
-  #"(?<=[@\.])(?!www\.)[^@\.]+(?=\.[^@\.]{1,3}\.[^@\.]+$|\.[^@\.]+$)")
-
-(def ^:private host->subdomain-regex
-  ;; This grabs the first segment that isn't "www", AND excludes the main domain name.
-  ;; See [[host->domain-regex]] for more details about how those are matched.
-  ;; Referencing the indexes below:
-  ;; 1.  Only at the start of the input
-  ;; 2.  Consume "www." if present
-  ;; 3.  Start capturing the subdomain we want
-  ;; 4.  Negative lookahead: That subdomain can't be "www"; we don't want to backtrack and find "www".
-  ;; 5.  Negative lookahead to make sure this isn't the proper domain:
-  ;; 6.      Main domain name
-  ;; 7.      Optional short tail (eg. co.uk)
-  ;; 8.      Top-level domain, ending the input
-  ;; 9.  Matching the actual subdomain
-  ;; 10. And its dot, which is outside the capture.
-  ;;12         34        5  6       7                8       9      10
-  #"^(?:www\.)?((?!www\.)(?![^\.]+\.(?:[^\.]{1,3}\.)?[^\.]+$)[^\.]+)\.")
-
 (defn- regex-available? [metadata-providerable]
   ((:features (lib.metadata/database metadata-providerable)) :regex))
 
@@ -99,13 +51,17 @@
     ;; If the target database doesn't support :regex feature, return nil.
     (not (regex-available? query))   nil
     (lib.types.isa/email? column)    {:display-name (i18n/tru "Extract domain")
-                                      :extractions  [{:key          :email-domain
-                                                      :display-name (i18n/tru "Domain")}]}
+                                      :extractions  [{:key          :domain
+                                                      :display-name (i18n/tru "Domain")}
+                                                     {:key          :host
+                                                      :display-name (i18n/tru "Host")}]}
     (lib.types.isa/URL? column)      {:display-name (i18n/tru "Extract domain, subdomain…")
                                       :extractions  [{:key          :domain
                                                       :display-name (i18n/tru "Domain")}
                                                      {:key          :subdomain
-                                                      :display-name (i18n/tru "Subdomain")}]}))
+                                                      :display-name (i18n/tru "Subdomain")}
+                                                     {:key          :host
+                                                      :display-name (i18n/tru "Host")}]}))
 
 (mu/defn column-extract-drill :- [:maybe ::lib.schema.drill-thru/drill-thru.column-extract]
   "Column clicks on temporal columns only.
@@ -143,15 +99,10 @@
     :month-of-year   (case-expression #(lib.expression/get-month column) tag 12)
     :quarter-of-year (case-expression #(lib.expression/get-quarter column) tag 4)
     :year            (lib.expression/get-year column)
-    ;; URLs
-    :domain          (-> column
-                         (lib.expression/regex-match-first url->host-regex)
-                         (lib.expression/regex-match-first host->domain-regex))
-    :subdomain       (-> column
-                         (lib.expression/regex-match-first url->host-regex)
-                         (lib.expression/regex-match-first host->subdomain-regex))
-    ;; Emails
-    :email-domain    (lib.expression/regex-match-first column email->domain-regex)))
+    ;; URLs and emails
+    :domain          (lib.expression/domain column)
+    :subdomain       (lib.expression/subdomain column)
+    :host            (lib.expression/host column)))
 
 (defmethod lib.drill-thru.common/drill-thru-method :drill-thru/column-extract
   [_query _stage-number {:keys [query stage-number column extractions]} & [tag]]

diff --git a/src/metabase/lib/expression.cljc b/src/metabase/lib/expression.cljc
@@ -245,7 +245,7 @@
 (lib.common/defop - [x y & more])
 (lib.common/defop * [x y & more])
 ;; Kondo gets confused
-#_{:clj-kondo/ignore [:unresolved-namespace]}
+#_{:clj-kondo/ignore [:unresolved-namespace :syntax]}
 (lib.common/defop / [x y & more])
 (lib.common/defop case [x y & more])
 (lib.common/defop coalesce [x y & more])
@@ -285,6 +285,9 @@
 (lib.common/defop rtrim [s])
 (lib.common/defop upper [s])
 (lib.common/defop lower [s])
+(lib.common/defop host [s])
+(lib.common/defop domain [s])
+(lib.common/defop subdomain [s])
 
 (mu/defn ^:private expression-metadata :- lib.metadata/ColumnMetadata
   [query                 :- ::lib.schema/query

diff --git a/src/metabase/lib/schema/expression/string.cljc b/src/metabase/lib/schema/expression/string.cljc
@@ -7,6 +7,10 @@
   (mbql-clause/define-tuple-mbql-clause op :- :type/Text
     [:schema [:ref ::expression/string]]))
 
+(doseq [op [:host :domain :subdomain]]
+  (mbql-clause/define-tuple-mbql-clause op :- :type/Text
+    [:schema [:ref ::expression/string]]))
+
 (mbql-clause/define-tuple-mbql-clause :length :- :type/Integer
   [:schema [:ref ::expression/string]])