[CALCITE-4847] Parse SQL with BigQuery-style quoted identifiers and character literals

In BigQuery, identifiers are quoted with backticks and an included backtick is escaped with a backslash; character literals are quoted with single-quotes or double-quotes and an included quote is escaped with a backslash. The parser enters this mode if you specify 'dialect=BIG_QUERY', 'lex=BIG_QUERY', or 'quoting=BACK_TICK_BACKSLASH' in the connect string. The connect string parameter 'quoting' previously allowed values 'DOUBLE_QUOTE', 'BACK_TICK' and 'BRACKET', and now also allows 'BACK_TICK_BACKSLASH'.
w31xu · Oct 13, 2021 · c2d0d66 · c2d0d66
1 parent a250ab0
commit c2d0d66
Show file tree

Hide file tree

Showing 5 changed files with 90 additions and 36 deletions.
diff --git a/core/src/main/codegen/templates/Parser.jj b/core/src/main/codegen/templates/Parser.jj
@@ -1473,9 +1473,13 @@ SqlNode SqlDescribe() :
     |
         // Use syntactic lookahead to determine whether a table name is coming.
         // We do not allow SimpleIdentifier() because that includes <STATEMENT>.
-        LOOKAHEAD( <TABLE> | <IDENTIFIER>
-           | <HYPHENATED_IDENTIFIER> | <QUOTED_IDENTIFIER>
-           | <BACK_QUOTED_IDENTIFIER> | <BRACKET_QUOTED_IDENTIFIER> )
+        LOOKAHEAD( <TABLE>
+           | <IDENTIFIER>
+           | <HYPHENATED_IDENTIFIER>
+           | <QUOTED_IDENTIFIER>
+           | <BACK_QUOTED_IDENTIFIER>
+           | <BIG_QUERY_BACK_QUOTED_IDENTIFIER>
+           | <BRACKET_QUOTED_IDENTIFIER> )
         (<TABLE>)?
         table = CompoundIdentifier()
         (
@@ -4966,6 +4970,12 @@ void IdentifierSegment(List<String> names, List<SqlParserPos> positions) :
                 quotedCasing);
             pos = getPos().withQuoting(true);
         }
+    |
+        <BIG_QUERY_BACK_QUOTED_IDENTIFIER> {
+            id = SqlParserUtil.stripQuotes(getToken(0).image, "`", "`", "\\`",
+                quotedCasing);
+            pos = getPos().withQuoting(true);
+        }
     |
         <BRACKET_QUOTED_IDENTIFIER> {
             id = SqlParserUtil.stripQuotes(getToken(0).image, "[", "]", "]]",
@@ -8170,12 +8180,15 @@ Lexical states:
 
 DEFAULT: Identifiers are quoted in brackets, e.g. [My Identifier]
 DQID:    Identifiers are double-quoted, e.g. "My Identifier"
-BTID:    Identifiers are enclosed in back-ticks, e.g. `My Identifier`
-BQID:    Identifiers are enclosed in back-ticks, e.g. `My Identifier`,
+BTID:    Identifiers are enclosed in back-ticks, escaped using back-ticks,
+         e.g. `My ``Quoted`` Identifier`
+BQID:    Identifiers are enclosed in back-ticks, escaped using backslash,
+         e.g. `My \`Quoted\` Identifier`,
          and with the potential to shift into BQHID in contexts where table
          names are expected, and thus allow hyphen-separated identifiers as
          part of table names
-BQHID:   Identifiers are enclosed in back-ticks, e.g. `My Identifier`
+BQHID:   Identifiers are enclosed in back-ticks, escaped using backslash,
+         e.g. `My \`Quoted\` Identifier`
          and unquoted identifiers may contain hyphens, e.g. foo-bar
 IN_SINGLE_LINE_COMMENT:
 IN_FORMAL_COMMENT:
@@ -8273,7 +8286,7 @@ MORE :
     >
 }
 
-<BTID, BQID, BQHID> TOKEN :
+<BTID> TOKEN :
 {
     < BACK_QUOTED_IDENTIFIER:
     "`"
@@ -8286,6 +8299,20 @@ MORE :
     >
 }
 
+<BQID, BQHID> TOKEN :
+{
+    // BigQuery-style backtick-quoted identifier, escaped using backslash
+    < BIG_QUERY_BACK_QUOTED_IDENTIFIER:
+    "`"
+    (
+        (~["\\", "`"])
+    |
+        ("\\" ~[])
+    )*
+    "`"
+    >
+}
+
 <BQHID> TOKEN :
 {
     // Per BigQuery: "Project IDs must contain 6-63 lowercase letters, digits,

diff --git a/core/src/main/java/org/apache/calcite/config/Lex.java b/core/src/main/java/org/apache/calcite/config/Lex.java
@@ -32,10 +32,11 @@ public enum Lex {
   /** Lexical policy similar to BigQuery.
   * The case of identifiers is preserved whether or not they are quoted;
    * after which, identifiers are matched case-insensitively.
-   * Back-ticks allow identifiers to contain non-alphanumeric characters.
+   * Back-ticks allow identifiers to contain non-alphanumeric characters;
+   * a back-tick is escaped using a backslash.
    * Character literals may be enclosed in single or double quotes. */
-  BIG_QUERY(Quoting.BACK_TICK, Casing.UNCHANGED, Casing.UNCHANGED, true,
-      CharLiteralStyle.BQ_SINGLE, CharLiteralStyle.BQ_DOUBLE),
+  BIG_QUERY(Quoting.BACK_TICK_BACKSLASH, Casing.UNCHANGED, Casing.UNCHANGED,
+      true, CharLiteralStyle.BQ_SINGLE, CharLiteralStyle.BQ_DOUBLE),
 
   /** Lexical policy similar to Oracle. The case of identifiers enclosed in
    * double-quotes is preserved; unquoted identifiers are converted to
@@ -47,7 +48,8 @@ public enum Lex {
    * MySQL on Linux uses case-sensitive matching, like the Linux file system.)
   * The case of identifiers is preserved whether or not they are quoted;
    * after which, identifiers are matched case-insensitively.
-   * Back-ticks allow identifiers to contain non-alphanumeric characters. */
+   * Back-ticks allow identifiers to contain non-alphanumeric characters;
+   * a back-tick is escaped using a back-tick. */
   MYSQL(Quoting.BACK_TICK, Casing.UNCHANGED, Casing.UNCHANGED, false,
       CharLiteralStyle.STANDARD),
 
@@ -71,7 +73,7 @@ public enum Lex {
    * The case of identifiers is preserved whether or not they are quoted;
    * after which, identifiers are matched case-sensitively.
    * Unlike Java, back-ticks allow identifiers to contain non-alphanumeric
-   * characters. */
+   * characters; a back-tick is escaped using a back-tick. */
   JAVA(Quoting.BACK_TICK, Casing.UNCHANGED, Casing.UNCHANGED, true,
       CharLiteralStyle.STANDARD);
 

diff --git a/core/src/main/java/org/apache/calcite/sql/parser/SqlAbstractParserImpl.java b/core/src/main/java/org/apache/calcite/sql/parser/SqlAbstractParserImpl.java
@@ -548,6 +548,8 @@ public static LexicalState forConfig(SqlParser.Config config) {
         return DEFAULT;
       case DOUBLE_QUOTE:
         return DQID;
+      case BACK_TICK_BACKSLASH:
+        return BQID;
       case BACK_TICK:
         if (config.conformance().allowHyphenInUnquotedTableName()
             && config.charLiteralStyles().equals(

diff --git a/site/_docs/adapter.md b/site/_docs/adapter.md
@@ -95,7 +95,7 @@ as implemented by Avatica's
 | <a href="{{ site.apiRoot }}/org/apache/calcite/config/CalciteConnectionProperty.html#MATERIALIZATIONS_ENABLED">materializationsEnabled</a> | Whether Calcite should use materializations. Default false.
 | <a href="{{ site.apiRoot }}/org/apache/calcite/config/CalciteConnectionProperty.html#MODEL">model</a> | URI of the JSON/YAML model file or inline like `inline:{...}` for JSON and `inline:...` for YAML.
 | <a href="{{ site.apiRoot }}/org/apache/calcite/config/CalciteConnectionProperty.html#PARSER_FACTORY">parserFactory</a> | Parser factory. The name of a class that implements [<code>interface SqlParserImplFactory</code>]({{ site.apiRoot }}/org/apache/calcite/sql/parser/SqlParserImplFactory.html) and has a public default constructor or an `INSTANCE` constant.
-| <a href="{{ site.apiRoot }}/org/apache/calcite/config/CalciteConnectionProperty.html#QUOTING">quoting</a> | How identifiers are quoted. Values are DOUBLE_QUOTE, BACK_QUOTE, BRACKET. If not specified, value from `lex` is used.
+| <a href="{{ site.apiRoot }}/org/apache/calcite/config/CalciteConnectionProperty.html#QUOTING">quoting</a> | How identifiers are quoted. Values are DOUBLE_QUOTE, BACK_TICK, BACK_TICK_BACKSLASH, BRACKET. If not specified, value from `lex` is used.
 | <a href="{{ site.apiRoot }}/org/apache/calcite/config/CalciteConnectionProperty.html#QUOTED_CASING">quotedCasing</a> | How identifiers are stored if they are quoted. Values are UNCHANGED, TO_UPPER, TO_LOWER. If not specified, value from `lex` is used.
 | <a href="{{ site.apiRoot }}/org/apache/calcite/config/CalciteConnectionProperty.html#SCHEMA">schema</a> | Name of initial schema.
 | <a href="{{ site.apiRoot }}/org/apache/calcite/config/CalciteConnectionProperty.html#SCHEMA_FACTORY">schemaFactory</a> | Schema factory. The name of a class that implements [<code>interface SchemaFactory</code>]({{ site.apiRoot }}/org/apache/calcite/schema/SchemaFactory.html) and has a public default constructor or an `INSTANCE` constant. Ignored if `model` is specified.

diff --git a/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserTest.java b/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserTest.java
@@ -2280,15 +2280,37 @@ void checkPeriodPredicate(Checker checker) {
     expr("     ^\"^a  \"\" b!c\"")
         .fails("(?s).*Encountered.*");
 
-    expr("^\"^x`y`z\"")
-        .fails("(?s).*Encountered.*");
+    expr("^\"^x`y`z\"").fails("(?s).*Encountered.*");
     expr("`x``y``z`").ok("`x``y``z`");
+    expr("`x\\`^y^\\`z`").fails("(?s).*Encountered.*");
+
+    expr("myMap[field] + myArray[1 + 2]")
+        .ok("(`MYMAP`[`FIELD`] + `MYARRAY`[(1 + 2)])");
+
+    sql("VALUES a").node(isQuoted(0, false));
+    sql("VALUES `a`").node(isQuoted(0, true));
+    sql("VALUES `a``b`").node(isQuoted(0, true));
+  }
+
+  @Test void testBackTickBackslashIdentifier() {
+    quoting = Quoting.BACK_TICK_BACKSLASH;
+    expr("ab").ok("`AB`");
+    expr("     `a  \" b!c`").ok("`a  \" b!c`");
+    expr("     \"a  \"^\" b!c\"^")
+        .fails("(?s).*Encountered.*");
+
+    // BACK_TICK_BACKSLASH identifiers imply the
+    // BigQuery dialect, which implies double-quoted character literals.
+    expr("^\"^x`y`z\"").ok("'x`y`z'");
+    expr("`x`^`y`^`z`").fails("(?s).*Encountered.*");
+    expr("`x\\`y\\`z`").ok("`x``y``z`");
 
     expr("myMap[field] + myArray[1 + 2]")
         .ok("(`MYMAP`[`FIELD`] + `MYARRAY`[(1 + 2)])");
 
     sql("VALUES a").node(isQuoted(0, false));
     sql("VALUES `a`").node(isQuoted(0, true));
+    sql("VALUES `a\\`b`").node(isQuoted(0, true));
   }
 
   @Test void testBracketIdentifier() {
@@ -3722,6 +3744,7 @@ void checkPeriodPredicate(Checker checker) {
             + "    <HYPHENATED_IDENTIFIER> \\.\\.\\.\n"
             + "    <QUOTED_IDENTIFIER> \\.\\.\\.\n"
             + "    <BACK_QUOTED_IDENTIFIER> \\.\\.\\.\n"
+            + "    <BIG_QUERY_BACK_QUOTED_IDENTIFIER> \\.\\.\\.\n"
             + "    <BRACKET_QUOTED_IDENTIFIER> \\.\\.\\.\n"
             + "    <UNICODE_QUOTED_IDENTIFIER> \\.\\.\\.\n"
             + "    \"\\(\" \\.\\.\\.\n.*");
@@ -4493,28 +4516,28 @@ void checkPeriodPredicate(Checker checker) {
             + "FROM emp");
 
     // MySQL uses single-quotes as escapes; BigQuery uses backslashes
-    sql("select 'Let''s call him \"Elvis\"!'")
+    sql("select 'Let''s call the dog \"Elvis\"!'")
         .withDialect(MYSQL)
-        .node(isCharLiteral("Let's call him \"Elvis\"!"));
+        .node(isCharLiteral("Let's call the dog \"Elvis\"!"));
 
-    sql("select 'Let\\'\\'s call him \"Elvis\"!'")
+    sql("select 'Let\\'\\'s call the dog \"Elvis\"!'")
         .withDialect(BIG_QUERY)
-        .node(isCharLiteral("Let''s call him \"Elvis\"!"));
+        .node(isCharLiteral("Let''s call the dog \"Elvis\"!"));
 
-    sql("select 'Let\\'s ^call^ him \"Elvis\"!'")
+    sql("select 'Let\\'s ^call^ the dog \"Elvis\"!'")
         .withDialect(MYSQL)
         .fails("(?s)Encountered \"call\" at .*")
         .withDialect(BIG_QUERY)
-        .node(isCharLiteral("Let's call him \"Elvis\"!"));
+        .node(isCharLiteral("Let's call the dog \"Elvis\"!"));
 
     // Oracle uses double-quotes as escapes in identifiers;
     // BigQuery uses backslashes as escapes in double-quoted character literals.
-    sql("select \"Let's call him \\\"Elvis^\\^\"!\"")
+    sql("select \"Let's call the dog \\\"Elvis^\\^\"!\"")
         .withDialect(ORACLE)
-        .fails("(?s)Lexical error at line 1, column 31\\.  "
+        .fails("(?s)Lexical error at line 1, column 35\\.  "
             + "Encountered: \"\\\\\\\\\" \\(92\\), after : \"\".*")
         .withDialect(BIG_QUERY)
-        .node(isCharLiteral("Let's call him \"Elvis\"!"));
+        .node(isCharLiteral("Let's call the dog \"Elvis\"!"));
   }
 
   private static Matcher<SqlNode> isCharLiteral(String s) {
@@ -9331,35 +9354,35 @@ private static Consumer<List<? extends Throwable>> checkWarnings(
 
   @Test void testConfigureFromDialect() {
     // Calcite's default converts unquoted identifiers to upper case
-    sql("select unquotedColumn from \"doubleQuotedTable\"")
+    sql("select unquotedColumn from \"double\"\"QuotedTable\"")
         .withDialect(CALCITE)
         .ok("SELECT \"UNQUOTEDCOLUMN\"\n"
-            + "FROM \"doubleQuotedTable\"");
+            + "FROM \"double\"\"QuotedTable\"");
     // MySQL leaves unquoted identifiers unchanged
-    sql("select unquotedColumn from `doubleQuotedTable`")
+    sql("select unquotedColumn from `double``QuotedTable`")
         .withDialect(MYSQL)
         .ok("SELECT `unquotedColumn`\n"
-            + "FROM `doubleQuotedTable`");
+            + "FROM `double``QuotedTable`");
     // Oracle converts unquoted identifiers to upper case
-    sql("select unquotedColumn from \"doubleQuotedTable\"")
+    sql("select unquotedColumn from \"double\"\"QuotedTable\"")
         .withDialect(ORACLE)
         .ok("SELECT \"UNQUOTEDCOLUMN\"\n"
-            + "FROM \"doubleQuotedTable\"");
+            + "FROM \"double\"\"QuotedTable\"");
     // PostgreSQL converts unquoted identifiers to lower case
-    sql("select unquotedColumn from \"doubleQuotedTable\"")
+    sql("select unquotedColumn from \"double\"\"QuotedTable\"")
         .withDialect(POSTGRESQL)
         .ok("SELECT \"unquotedcolumn\"\n"
-            + "FROM \"doubleQuotedTable\"");
+            + "FROM \"double\"\"QuotedTable\"");
     // Redshift converts all identifiers to lower case
-    sql("select unquotedColumn from \"doubleQuotedTable\"")
+    sql("select unquotedColumn from \"double\"\"QuotedTable\"")
         .withDialect(REDSHIFT)
         .ok("SELECT \"unquotedcolumn\"\n"
-            + "FROM \"doublequotedtable\"");
+            + "FROM \"double\"\"quotedtable\"");
     // BigQuery leaves quoted and unquoted identifiers unchanged
-    sql("select unquotedColumn from `doubleQuotedTable`")
+    sql("select unquotedColumn from `double\\`QuotedTable`")
         .withDialect(BIG_QUERY)
         .ok("SELECT unquotedColumn\n"
-            + "FROM doubleQuotedTable");
+            + "FROM `double\\`QuotedTable`");
   }
 
   /** Test case for