Skip to content

Commit

Permalink
[CALCITE-4847] Parse SQL with BigQuery-style quoted identifiers and character literals
Browse files Browse the repository at this point in the history

In BigQuery, identifiers are quoted with backticks and an
included backtick is escaped with a backslash; character
literals are quoted with single-quotes or double-quotes and
an included quote is escaped with a backslash.

The parser enters this mode if you specify 'dialect=BIG_QUERY',
'lex=BIG_QUERY', or 'quoting=BACK_TICK_BACKSLASH' in the
connect string.

The connect string parameter 'quoting' previously allowed
values 'DOUBLE_QUOTE', 'BACK_TICK' and 'BRACKET', and now
also allows 'BACK_TICK_BACKSLASH'.
  • Loading branch information
julianhyde committed Oct 13, 2021
1 parent a250ab0 commit c2d0d66
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 36 deletions.
41 changes: 34 additions & 7 deletions core/src/main/codegen/templates/Parser.jj
Original file line number Diff line number Diff line change
Expand Up @@ -1473,9 +1473,13 @@ SqlNode SqlDescribe() :
|
// Use syntactic lookahead to determine whether a table name is coming.
// We do not allow SimpleIdentifier() because that includes <STATEMENT>.
LOOKAHEAD( <TABLE> | <IDENTIFIER>
| <HYPHENATED_IDENTIFIER> | <QUOTED_IDENTIFIER>
| <BACK_QUOTED_IDENTIFIER> | <BRACKET_QUOTED_IDENTIFIER> )
LOOKAHEAD( <TABLE>
| <IDENTIFIER>
| <HYPHENATED_IDENTIFIER>
| <QUOTED_IDENTIFIER>
| <BACK_QUOTED_IDENTIFIER>
| <BIG_QUERY_BACK_QUOTED_IDENTIFIER>
| <BRACKET_QUOTED_IDENTIFIER> )
(<TABLE>)?
table = CompoundIdentifier()
(
Expand Down Expand Up @@ -4966,6 +4970,12 @@ void IdentifierSegment(List<String> names, List<SqlParserPos> positions) :
quotedCasing);
pos = getPos().withQuoting(true);
}
|
<BIG_QUERY_BACK_QUOTED_IDENTIFIER> {
id = SqlParserUtil.stripQuotes(getToken(0).image, "`", "`", "\\`",
quotedCasing);
pos = getPos().withQuoting(true);
}
|
<BRACKET_QUOTED_IDENTIFIER> {
id = SqlParserUtil.stripQuotes(getToken(0).image, "[", "]", "]]",
Expand Down Expand Up @@ -8170,12 +8180,15 @@ Lexical states:

DEFAULT: Identifiers are quoted in brackets, e.g. [My Identifier]
DQID: Identifiers are double-quoted, e.g. "My Identifier"
BTID: Identifiers are enclosed in back-ticks, e.g. `My Identifier`
BQID: Identifiers are enclosed in back-ticks, e.g. `My Identifier`,
BTID: Identifiers are enclosed in back-ticks, escaped using back-ticks,
e.g. `My ``Quoted`` Identifier`
BQID: Identifiers are enclosed in back-ticks, escaped using backslash,
e.g. `My \`Quoted\` Identifier`,
and with the potential to shift into BQHID in contexts where table
names are expected, and thus allow hyphen-separated identifiers as
part of table names
BQHID: Identifiers are enclosed in back-ticks, e.g. `My Identifier`
BQHID: Identifiers are enclosed in back-ticks, escaped using backslash,
e.g. `My \`Quoted\` Identifier`
and unquoted identifiers may contain hyphens, e.g. foo-bar
IN_SINGLE_LINE_COMMENT:
IN_FORMAL_COMMENT:
Expand Down Expand Up @@ -8273,7 +8286,7 @@ MORE :
>
}

<BTID, BQID, BQHID> TOKEN :
<BTID> TOKEN :
{
< BACK_QUOTED_IDENTIFIER:
"`"
Expand All @@ -8286,6 +8299,20 @@ MORE :
>
}

<BQID, BQHID> TOKEN :
{
// BigQuery-style backtick-quoted identifier, escaped using backslash.
// Between the opening and closing back-ticks, each element is either
// a single character other than backslash or back-tick, or a backslash
// followed by any one character; hence a backslash-escaped back-tick
// does not terminate the identifier.
< BIG_QUERY_BACK_QUOTED_IDENTIFIER:
"`"
(
(~["\\", "`"])
|
("\\" ~[])
)*
"`"
>
}

<BQHID> TOKEN :
{
// Per BigQuery: "Project IDs must contain 6-63 lowercase letters, digits,
Expand Down
12 changes: 7 additions & 5 deletions core/src/main/java/org/apache/calcite/config/Lex.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,11 @@ public enum Lex {
/** Lexical policy similar to BigQuery.
* The case of identifiers is preserved whether or not they are quoted;
* after which, identifiers are matched case-insensitively.
* Back-ticks allow identifiers to contain non-alphanumeric characters.
* Back-ticks allow identifiers to contain non-alphanumeric characters;
* a back-tick is escaped using a backslash.
* Character literals may be enclosed in single or double quotes. */
BIG_QUERY(Quoting.BACK_TICK, Casing.UNCHANGED, Casing.UNCHANGED, true,
CharLiteralStyle.BQ_SINGLE, CharLiteralStyle.BQ_DOUBLE),
BIG_QUERY(Quoting.BACK_TICK_BACKSLASH, Casing.UNCHANGED, Casing.UNCHANGED,
true, CharLiteralStyle.BQ_SINGLE, CharLiteralStyle.BQ_DOUBLE),

/** Lexical policy similar to Oracle. The case of identifiers enclosed in
* double-quotes is preserved; unquoted identifiers are converted to
Expand All @@ -47,7 +48,8 @@ public enum Lex {
* MySQL on Linux uses case-sensitive matching, like the Linux file system.)
* The case of identifiers is preserved whether or not they are quoted;
* after which, identifiers are matched case-insensitively.
* Back-ticks allow identifiers to contain non-alphanumeric characters. */
* Back-ticks allow identifiers to contain non-alphanumeric characters;
* a back-tick is escaped using a back-tick. */
MYSQL(Quoting.BACK_TICK, Casing.UNCHANGED, Casing.UNCHANGED, false,
CharLiteralStyle.STANDARD),

Expand All @@ -71,7 +73,7 @@ public enum Lex {
* The case of identifiers is preserved whether or not they are quoted;
* after which, identifiers are matched case-sensitively.
* Unlike Java, back-ticks allow identifiers to contain non-alphanumeric
* characters. */
* characters; a back-tick is escaped using a back-tick. */
JAVA(Quoting.BACK_TICK, Casing.UNCHANGED, Casing.UNCHANGED, true,
CharLiteralStyle.STANDARD);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,8 @@ public static LexicalState forConfig(SqlParser.Config config) {
return DEFAULT;
case DOUBLE_QUOTE:
return DQID;
case BACK_TICK_BACKSLASH:
return BQID;
case BACK_TICK:
if (config.conformance().allowHyphenInUnquotedTableName()
&& config.charLiteralStyles().equals(
Expand Down
2 changes: 1 addition & 1 deletion site/_docs/adapter.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ as implemented by Avatica's
| <a href="{{ site.apiRoot }}/org/apache/calcite/config/CalciteConnectionProperty.html#MATERIALIZATIONS_ENABLED">materializationsEnabled</a> | Whether Calcite should use materializations. Default false.
| <a href="{{ site.apiRoot }}/org/apache/calcite/config/CalciteConnectionProperty.html#MODEL">model</a> | URI of the JSON/YAML model file or inline like `inline:{...}` for JSON and `inline:...` for YAML.
| <a href="{{ site.apiRoot }}/org/apache/calcite/config/CalciteConnectionProperty.html#PARSER_FACTORY">parserFactory</a> | Parser factory. The name of a class that implements [<code>interface SqlParserImplFactory</code>]({{ site.apiRoot }}/org/apache/calcite/sql/parser/SqlParserImplFactory.html) and has a public default constructor or an `INSTANCE` constant.
| <a href="{{ site.apiRoot }}/org/apache/calcite/config/CalciteConnectionProperty.html#QUOTING">quoting</a> | How identifiers are quoted. Values are DOUBLE_QUOTE, BACK_QUOTE, BRACKET. If not specified, value from `lex` is used.
| <a href="{{ site.apiRoot }}/org/apache/calcite/config/CalciteConnectionProperty.html#QUOTING">quoting</a> | How identifiers are quoted. Values are DOUBLE_QUOTE, BACK_TICK, BACK_TICK_BACKSLASH, BRACKET. If not specified, value from `lex` is used.
| <a href="{{ site.apiRoot }}/org/apache/calcite/config/CalciteConnectionProperty.html#QUOTED_CASING">quotedCasing</a> | How identifiers are stored if they are quoted. Values are UNCHANGED, TO_UPPER, TO_LOWER. If not specified, value from `lex` is used.
| <a href="{{ site.apiRoot }}/org/apache/calcite/config/CalciteConnectionProperty.html#SCHEMA">schema</a> | Name of initial schema.
| <a href="{{ site.apiRoot }}/org/apache/calcite/config/CalciteConnectionProperty.html#SCHEMA_FACTORY">schemaFactory</a> | Schema factory. The name of a class that implements [<code>interface SchemaFactory</code>]({{ site.apiRoot }}/org/apache/calcite/schema/SchemaFactory.html) and has a public default constructor or an `INSTANCE` constant. Ignored if `model` is specified.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2280,15 +2280,37 @@ void checkPeriodPredicate(Checker checker) {
expr(" ^\"^a \"\" b!c\"")
.fails("(?s).*Encountered.*");

expr("^\"^x`y`z\"")
.fails("(?s).*Encountered.*");
expr("^\"^x`y`z\"").fails("(?s).*Encountered.*");
expr("`x``y``z`").ok("`x``y``z`");
expr("`x\\`^y^\\`z`").fails("(?s).*Encountered.*");

expr("myMap[field] + myArray[1 + 2]")
.ok("(`MYMAP`[`FIELD`] + `MYARRAY`[(1 + 2)])");

sql("VALUES a").node(isQuoted(0, false));
sql("VALUES `a`").node(isQuoted(0, true));
sql("VALUES `a``b`").node(isQuoted(0, true));
}

/** Tests parsing with {@code Quoting.BACK_TICK_BACKSLASH}, where identifiers
 * are enclosed in back-ticks and an embedded back-tick is written as a
 * backslash followed by a back-tick. */
@Test void testBackTickBackslashIdentifier() {
quoting = Quoting.BACK_TICK_BACKSLASH;
// Unquoted identifier; the expected output is upper-cased and back-tick quoted.
expr("ab").ok("`AB`");
// A double-quote inside a back-tick quoted identifier is kept as-is.
expr(" `a \" b!c`").ok("`a \" b!c`");
// Two adjacent double-quoted strings are expected to fail
// (the carets appear to mark the expected error span -- confirm against
// the test fixture).
expr(" \"a \"^\" b!c\"^")
.fails("(?s).*Encountered.*");

// BACK_TICK_BACKSLASH identifiers imply the
// BigQuery dialect, which implies double-quoted character literals.
expr("^\"^x`y`z\"").ok("'x`y`z'");
// A doubled back-tick is not an escape in this mode, so this is expected to fail.
expr("`x`^`y`^`z`").fails("(?s).*Encountered.*");
// Backslash-escaped back-ticks are accepted; the expected output writes
// each embedded back-tick as a doubled back-tick.
expr("`x\\`y\\`z`").ok("`x``y``z`");

expr("myMap[field] + myArray[1 + 2]")
.ok("(`MYMAP`[`FIELD`] + `MYARRAY`[(1 + 2)])");

// Presumably isQuoted(i, b) checks whether the i-th identifier is flagged
// as quoted -- verify against the helper's definition.
sql("VALUES a").node(isQuoted(0, false));
sql("VALUES `a`").node(isQuoted(0, true));
sql("VALUES `a\\`b`").node(isQuoted(0, true));
}

@Test void testBracketIdentifier() {
Expand Down Expand Up @@ -3722,6 +3744,7 @@ void checkPeriodPredicate(Checker checker) {
+ " <HYPHENATED_IDENTIFIER> \\.\\.\\.\n"
+ " <QUOTED_IDENTIFIER> \\.\\.\\.\n"
+ " <BACK_QUOTED_IDENTIFIER> \\.\\.\\.\n"
+ " <BIG_QUERY_BACK_QUOTED_IDENTIFIER> \\.\\.\\.\n"
+ " <BRACKET_QUOTED_IDENTIFIER> \\.\\.\\.\n"
+ " <UNICODE_QUOTED_IDENTIFIER> \\.\\.\\.\n"
+ " \"\\(\" \\.\\.\\.\n.*");
Expand Down Expand Up @@ -4493,28 +4516,28 @@ void checkPeriodPredicate(Checker checker) {
+ "FROM emp");

// MySQL uses single-quotes as escapes; BigQuery uses backslashes
sql("select 'Let''s call him \"Elvis\"!'")
sql("select 'Let''s call the dog \"Elvis\"!'")
.withDialect(MYSQL)
.node(isCharLiteral("Let's call him \"Elvis\"!"));
.node(isCharLiteral("Let's call the dog \"Elvis\"!"));

sql("select 'Let\\'\\'s call him \"Elvis\"!'")
sql("select 'Let\\'\\'s call the dog \"Elvis\"!'")
.withDialect(BIG_QUERY)
.node(isCharLiteral("Let''s call him \"Elvis\"!"));
.node(isCharLiteral("Let''s call the dog \"Elvis\"!"));

sql("select 'Let\\'s ^call^ him \"Elvis\"!'")
sql("select 'Let\\'s ^call^ the dog \"Elvis\"!'")
.withDialect(MYSQL)
.fails("(?s)Encountered \"call\" at .*")
.withDialect(BIG_QUERY)
.node(isCharLiteral("Let's call him \"Elvis\"!"));
.node(isCharLiteral("Let's call the dog \"Elvis\"!"));

// Oracle uses double-quotes as escapes in identifiers;
// BigQuery uses backslashes as escapes in double-quoted character literals.
sql("select \"Let's call him \\\"Elvis^\\^\"!\"")
sql("select \"Let's call the dog \\\"Elvis^\\^\"!\"")
.withDialect(ORACLE)
.fails("(?s)Lexical error at line 1, column 31\\. "
.fails("(?s)Lexical error at line 1, column 35\\. "
+ "Encountered: \"\\\\\\\\\" \\(92\\), after : \"\".*")
.withDialect(BIG_QUERY)
.node(isCharLiteral("Let's call him \"Elvis\"!"));
.node(isCharLiteral("Let's call the dog \"Elvis\"!"));
}

private static Matcher<SqlNode> isCharLiteral(String s) {
Expand Down Expand Up @@ -9331,35 +9354,35 @@ private static Consumer<List<? extends Throwable>> checkWarnings(

@Test void testConfigureFromDialect() {
// Calcite's default converts unquoted identifiers to upper case
sql("select unquotedColumn from \"doubleQuotedTable\"")
sql("select unquotedColumn from \"double\"\"QuotedTable\"")
.withDialect(CALCITE)
.ok("SELECT \"UNQUOTEDCOLUMN\"\n"
+ "FROM \"doubleQuotedTable\"");
+ "FROM \"double\"\"QuotedTable\"");
// MySQL leaves unquoted identifiers unchanged
sql("select unquotedColumn from `doubleQuotedTable`")
sql("select unquotedColumn from `double``QuotedTable`")
.withDialect(MYSQL)
.ok("SELECT `unquotedColumn`\n"
+ "FROM `doubleQuotedTable`");
+ "FROM `double``QuotedTable`");
// Oracle converts unquoted identifiers to upper case
sql("select unquotedColumn from \"doubleQuotedTable\"")
sql("select unquotedColumn from \"double\"\"QuotedTable\"")
.withDialect(ORACLE)
.ok("SELECT \"UNQUOTEDCOLUMN\"\n"
+ "FROM \"doubleQuotedTable\"");
+ "FROM \"double\"\"QuotedTable\"");
// PostgreSQL converts unquoted identifiers to lower case
sql("select unquotedColumn from \"doubleQuotedTable\"")
sql("select unquotedColumn from \"double\"\"QuotedTable\"")
.withDialect(POSTGRESQL)
.ok("SELECT \"unquotedcolumn\"\n"
+ "FROM \"doubleQuotedTable\"");
+ "FROM \"double\"\"QuotedTable\"");
// Redshift converts all identifiers to lower case
sql("select unquotedColumn from \"doubleQuotedTable\"")
sql("select unquotedColumn from \"double\"\"QuotedTable\"")
.withDialect(REDSHIFT)
.ok("SELECT \"unquotedcolumn\"\n"
+ "FROM \"doublequotedtable\"");
+ "FROM \"double\"\"quotedtable\"");
// BigQuery leaves quoted and unquoted identifiers unchanged
sql("select unquotedColumn from `doubleQuotedTable`")
sql("select unquotedColumn from `double\\`QuotedTable`")
.withDialect(BIG_QUERY)
.ok("SELECT unquotedColumn\n"
+ "FROM doubleQuotedTable");
+ "FROM `double\\`QuotedTable`");
}

/** Test case for
Expand Down

0 comments on commit c2d0d66

Please sign in to comment.