Skip to content

Commit

Permalink
[SPARK-36736][SQL] Support ILIKE (ALL | ANY | SOME) - case insensitiv…
Browse files Browse the repository at this point in the history
…e LIKE

### What changes were proposed in this pull request?
In the PR, I propose to support a case-insensitive variant of the `LIKE (ALL | ANY | SOME)` expression - `ILIKE`. In this way, Spark's users can match strings against multiple patterns in a case-insensitive manner. For example:
```sql
spark-sql> create table ilike_example(subject varchar(20));
spark-sql> insert into ilike_example values
         > ('jane doe'),
         > ('Jane Doe'),
         > ('JANE DOE'),
         > ('John Doe'),
         > ('John Smith');
spark-sql> select *
         > from ilike_example
         > where subject ilike any ('jane%', '%SMITH')
         > order by subject;
JANE DOE
Jane Doe
John Smith
jane doe
```

The syntax of `ILIKE` is similar to `LIKE`:
```
str NOT? ILIKE (ANY | SOME | ALL) (pattern+)
```

### Why are the changes needed?
1. To improve user experience with Spark SQL. No need to use `lower(col_name)` in where clauses.
2. To make migration from other popular DBMSs to Spark SQL easier. DBMSs below support `ilike` in SQL:
    - [Snowflake](https://docs.snowflake.com/en/sql-reference/functions/ilike.html#ilike)
    - [PostgreSQL](https://www.postgresql.org/docs/12/functions-matching.html)
    - [CockroachDB](https://www.cockroachlabs.com/docs/stable/functions-and-operators.html)

### Does this PR introduce _any_ user-facing change?
No, it doesn't. The PR **extends** existing APIs.

### How was this patch tested?
1. By running the expression examples via:
```
$ build/sbt "sql/test:testOnly org.apache.spark.sql.expressions.ExpressionInfoSuite"
```
2. Added new test to test parsing of `ILIKE`:
```
$ build/sbt "test:testOnly *.ExpressionParserSuite"
```
3. Via existing test suites:
```
$ build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z ilike-any.sql"
$ build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z ilike-all.sql"
```

Closes apache#33966 from MaxGekk/ilike-any.

Authored-by: Max Gekk <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
  • Loading branch information
MaxGekk authored and cloud-fan committed Sep 13, 2021
1 parent e858cd5 commit bd62ad9
Show file tree
Hide file tree
Showing 9 changed files with 414 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -797,7 +797,7 @@ predicate
| NOT? kind=IN '(' expression (',' expression)* ')'
| NOT? kind=IN '(' query ')'
| NOT? kind=RLIKE pattern=valueExpression
| NOT? kind=LIKE quantifier=(ANY | SOME | ALL) ('('')' | '(' expression (',' expression)* ')')
| NOT? kind=(LIKE | ILIKE) quantifier=(ANY | SOME | ALL) ('('')' | '(' expression (',' expression)* ')')
| NOT? kind=(LIKE | ILIKE) pattern=valueExpression (ESCAPE escapeChar=STRING)?
| IS NOT? kind=NULL
| IS NOT? kind=(TRUE | FALSE | UNKNOWN)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1557,7 +1557,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
* Add a predicate to the given expression. Supported expressions are:
* - (NOT) BETWEEN
* - (NOT) IN
* - (NOT) LIKE (ANY | SOME | ALL)
* - (NOT) (LIKE | ILIKE) (ANY | SOME | ALL)
* - (NOT) RLIKE
* - IS (NOT) NULL.
* - IS (NOT) (TRUE | FALSE | UNKNOWN)
Expand All @@ -1575,6 +1575,20 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
case other => Seq(other)
}

/**
 * Normalizes the arguments of a quantified (I)LIKE predicate.
 *
 * For `ILIKE`, the matched expression is wrapped in `Lower` and every constant pattern is
 * lower-cased, so the result can be handed to the case-sensitive LikeAny/LikeAll family.
 * For any other predicate kind both arguments are returned unchanged.
 *
 * @param expr the expression being matched
 * @param patterns constant patterns obtained by evaluating the pattern expressions;
 *                 an entry may be null (presumably for a typed NULL constant)
 * @return the possibly lower-cased expression together with the possibly lower-cased patterns
 */
def lowerLikeArgsIfNeeded(
    expr: Expression,
    patterns: Seq[UTF8String]): (Expression, Seq[UTF8String]) = ctx.kind.getType match {
  case SqlBaseParser.ILIKE =>
    // A null pattern must be passed through as-is: calling toLowerCase on it would throw
    // a NullPointerException while the query is still being parsed.
    // scalastyle:off caselocale
    val loweredPatterns = patterns.map(p => if (p == null) null else p.toLowerCase)
    // scalastyle:on caselocale
    (Lower(expr), loweredPatterns)
  case _ => (expr, patterns)
}

/**
 * Builds the single-pattern predicate matching the parsed keyword: `ILike` when the
 * predicate kind is ILIKE, `Like` otherwise.
 *
 * @param expr the expression being matched
 * @param pattern the pattern expression
 */
def getLike(expr: Expression, pattern: Expression): Expression = {
  val caseInsensitive = ctx.kind.getType == SqlBaseParser.ILIKE
  if (caseInsensitive) {
    new ILike(expr, pattern)
  } else {
    new Like(expr, pattern)
  }
}

// Create the predicate.
ctx.kind.getType match {
case SqlBaseParser.BETWEEN =>
Expand All @@ -1595,13 +1609,14 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
// If there are many pattern expressions, will throw StackOverflowError.
// So we use LikeAny or NotLikeAny instead.
val patterns = expressions.map(_.eval(EmptyRow).asInstanceOf[UTF8String])
val (expr, pat) = lowerLikeArgsIfNeeded(e, patterns)
ctx.NOT match {
case null => LikeAny(e, patterns)
case _ => NotLikeAny(e, patterns)
case null => LikeAny(expr, pat)
case _ => NotLikeAny(expr, pat)
}
} else {
ctx.expression.asScala.map(expression)
.map(p => invertIfNotDefined(new Like(e, p))).toSeq.reduceLeft(Or)
.map(p => invertIfNotDefined(getLike(e, p))).toSeq.reduceLeft(Or)
}
case Some(SqlBaseParser.ALL) =>
validate(!ctx.expression.isEmpty, "Expected something between '(' and ')'.", ctx)
Expand All @@ -1610,13 +1625,14 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
// If there are many pattern expressions, will throw StackOverflowError.
// So we use LikeAll or NotLikeAll instead.
val patterns = expressions.map(_.eval(EmptyRow).asInstanceOf[UTF8String])
val (expr, pat) = lowerLikeArgsIfNeeded(e, patterns)
ctx.NOT match {
case null => LikeAll(e, patterns)
case _ => NotLikeAll(e, patterns)
case null => LikeAll(expr, pat)
case _ => NotLikeAll(expr, pat)
}
} else {
ctx.expression.asScala.map(expression)
.map(p => invertIfNotDefined(new Like(e, p))).toSeq.reduceLeft(And)
.map(p => invertIfNotDefined(getLike(e, p))).toSeq.reduceLeft(And)
}
case _ =>
val escapeChar = Option(ctx.escapeChar).map(string).map { str =>
Expand All @@ -1625,9 +1641,10 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
}
str.charAt(0)
}.getOrElse('\\')
val likeExpr = if (ctx.kind.getType == SqlBaseParser.ILIKE) {
new ILike(e, expression(ctx.pattern), escapeChar)
} else Like(e, expression(ctx.pattern), escapeChar)
val likeExpr = ctx.kind.getType match {
case SqlBaseParser.ILIKE => new ILike(e, expression(ctx.pattern), escapeChar)
case _ => Like(e, expression(ctx.pattern), escapeChar)
}
invertIfNotDefined(likeExpr)
}
case SqlBaseParser.RLIKE =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -938,4 +938,19 @@ class ExpressionParserSuite extends AnalysisTest {
assertEqual("current_timestamp", UnresolvedAttribute.quoted("current_timestamp"))
}
}


// Checks that quantified ILIKE is parsed into the case-sensitive LikeAny/LikeAll family
// applied to lower(a), with every pattern literal lower-cased regardless of its original case.
test("SPARK-36736: (NOT) ILIKE (ANY | SOME | ALL) expressions") {
// ANY and SOME are expected to produce exactly the same expression.
Seq("any", "some").foreach { quantifier =>
assertEqual(s"a ilike $quantifier ('FOO%', 'b%')", lower($"a") likeAny("foo%", "b%"))
// `NOT ILIKE <quantifier>` maps to the dedicated NotLikeAny expression ...
assertEqual(s"a not ilike $quantifier ('foo%', 'B%')", lower($"a") notLikeAny("foo%", "b%"))
// ... whereas an outer NOT wraps the positive LikeAny expression.
assertEqual(s"not (a ilike $quantifier ('FOO%', 'B%'))", !(lower($"a") likeAny("foo%", "b%")))
}
// Same three shapes for the ALL quantifier.
assertEqual("a ilike all ('Foo%', 'b%')", lower($"a") likeAll("foo%", "b%"))
assertEqual("a not ilike all ('foo%', 'B%')", lower($"a") notLikeAll("foo%", "b%"))
assertEqual("not (a ilike all ('foO%', 'b%'))", !(lower($"a") likeAll("foo%", "b%")))

// An empty pattern list must be rejected by the parser for every quantifier.
Seq("any", "some", "all").foreach { quantifier =>
intercept(s"a ilike $quantifier()", "Expected something between '(' and ')'")
}
}
}
41 changes: 41 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/ilike-all.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
-- test cases for ilike all

CREATE OR REPLACE TEMPORARY VIEW ilike_all_table AS SELECT * FROM (VALUES
('gOOgle', '%oo%'),
('facebook', '%OO%'),
('liNkedin', '%In'))
as t1(company, pat);

SELECT company FROM ilike_all_table WHERE company ILIKE ALL ('%oO%', '%Go%');

SELECT company FROM ilike_all_table WHERE company ILIKE ALL ('microsoft', '%yoo%');

SELECT
company,
CASE
WHEN company ILIKE ALL ('%oo%', '%GO%') THEN 'Y'
ELSE 'N'
END AS is_available,
CASE
WHEN company ILIKE ALL ('%OO%', 'go%') OR company ILIKE ALL ('%IN', 'ms%') THEN 'Y'
ELSE 'N'
END AS mix
FROM ilike_all_table ;

-- Mix test with constant pattern and column value
SELECT company FROM ilike_all_table WHERE company ILIKE ALL ('%oo%', pat);

-- not ilike all test
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL ('%oo%', '%In', 'Fa%');
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL ('microsoft', '%yoo%');
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL ('%oo%', 'fA%');
SELECT company FROM ilike_all_table WHERE NOT company ILIKE ALL ('%oO%', 'fa%');

-- null test
SELECT company FROM ilike_all_table WHERE company ILIKE ALL ('%OO%', NULL);
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL ('%Oo%', NULL);
SELECT company FROM ilike_all_table WHERE company ILIKE ALL (NULL, NULL);
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL (NULL, NULL);

-- negative case
SELECT company FROM ilike_any_table WHERE company ILIKE ALL ();
41 changes: 41 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/ilike-any.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
-- test cases for ilike any

CREATE OR REPLACE TEMPORARY VIEW ilike_any_table AS SELECT * FROM (VALUES
('Google', '%Oo%'),
('FaceBook', '%oO%'),
('linkedIn', '%IN'))
as t1(company, pat);

SELECT company FROM ilike_any_table WHERE company ILIKE ANY ('%oo%', '%IN', 'fA%');

SELECT company FROM ilike_any_table WHERE company ILIKE ANY ('microsoft', '%yoo%');

select
company,
CASE
WHEN company ILIKE ANY ('%oO%', '%IN', 'Fa%') THEN 'Y'
ELSE 'N'
END AS is_available,
CASE
WHEN company ILIKE ANY ('%OO%', 'fa%') OR company ILIKE ANY ('%in', 'MS%') THEN 'Y'
ELSE 'N'
END AS mix
FROM ilike_any_table;

-- Mix test with constant pattern and column value
SELECT company FROM ilike_any_table WHERE company ILIKE ANY ('%zZ%', pat);

-- not ilike any test
SELECT company FROM ilike_any_table WHERE company NOT ILIKE ANY ('%oO%', '%iN', 'fa%');
SELECT company FROM ilike_any_table WHERE company NOT ILIKE ANY ('microsoft', '%yOo%');
SELECT company FROM ilike_any_table WHERE company NOT ILIKE ANY ('%oo%', 'Fa%');
SELECT company FROM ilike_any_table WHERE NOT company ILIKE ANY ('%OO%', 'fa%');

-- null test
SELECT company FROM ilike_any_table WHERE company ILIKE ANY ('%oO%', NULL);
SELECT company FROM ilike_any_table WHERE company NOT ILIKE ANY ('%oo%', NULL);
SELECT company FROM ilike_any_table WHERE company ILIKE ANY (NULL, NULL);
SELECT company FROM ilike_any_table WHERE company NOT ILIKE ANY (NULL, NULL);

-- negative case
SELECT company FROM ilike_any_table WHERE company ILIKE ANY ();
2 changes: 1 addition & 1 deletion sql/core/src/test/resources/sql-tests/inputs/like-all.sql
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,4 @@ SELECT company FROM like_all_table WHERE company LIKE ALL (NULL, NULL);
SELECT company FROM like_all_table WHERE company NOT LIKE ALL (NULL, NULL);

-- negative case
SELECT company FROM like_any_table WHERE company LIKE ALL ();
SELECT company FROM like_all_table WHERE company LIKE ALL ();
140 changes: 140 additions & 0 deletions sql/core/src/test/resources/sql-tests/results/ilike-all.sql.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 14


-- !query
CREATE OR REPLACE TEMPORARY VIEW ilike_all_table AS SELECT * FROM (VALUES
('gOOgle', '%oo%'),
('facebook', '%OO%'),
('liNkedin', '%In'))
as t1(company, pat)
-- !query schema
struct<>
-- !query output



-- !query
SELECT company FROM ilike_all_table WHERE company ILIKE ALL ('%oO%', '%Go%')
-- !query schema
struct<company:string>
-- !query output
gOOgle


-- !query
SELECT company FROM ilike_all_table WHERE company ILIKE ALL ('microsoft', '%yoo%')
-- !query schema
struct<company:string>
-- !query output



-- !query
SELECT
company,
CASE
WHEN company ILIKE ALL ('%oo%', '%GO%') THEN 'Y'
ELSE 'N'
END AS is_available,
CASE
WHEN company ILIKE ALL ('%OO%', 'go%') OR company ILIKE ALL ('%IN', 'ms%') THEN 'Y'
ELSE 'N'
END AS mix
FROM ilike_all_table
-- !query schema
struct<company:string,is_available:string,mix:string>
-- !query output
facebook N N
gOOgle Y Y
liNkedin N N


-- !query
SELECT company FROM ilike_all_table WHERE company ILIKE ALL ('%oo%', pat)
-- !query schema
struct<company:string>
-- !query output
facebook
gOOgle


-- !query
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL ('%oo%', '%In', 'Fa%')
-- !query schema
struct<company:string>
-- !query output



-- !query
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL ('microsoft', '%yoo%')
-- !query schema
struct<company:string>
-- !query output
facebook
gOOgle
liNkedin


-- !query
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL ('%oo%', 'fA%')
-- !query schema
struct<company:string>
-- !query output
liNkedin


-- !query
SELECT company FROM ilike_all_table WHERE NOT company ILIKE ALL ('%oO%', 'fa%')
-- !query schema
struct<company:string>
-- !query output
gOOgle
liNkedin


-- !query
SELECT company FROM ilike_all_table WHERE company ILIKE ALL ('%OO%', NULL)
-- !query schema
struct<company:string>
-- !query output



-- !query
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL ('%Oo%', NULL)
-- !query schema
struct<company:string>
-- !query output



-- !query
SELECT company FROM ilike_all_table WHERE company ILIKE ALL (NULL, NULL)
-- !query schema
struct<company:string>
-- !query output



-- !query
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL (NULL, NULL)
-- !query schema
struct<company:string>
-- !query output



-- !query
SELECT company FROM ilike_any_table WHERE company ILIKE ALL ()
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.catalyst.parser.ParseException

Expected something between '(' and ')'.(line 1, pos 50)

== SQL ==
SELECT company FROM ilike_any_table WHERE company ILIKE ALL ()
--------------------------------------------------^^^
Loading

0 comments on commit bd62ad9

Please sign in to comment.