Skip to content

Commit

Permalink
[SPARK-16409][SQL] regexp_extract with optional groups causes NPE
Browse files Browse the repository at this point in the history
## What changes were proposed in this pull request?

regexp_extract actually returns null when it shouldn't when a regex matches but the requested optional group did not. This makes it return an empty string, as apparently designed.

## How was this patch tested?

Additional unit test

Author: Sean Owen <[email protected]>

Closes apache#14504 from srowen/SPARK-16409.
  • Loading branch information
srowen committed Aug 7, 2016
1 parent bdfab9f commit 8d87252
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 2 deletions.
3 changes: 3 additions & 0 deletions python/pyspark/sql/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1445,6 +1445,9 @@ def regexp_extract(str, pattern, idx):
>>> df = spark.createDataFrame([('100-200',)], ['str'])
>>> df.select(regexp_extract('str', '(\d+)-(\d+)', 1).alias('d')).collect()
[Row(d=u'100')]
>>> df = spark.createDataFrame([('aaaac',)], ['str'])
>>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()
[Row(d=u'')]
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.regexp_extract(_to_java_column(str), pattern, idx)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,12 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
val m = pattern.matcher(s.toString)
if (m.find) {
val mr: MatchResult = m.toMatchResult
UTF8String.fromString(mr.group(r.asInstanceOf[Int]))
val group = mr.group(r.asInstanceOf[Int])
if (group == null) { // Pattern matched, but not optional group
UTF8String.EMPTY_UTF8
} else {
UTF8String.fromString(group)
}
} else {
UTF8String.EMPTY_UTF8
}
Expand Down Expand Up @@ -367,7 +372,11 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
${termPattern}.matcher($subject.toString());
if (${matcher}.find()) {
java.util.regex.MatchResult ${matchResult} = ${matcher}.toMatchResult();
${ev.value} = UTF8String.fromString(${matchResult}.group($idx));
if (${matchResult}.group($idx) == null) {
${ev.value} = UTF8String.EMPTY_UTF8;
} else {
${ev.value} = UTF8String.fromString(${matchResult}.group($idx));
}
$setEvNotNull
} else {
${ev.value} = UTF8String.EMPTY_UTF8;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,14 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext {
Row("300", "100") :: Row("400", "100") :: Row("400-400", "100") :: Nil)
}

test("non-matching optional group") {
val df = Seq("aaaac").toDF("s")
checkAnswer(
df.select(regexp_extract($"s", "(a+)(b)?(c)", 2)),
Row("")
)
}

test("string ascii function") {
val df = Seq(("abc", "")).toDF("a", "b")
checkAnswer(
Expand Down

0 comments on commit 8d87252

Please sign in to comment.