Skip to content

Commit

Permalink
[SPARK-25660][SQL] Fix for the backward slash as CSV fields delimiter
Browse files Browse the repository at this point in the history
## What changes were proposed in this pull request?

The PR addresses the exception raised on accessing chars out of delimiter string. In particular, the backward slash `\` as the CSV fields delimiter causes the following exception on reading `abc\1`:
```Scala
String index out of range: 1
java.lang.StringIndexOutOfBoundsException: String index out of range: 1
	at java.lang.String.charAt(String.java:658)
```
because `str.charAt(1)` tries to access a char out of `str` in `CSVUtils.toChar`

## How was this patch tested?

Added tests for empty string and string containing the backward slash to `CSVUtilsSuite`. Besides of that I added an end-to-end test to check how the backward slash is handled in reading CSV string with it.

Closes apache#22654 from MaxGekk/csv-slash-delim.

Authored-by: Maxim Gekk <[email protected]>
Signed-off-by: gatorsmile <[email protected]>
  • Loading branch information
MaxGekk authored and gatorsmile committed Oct 12, 2018
1 parent 8e039a7 commit c7eadb5
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -139,23 +139,25 @@ object CSVUtils {
*/
@throws[IllegalArgumentException]
def toChar(str: String): Char = {
if (str.charAt(0) == '\\') {
str.charAt(1)
match {
case 't' => '\t'
case 'r' => '\r'
case 'b' => '\b'
case 'f' => '\f'
case '\"' => '\"' // In case user changes quote char and uses \" as delimiter in options
case '\'' => '\''
case 'u' if str == """\u0000""" => '\u0000'
case _ =>
throw new IllegalArgumentException(s"Unsupported special character for delimiter: $str")
}
} else if (str.length == 1) {
str.charAt(0)
} else {
throw new IllegalArgumentException(s"Delimiter cannot be more than one character: $str")
(str: Seq[Char]) match {
case Seq() => throw new IllegalArgumentException("Delimiter cannot be empty string")
case Seq('\\') => throw new IllegalArgumentException("Single backslash is prohibited." +
" It has special meaning as beginning of an escape sequence." +
" To get the backslash character, pass a string with two backslashes as the delimiter.")
case Seq(c) => c
case Seq('\\', 't') => '\t'
case Seq('\\', 'r') => '\r'
case Seq('\\', 'b') => '\b'
case Seq('\\', 'f') => '\f'
// In case user changes quote char and uses \" as delimiter in options
case Seq('\\', '\"') => '\"'
case Seq('\\', '\'') => '\''
case Seq('\\', '\\') => '\\'
case _ if str == """\u0000""" => '\u0000'
case Seq('\\', _) =>
throw new IllegalArgumentException(s"Unsupported special character for delimiter: $str")
case _ =>
throw new IllegalArgumentException(s"Delimiter cannot be more than one character: $str")
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1826,4 +1826,14 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
val df = spark.read.option("enforceSchema", false).csv(input)
checkAnswer(df, Row("1", "2"))
}

test("using the backward slash as the delimiter") {
val input = Seq("""abc\1""").toDS()
val delimiter = """\\"""
checkAnswer(spark.read.option("delimiter", delimiter).csv(input), Row("abc", "1"))
checkAnswer(spark.read.option("inferSchema", true).option("delimiter", delimiter).csv(input),
Row("abc", 1))
val schema = new StructType().add("a", StringType).add("b", IntegerType)
checkAnswer(spark.read.schema(schema).option("delimiter", delimiter).csv(input), Row("abc", 1))
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class CSVUtilsSuite extends SparkFunSuite {
assert(CSVUtils.toChar("""\"""") === '\"')
assert(CSVUtils.toChar("""\'""") === '\'')
assert(CSVUtils.toChar("""\u0000""") === '\u0000')
assert(CSVUtils.toChar("""\\""") === '\\')
}

test("Does not accept delimiter larger than one character") {
Expand All @@ -44,4 +45,17 @@ class CSVUtilsSuite extends SparkFunSuite {
assert(exception.getMessage.contains("Unsupported special character for delimiter"))
}

test("string with one backward slash is prohibited") {
val exception = intercept[IllegalArgumentException]{
CSVUtils.toChar("""\""")
}
assert(exception.getMessage.contains("Single backslash is prohibited"))
}

test("output proper error message for empty string") {
val exception = intercept[IllegalArgumentException]{
CSVUtils.toChar("")
}
assert(exception.getMessage.contains("Delimiter cannot be empty string"))
}
}

0 comments on commit c7eadb5

Please sign in to comment.