Skip to content

Commit

Permalink
[SPARK-9650][SQL] Fix quoting behavior on interpolated column names
Browse files Browse the repository at this point in the history
Make sure that `$"column"` is consistent with other methods with respect to backticks.  Adds a bunch of tests for various ways of constructing columns.

Author: Michael Armbrust <[email protected]>

Closes apache#7969 from marmbrus/namesWithDots and squashes the following commits:

53ef3d7 [Michael Armbrust] [SPARK-9650][SQL] Fix quoting behavior on interpolated column names
2bf7a92 [Michael Armbrust] WIP

(cherry picked from commit 0867b23)
Signed-off-by: Reynold Xin <[email protected]>
  • Loading branch information
marmbrus authored and rxin committed Aug 7, 2015
1 parent b4feccf commit 9be9d38
Show file tree
Hide file tree
Showing 5 changed files with 128 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.catalyst.errors
import org.apache.spark.sql.catalyst.expressions._
Expand Down Expand Up @@ -69,8 +70,64 @@ case class UnresolvedAttribute(nameParts: Seq[String]) extends Attribute with Un
}

object UnresolvedAttribute {
/**
* Creates an [[UnresolvedAttribute]], parsing segments separated by dots ('.').
*/
def apply(name: String): UnresolvedAttribute = new UnresolvedAttribute(name.split("\\."))

/**
* Creates an [[UnresolvedAttribute]], from a single quoted string (for example using backticks in
* HiveQL. Since the string is consider quoted, no processing is done on the name.
*/
def quoted(name: String): UnresolvedAttribute = new UnresolvedAttribute(Seq(name))

/**
* Creates an [[UnresolvedAttribute]] from a string in an embedded language. In this case
* we treat it as a quoted identifier, except for '.', which must be further quoted using
* backticks if it is part of a column name.
*/
def quotedString(name: String): UnresolvedAttribute =
new UnresolvedAttribute(parseAttributeName(name))

/**
* Used to split attribute name by dot with backticks rule.
* Backticks must appear in pairs, and the quoted string must be a complete name part,
* which means `ab..c`e.f is not allowed.
* Escape character is not supported now, so we can't use backtick inside name part.
*/
def parseAttributeName(name: String): Seq[String] = {
def e = new AnalysisException(s"syntax error in attribute name: $name")
val nameParts = scala.collection.mutable.ArrayBuffer.empty[String]
val tmp = scala.collection.mutable.ArrayBuffer.empty[Char]
var inBacktick = false
var i = 0
while (i < name.length) {
val char = name(i)
if (inBacktick) {
if (char == '`') {
inBacktick = false
if (i + 1 < name.length && name(i + 1) != '.') throw e
} else {
tmp += char
}
} else {
if (char == '`') {
if (tmp.nonEmpty) throw e
inBacktick = true
} else if (char == '.') {
if (name(i - 1) == '.' || i == name.length - 1) throw e
nameParts += tmp.mkString
tmp.clear()
} else {
tmp += char
}
}
i += 1
}
if (inBacktick) throw e
nameParts += tmp.mkString
nameParts.toSeq
}
}

case class UnresolvedFunction(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,47 +179,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {
def resolveQuoted(
name: String,
resolver: Resolver): Option[NamedExpression] = {
resolve(parseAttributeName(name), output, resolver)
}

/**
* Internal method, used to split attribute name by dot with backticks rule.
* Backticks must appear in pairs, and the quoted string must be a complete name part,
* which means `ab..c`e.f is not allowed.
* Escape character is not supported now, so we can't use backtick inside name part.
*/
private def parseAttributeName(name: String): Seq[String] = {
val e = new AnalysisException(s"syntax error in attribute name: $name")
val nameParts = scala.collection.mutable.ArrayBuffer.empty[String]
val tmp = scala.collection.mutable.ArrayBuffer.empty[Char]
var inBacktick = false
var i = 0
while (i < name.length) {
val char = name(i)
if (inBacktick) {
if (char == '`') {
inBacktick = false
if (i + 1 < name.length && name(i + 1) != '.') throw e
} else {
tmp += char
}
} else {
if (char == '`') {
if (tmp.nonEmpty) throw e
inBacktick = true
} else if (char == '.') {
if (name(i - 1) == '.' || i == name.length - 1) throw e
nameParts += tmp.mkString
tmp.clear()
} else {
tmp += char
}
}
i += 1
}
if (inBacktick) throw e
nameParts += tmp.mkString
nameParts.toSeq
resolve(UnresolvedAttribute.parseAttributeName(name), output, resolver)
}

/**
Expand Down
2 changes: 1 addition & 1 deletion sql/core/src/main/scala/org/apache/spark/sql/Column.scala
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class Column(protected[sql] val expr: Expression) extends Logging {
def this(name: String) = this(name match {
case "*" => UnresolvedStar(None)
case _ if name.endsWith(".*") => UnresolvedStar(Some(name.substring(0, name.length - 2)))
case _ => UnresolvedAttribute(name)
case _ => UnresolvedAttribute.quotedString(name)
})

/** Creates a column based on the given expression. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
*/
implicit class StringToColumn(val sc: StringContext) {
def $(args: Any*): ColumnName = {
new ColumnName(sc.s(args : _*))
new ColumnName(sc.s(args: _*))
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,74 @@ class ColumnExpressionSuite extends QueryTest with SQLTestUtils {

override def sqlContext(): SQLContext = ctx

test("column names with space") {
val df = Seq((1, "a")).toDF("name with space", "name.with.dot")

checkAnswer(
df.select(df("name with space")),
Row(1) :: Nil)

checkAnswer(
df.select($"name with space"),
Row(1) :: Nil)

checkAnswer(
df.select(col("name with space")),
Row(1) :: Nil)

checkAnswer(
df.select("name with space"),
Row(1) :: Nil)

checkAnswer(
df.select(expr("`name with space`")),
Row(1) :: Nil)
}

test("column names with dot") {
val df = Seq((1, "a")).toDF("name with space", "name.with.dot").as("a")

checkAnswer(
df.select(df("`name.with.dot`")),
Row("a") :: Nil)

checkAnswer(
df.select($"`name.with.dot`"),
Row("a") :: Nil)

checkAnswer(
df.select(col("`name.with.dot`")),
Row("a") :: Nil)

checkAnswer(
df.select("`name.with.dot`"),
Row("a") :: Nil)

checkAnswer(
df.select(expr("`name.with.dot`")),
Row("a") :: Nil)

checkAnswer(
df.select(df("a.`name.with.dot`")),
Row("a") :: Nil)

checkAnswer(
df.select($"a.`name.with.dot`"),
Row("a") :: Nil)

checkAnswer(
df.select(col("a.`name.with.dot`")),
Row("a") :: Nil)

checkAnswer(
df.select("a.`name.with.dot`"),
Row("a") :: Nil)

checkAnswer(
df.select(expr("a.`name.with.dot`")),
Row("a") :: Nil)
}

test("alias") {
val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList")
assert(df.select(df("a").as("b")).columns.head === "b")
Expand Down

0 comments on commit 9be9d38

Please sign in to comment.