Skip to content

Commit

Permalink
[SPARK-11949][SQL] Set field nullable property for GroupingSets to ge…
Browse files Browse the repository at this point in the history
…t correct results for null values

JIRA: https://issues.apache.org/jira/browse/SPARK-11949

The result of a cube plan uses an incorrect schema. The schema of the cube result should mark the grouping expressions as nullable, because they will contain null values.

Author: Liang-Chi Hsieh <[email protected]>

Closes apache#10038 from viirya/fix-cube.
  • Loading branch information
viirya authored and yhuai committed Dec 1, 2015
1 parent a0af0e3 commit c87531b
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,11 @@ class Analyzer(
case other => Alias(other, other.toString)()
}

// TODO: We need to use bitmasks to determine which grouping expressions need to be
// set as nullable. For example, if we have GROUPING SETS ((a,b), a), we do not need
// to change the nullability of a.
val attributeMap = groupByAliases.map(a => (a -> a.toAttribute.withNullability(true))).toMap

val aggregations: Seq[NamedExpression] = x.aggregations.map {
// If an expression is an aggregate (contains an AggregateExpression) then we don't
// change it so that the aggregation is computed on the unmodified value of its argument
Expand All @@ -231,12 +236,13 @@ class Analyzer(
// If not, then it's a grouping expression and we need to use the modified (with nulls from
// Expand) value of the expression.
case expr => expr.transformDown {
case e => groupByAliases.find(_.child.semanticEquals(e)).map(_.toAttribute).getOrElse(e)
case e =>
groupByAliases.find(_.child.semanticEquals(e)).map(attributeMap(_)).getOrElse(e)
}.asInstanceOf[NamedExpression]
}

val child = Project(x.child.output ++ groupByAliases, x.child)
val groupByAttributes = groupByAliases.map(_.toAttribute)
val groupByAttributes = groupByAliases.map(attributeMap(_))

Aggregate(
groupByAttributes :+ VirtualColumn.groupingIdAttribute,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import org.apache.spark.sql.functions._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.DecimalType

/**
 * Row fixture with date, hour, minute, room name and temperature fields.
 *
 * Instances are turned into a DataFrame by this suite's cube test, which groups on
 * `date`, `hour`, `minute` and `room_name` and averages `temp`.
 *
 * @param date      integer date value (the test uses values such as 20151123)
 * @param hour      integer hour value
 * @param minute    integer minute value
 * @param room_name name of the room the reading belongs to
 * @param temp      temperature reading
 */
case class Fact(
    date: Int,
    hour: Int,
    minute: Int,
    room_name: String,
    temp: Double)

class DataFrameAggregateSuite extends QueryTest with SharedSQLContext {
import testImplicits._
Expand Down Expand Up @@ -86,6 +87,15 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext {
Row(null, 2013, 78000.0) ::
Row(null, null, 113000.0) :: Nil
)

val df0 = sqlContext.sparkContext.parallelize(Seq(
Fact(20151123, 18, 35, "room1", 18.6),
Fact(20151123, 18, 35, "room2", 22.4),
Fact(20151123, 18, 36, "room1", 17.4),
Fact(20151123, 18, 36, "room2", 25.6))).toDF()

val cube0 = df0.cube("date", "hour", "minute", "room_name").agg(Map("temp" -> "avg"))
assert(cube0.where("date IS NULL").count > 0)
}

test("rollup overlapping columns") {
Expand Down

0 comments on commit c87531b

Please sign in to comment.