Skip to content

Commit

Permalink
[SPARK-13168][SQL] Collapse adjacent repartition operators
Browse files Browse the repository at this point in the history
Spark SQL should collapse adjacent `Repartition` operators and only keep the last one.

Author: Josh Rosen <[email protected]>

Closes apache#11064 from JoshRosen/collapse-repartition.
  • Loading branch information
JoshRosen authored and Andrew Or committed Feb 4, 2016
1 parent 085f510 commit 33212cb
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ abstract class Optimizer extends RuleExecutor[LogicalPlan] {
PushPredicateThroughAggregate,
ColumnPruning,
// Operator combine
ProjectCollapsing,
CollapseRepartition,
CollapseProject,
CombineFilters,
CombineLimits,
CombineUnions,
Expand Down Expand Up @@ -322,7 +323,7 @@ object ColumnPruning extends Rule[LogicalPlan] {
* Combines two adjacent [[Project]] operators into one and perform alias substitution,
* merging the expressions into one single expression.
*/
object ProjectCollapsing extends Rule[LogicalPlan] {
object CollapseProject extends Rule[LogicalPlan] {

def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
case p @ Project(projectList1, Project(projectList2, child)) =>
Expand Down Expand Up @@ -390,6 +391,16 @@ object ProjectCollapsing extends Rule[LogicalPlan] {
}
}

/**
* Combines adjacent [[Repartition]] operators by keeping only the last one.
*/
object CollapseRepartition extends Rule[LogicalPlan] {
def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
case r @ Repartition(numPartitions, shuffle, Repartition(_, _, child)) =>
Repartition(numPartitions, shuffle, child)
}
}

/**
* Simplifies LIKE expressions that do not need full regular expressions to evaluate the condition.
* For example, when the expression is just checking to see if a string starts with a given
Expand Down Expand Up @@ -857,6 +868,7 @@ object PushPredicateThroughJoin extends Rule[LogicalPlan] with PredicateHelper {
/**
* Splits join condition expressions into three categories based on the attributes required
* to evaluate them.
*
* @return (canEvaluateInLeft, canEvaluateInRight, haveToEvaluateInBoth)
*/
private def split(condition: Seq[Expression], left: LogicalPlan, right: LogicalPlan) = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.RuleExecutor

class ProjectCollapsingSuite extends PlanTest {
class CollapseProjectSuite extends PlanTest {
object Optimize extends RuleExecutor[LogicalPlan] {
val batches =
Batch("Subqueries", FixedPoint(10), EliminateSubQueries) ::
Batch("ProjectCollapsing", Once, ProjectCollapsing) :: Nil
Batch("CollapseProject", Once, CollapseProject) :: Nil
}

val testRelation = LocalRelation('a.int, 'b.int)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class FilterPushdownSuite extends PlanTest {
PushPredicateThroughGenerate,
PushPredicateThroughAggregate,
ColumnPruning,
ProjectCollapsing) :: Nil
CollapseProject) :: Nil
}

val testRelation = LocalRelation('a.int, 'b.int, 'c.int)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class JoinOrderSuite extends PlanTest {
PushPredicateThroughGenerate,
PushPredicateThroughAggregate,
ColumnPruning,
ProjectCollapsing) :: Nil
CollapseProject) :: Nil

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{execution, Row, SQLConf}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, Literal, SortOrder}
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Repartition}
import org.apache.spark.sql.catalyst.plans.physical._
import org.apache.spark.sql.execution.joins.{BroadcastHashJoin, SortMergeJoin}
import org.apache.spark.sql.functions._
Expand Down Expand Up @@ -223,6 +222,18 @@ class PlannerSuite extends SharedSQLContext {
}
}

test("collapse adjacent repartitions") {
val doubleRepartitioned = testData.repartition(10).repartition(20).coalesce(5)
def countRepartitions(plan: LogicalPlan): Int = plan.collect { case r: Repartition => r }.length
assert(countRepartitions(doubleRepartitioned.queryExecution.logical) === 3)
assert(countRepartitions(doubleRepartitioned.queryExecution.optimizedPlan) === 1)
doubleRepartitioned.queryExecution.optimizedPlan match {
case r: Repartition =>
assert(r.numPartitions === 5)
assert(r.shuffle === false)
}
}

// --- Unit tests of EnsureRequirements ---------------------------------------------------------

// When it comes to testing whether EnsureRequirements properly ensures distribution requirements,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import org.apache.spark.Logging
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, NamedExpression, SortOrder}
import org.apache.spark.sql.catalyst.optimizer.ProjectCollapsing
import org.apache.spark.sql.catalyst.optimizer.CollapseProject
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor}
import org.apache.spark.sql.execution.datasources.LogicalRelation
Expand Down Expand Up @@ -188,7 +188,7 @@ class SQLBuilder(logicalPlan: LogicalPlan, sqlContext: SQLContext) extends Loggi
// The `WidenSetOperationTypes` analysis rule may introduce extra `Project`s over
// `Aggregate`s to perform type casting. This rule merges these `Project`s into
// `Aggregate`s.
ProjectCollapsing,
CollapseProject,

// Used to handle other auxiliary `Project`s added by analyzer (e.g.
// `ResolveAggregateFunctions` rule)
Expand Down

0 comments on commit 33212cb

Please sign in to comment.