[SPARK-32659][SQL][FOLLOWUP] Broadcast Array instead of Set in InSubqueryExec

### What changes were proposed in this pull request?

This is a followup of apache#29475.

This PR updates the code to broadcast the Array instead of the Set, which was the behavior before apache#29475.
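
For illustration only (the names below are hypothetical, not the actual `InSubqueryExec` code), a minimal sketch of the resulting pattern: the driver broadcasts the collected values as a plain `Array[Any]`, and the lookup `Set` is built from the broadcast value on the executor side, where the membership checks actually happen.

```scala
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SparkSession

// Hypothetical, self-contained sketch of the broadcast-Array pattern.
object BroadcastArrayNotSetDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("demo").getOrCreate()
    val sc = spark.sparkContext

    // Values collected on the driver (standing in for the subquery result).
    val collected: Array[Any] = (1 to 1000).map(i => i: Any).toArray

    // Broadcast the compact Array rather than a Set.
    val broadcastValues: Broadcast[Array[Any]] = sc.broadcast(collected)

    // Build the Set on the executor side, where fast lookups are needed.
    val matches = sc.parallelize(Seq(3, 2000, 42)).mapPartitions { iter =>
      val lookup = broadcastValues.value.toSet  // built once per partition here
      iter.filter(v => lookup.contains(v))
    }.collect()

    println(matches.mkString(", "))  // 3, 42
    spark.stop()
  }
}
```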

### Why are the changes needed?

The size of a Set can be much bigger than that of an Array. It's safer to keep the behavior the same as before: broadcast the Array and build the Set on the executor side.
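
For a rough sense of the difference, here is a hedged sketch using Spark's `SizeEstimator` utility (`CollectionSizeDemo` is a hypothetical name, and the exact numbers vary with the JVM, Scala version, and element type):

```scala
import org.apache.spark.util.SizeEstimator

// Illustrative only: compares estimated in-memory sizes of the same elements
// held as a flat Array versus an immutable Set.
object CollectionSizeDemo {
  def main(args: Array[String]): Unit = {
    val values: Array[Any] = Array.tabulate[Any](100000)(i => java.lang.Integer.valueOf(i))
    val asSet: Set[Any] = values.toSet

    // The Array is a flat block of references; the Set adds hash-structure
    // overhead on top of the same element objects.
    println(s"Array[Any] estimated size: ${SizeEstimator.estimate(values)} bytes")
    println(s"Set[Any]   estimated size: ${SizeEstimator.estimate(asSet)} bytes")
  }
}
```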

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Existing tests.

Closes apache#29838 from cloud-fan/followup.

Authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
cloud-fan authored and dongjoon-hyun committed Sep 22, 2020
1 parent 790d9ef commit 6145621
Showing 1 changed file with 7 additions and 7 deletions.
```diff
@@ -23,7 +23,7 @@ import scala.collection.mutable.ArrayBuffer
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.{expressions, InternalRow}
-import org.apache.spark.sql.catalyst.expressions.{AttributeSeq, CreateNamedStruct, Expression, ExprId, InSet, ListQuery, Literal, PlanExpression}
+import org.apache.spark.sql.catalyst.expressions.{CreateNamedStruct, Expression, ExprId, InSet, ListQuery, Literal, PlanExpression}
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.internal.SQLConf
@@ -114,10 +114,10 @@ case class InSubqueryExec(
     child: Expression,
     plan: BaseSubqueryExec,
     exprId: ExprId,
-    private var resultBroadcast: Broadcast[Set[Any]] = null) extends ExecSubqueryExpression {
+    private var resultBroadcast: Broadcast[Array[Any]] = null) extends ExecSubqueryExpression {
 
-  @transient private var result: Set[Any] = _
-  @transient private lazy val inSet = InSet(child, result)
+  @transient private var result: Array[Any] = _
+  @transient private lazy val inSet = InSet(child, result.toSet)
 
   override def dataType: DataType = BooleanType
   override def children: Seq[Expression] = child :: Nil
@@ -133,14 +133,14 @@
   def updateResult(): Unit = {
     val rows = plan.executeCollect()
     result = if (plan.output.length > 1) {
-      rows.toSet
+      rows.asInstanceOf[Array[Any]]
     } else {
-      rows.map(_.get(0, child.dataType)).toSet
+      rows.map(_.get(0, child.dataType))
     }
     resultBroadcast = plan.sqlContext.sparkContext.broadcast(result)
   }
 
-  def values(): Option[Set[Any]] = Option(resultBroadcast).map(_.value)
+  def values(): Option[Array[Any]] = Option(resultBroadcast).map(_.value)
 
   private def prepareResult(): Unit = {
     require(resultBroadcast != null, s"$this has not finished")
```
