forked from apache/spark
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SPARK-16929] Improve performance when checking speculatable tasks.
## What changes were proposed in this pull request? 1. Use a MedianHeap to record the durations of successful tasks. When checking for speculatable tasks, we can get the median duration with O(1) time complexity. 2. `checkSpeculatableTasks` synchronizes on `TaskSchedulerImpl`. If `checkSpeculatableTasks` doesn't finish within 100ms, it is possible for that thread to release the lock and then immediately re-acquire it. Change `scheduleAtFixedRate` to `scheduleWithFixedDelay` when scheduling the calls to `checkSpeculatableTasks`. ## How was this patch tested? Added MedianHeapSuite. Author: jinxing <[email protected]> Closes apache#16867 from jinxing64/SPARK-16929.
- Loading branch information
1 parent
bb823ca
commit 19596c2
Showing
5 changed files
with
176 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
93 changes: 93 additions & 0 deletions
93
core/src/main/scala/org/apache/spark/util/collection/MedianHeap.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.util.collection | ||
|
||
import scala.collection.mutable.PriorityQueue | ||
|
||
/**
 * MedianHeap is designed to be used to quickly track the median of a group of numbers
 * that may contain duplicates. Inserting a new number has O(log n) time complexity and
 * determining the median has O(1) time complexity.
 *
 * The basic idea is to maintain two heaps: a smallerHalf and a largerHalf. The smallerHalf
 * stores the smaller half of all numbers while the largerHalf stores the larger half.
 * The sizes of the two heaps are rebalanced after every insertion so that they never differ
 * by more than 1. Therefore, when `median` is called, we check whether the two heaps have
 * the same size. If they do, we return the average of the two heap tops. Otherwise we return
 * the top of the heap that holds one more element.
 *
 * @param ord ordering used both to arrange the two heaps and to decide which half a newly
 *            inserted number belongs to
 */
private[spark] class MedianHeap(implicit val ord: Ordering[Double]) {

  /**
   * Holds the numbers that are not greater (w.r.t. `ord`) than the current median.
   * The greatest of them sits at the root.
   */
  private[this] val smallerHalf = PriorityQueue.empty[Double](ord)

  /**
   * Holds the numbers that are greater (w.r.t. `ord`) than the current median.
   * The smallest of them sits at the root, because the heap uses the reversed ordering.
   */
  private[this] val largerHalf = PriorityQueue.empty[Double](ord.reverse)

  /** Returns true when no number has been inserted yet. */
  def isEmpty(): Boolean = {
    smallerHalf.isEmpty && largerHalf.isEmpty
  }

  /** Returns the total number of inserted numbers, duplicates included. */
  def size(): Int = {
    smallerHalf.size + largerHalf.size
  }

  /**
   * Inserts `x` into the heap and restores the size balance between the two halves.
   *
   * @param x the number to add
   */
  def insert(x: Double): Unit = {
    // The very first number arbitrarily goes to largerHalf. Afterwards, a number greater than
    // the current median goes to largerHalf and anything else goes to smallerHalf.
    // The comparison goes through `ord` so that it always agrees with the ordering of the
    // two heaps; the short-circuit `||` keeps `median` from being evaluated on an empty heap.
    if (isEmpty() || ord.gt(x, median)) {
      largerHalf.enqueue(x)
    } else {
      smallerHalf.enqueue(x)
    }
    rebalance()
  }

  /**
   * Moves the root of the bigger heap to the other heap when their sizes differ by more
   * than one element.
   */
  private[this] def rebalance(): Unit = {
    if (largerHalf.size - smallerHalf.size > 1) {
      smallerHalf.enqueue(largerHalf.dequeue())
    }
    if (smallerHalf.size - largerHalf.size > 1) {
      largerHalf.enqueue(smallerHalf.dequeue())
    }
  }

  /**
   * Returns the median of all inserted numbers.
   *
   * @return the average of the two heap roots when both heaps have the same size, otherwise
   *         the root of the heap that holds one more element
   * @throws NoSuchElementException if no number has been inserted
   */
  def median: Double = {
    if (isEmpty()) {
      throw new NoSuchElementException("MedianHeap is empty.")
    }
    if (largerHalf.size == smallerHalf.size) {
      (largerHalf.head + smallerHalf.head) / 2.0
    } else if (largerHalf.size > smallerHalf.size) {
      largerHalf.head
    } else {
      smallerHalf.head
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
66 changes: 66 additions & 0 deletions
66
core/src/test/scala/org/apache/spark/util/collection/MedianHeapSuite.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.util.collection | ||
|
||
import java.util.NoSuchElementException | ||
|
||
import org.apache.spark.SparkFunSuite | ||
|
||
/**
 * Unit tests for [[MedianHeap]]: the empty case, even and odd element counts, duplicated
 * values and heavily skewed input.
 */
class MedianHeapSuite extends SparkFunSuite {

  test("If no numbers in MedianHeap, NoSuchElementException is thrown.") {
    val medianHeap = new MedianHeap()
    intercept[NoSuchElementException] {
      medianHeap.median
    }
  }

  test("Median should be correct when size of MedianHeap is even") {
    val array = Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
    val medianHeap = new MedianHeap()
    array.foreach(medianHeap.insert(_))
    assert(medianHeap.size() === 10)
    // Even count: the median is the average of the two middle values, 4 and 5.
    assert(medianHeap.median === 4.5)
  }

  test("Median should be correct when size of MedianHeap is odd") {
    val array = Array(0, 1, 2, 3, 4, 5, 6, 7, 8)
    val medianHeap = new MedianHeap()
    array.foreach(medianHeap.insert(_))
    assert(medianHeap.size() === 9)
    assert(medianHeap.median === 4)
  }

  test("Median should be correct though there are duplicated numbers inside.") {
    val array = Array(0, 0, 1, 1, 2, 3, 4)
    val medianHeap = new MedianHeap()
    array.foreach(medianHeap.insert(_))
    // `size()` is declared with an empty parameter list, so call it with parentheses like
    // the other tests do instead of relying on auto-application.
    assert(medianHeap.size() === 7)
    assert(medianHeap.median === 1)
  }

  test("Median should be correct when input data is skewed.") {
    val medianHeap = new MedianHeap()
    (0 until 10).foreach(_ => medianHeap.insert(5))
    assert(medianHeap.median === 5)
    // Each batch outnumbers everything inserted before it, so it takes over the median.
    (0 until 100).foreach(_ => medianHeap.insert(10))
    assert(medianHeap.median === 10)
    (0 until 1000).foreach(_ => medianHeap.insert(0))
    assert(medianHeap.median === 0)
  }
}