diff --git a/README.md b/README.md index 87d53da8da719..7d3f9d4845d24 100644 --- a/README.md +++ b/README.md @@ -108,3 +108,4 @@ project's open source license. Whether or not you state this explicitly, by submitting any copyrighted material via pull request, email, or other means you agree to license the material under the project's open source license and warrant that you have the legal authority to do so. + diff --git a/assembly/pom.xml b/assembly/pom.xml index 47a110ca6cbd7..f9c8b9708a218 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.8.0-SNAPSHOT + 0.9.0-incubating-SNAPSHOT ../pom.xml diff --git a/bagel/pom.xml b/bagel/pom.xml index feaed6d2b0539..271ab6ce56e8c 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.8.0-SNAPSHOT + 0.9.0-incubating-SNAPSHOT ../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 8d9f0e386f280..2ec8fa6d0afa2 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.8.0-SNAPSHOT + 0.9.0-incubating-SNAPSHOT ../pom.xml diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index ef97fa85fab17..e67390cfd1ee9 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -58,9 +58,9 @@ import org.apache.spark.partial.{ApproximateEvaluator, PartialResult} import org.apache.spark.rdd._ import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.{StandaloneSchedulerBackend, SparkDeploySchedulerBackend, - ClusterScheduler, Schedulable, SchedulingMode} + ClusterScheduler} import org.apache.spark.scheduler.local.LocalScheduler -import org.apache.spark.scheduler.mesos.{CoarseMesosSchedulerBackend, MesosSchedulerBackend} +import org.apache.spark.scheduler.cluster.mesos.{CoarseMesosSchedulerBackend, MesosSchedulerBackend} import org.apache.spark.storage.{StorageUtils, BlockManagerSource} import org.apache.spark.ui.SparkUI import org.apache.spark.util.{ClosureCleaner, Utils, MetadataCleaner, TimeStampedHashMap} diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala index feb2cab578d35..d7b45d4caa253 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala @@ -68,6 +68,16 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { def map[R](f: JFunction[T, R]): JavaRDD[R] = new JavaRDD(rdd.map(f)(f.returnType()))(f.returnType()) + /** + * Return a new RDD by applying a function to each partition of this RDD, while tracking the index + * of the original partition. + */ + def mapPartitionsWithIndex[R: ClassManifest]( + f: JFunction2[Int, java.util.Iterator[T], java.util.Iterator[R]], + preservesPartitioning: Boolean = false): JavaRDD[R] = + new JavaRDD(rdd.mapPartitionsWithIndex(((a,b) => f(a,asJavaIterator(b))), + preservesPartitioning)) + /** * Return a new RDD by applying a function to all elements of this RDD. 
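The new JavaRDDLike.mapPartitionsWithIndex above is a thin wrapper: it converts the Java iterator with asJavaIterator and delegates to the Scala RDD method of the same name. Below is a small usage sketch of that underlying Scala API; the local master string, partition count and sample data are illustrative values only.

```scala
import org.apache.spark.SparkContext

object MapPartitionsWithIndexExample {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "mapPartitionsWithIndex-example")
    val data = sc.parallelize(1 to 10, 3)
    // Tag each element with the index of the partition it came from.
    val tagged = data.mapPartitionsWithIndex { (index, iter) =>
      iter.map(x => (index, x))
    }
    tagged.collect().foreach(println)  // e.g. (0,1), (0,2), (0,3), (1,4), ...
    sc.stop()
  }
}
```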
*/ diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index ea4eeb7dbf819..731ef90c90ea3 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -756,24 +756,42 @@ abstract class RDD[T: ClassTag]( } /** - * Take the first num elements of the RDD. This currently scans the partitions *one by one*, so - * it will be slow if a lot of partitions are required. In that case, use collect() to get the - * whole RDD instead. + * Take the first num elements of the RDD. It works by first scanning one partition, and use the + * results from that partition to estimate the number of additional partitions needed to satisfy + * the limit. */ def take(num: Int): Array[T] = { if (num == 0) { return new Array[T](0) } + val buf = new ArrayBuffer[T] - var p = 0 - while (buf.size < num && p < partitions.size) { + val totalParts = this.partitions.length + var partsScanned = 0 + while (buf.size < num && partsScanned < totalParts) { + // The number of partitions to try in this iteration. It is ok for this number to be + // greater than totalParts because we actually cap it at totalParts in runJob. + var numPartsToTry = 1 + if (partsScanned > 0) { + // If we didn't find any rows after the first iteration, just try all partitions next. + // Otherwise, interpolate the number of partitions we need to try, but overestimate it + // by 50%. + if (buf.size == 0) { + numPartsToTry = totalParts - 1 + } else { + numPartsToTry = (1.5 * num * partsScanned / buf.size).toInt + } + } + numPartsToTry = math.max(0, numPartsToTry) // guard against negative num of partitions + val left = num - buf.size - val res = sc.runJob(this, (it: Iterator[T]) => it.take(left).toArray, Array(p), true) - buf ++= res(0) - if (buf.size == num) - return buf.toArray - p += 1 + val p = partsScanned until math.min(partsScanned + numPartsToTry, totalParts) + val res = sc.runJob(this, (it: Iterator[T]) => it.take(left).toArray, p, allowLocal = true) + + res.foreach(buf ++= _.take(num - buf.size)) + partsScanned += numPartsToTry } + return buf.toArray } diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index dca84db5973a8..693d8a7c5db42 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -29,7 +29,6 @@ import org.apache.spark._ import org.apache.spark.rdd.RDD import org.apache.spark.executor.TaskMetrics import org.apache.spark.partial.{ApproximateActionListener, ApproximateEvaluator, PartialResult} -import org.apache.spark.scheduler.cluster.TaskInfo import org.apache.spark.storage.{BlockManager, BlockManagerMaster} import org.apache.spark.util.{MetadataCleaner, TimeStampedHashMap} diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala index 0d996706489f0..10ff1b4376268 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala @@ -19,7 +19,6 @@ package org.apache.spark.scheduler import java.util.Properties -import org.apache.spark.scheduler.cluster.TaskInfo import scala.collection.mutable.Map import org.apache.spark._ diff --git a/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala 
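The rewritten take() above no longer scans partitions strictly one by one: after the first partition it estimates how many more partitions are needed from the rows found so far, pads that estimate by 50%, and falls back to trying all remaining partitions when nothing has been found yet. The sketch below replays just that arithmetic outside Spark; the per-partition row counts and the requested size are made-up illustration values.

```scala
// Self-contained sketch of the scan-growth arithmetic used by the new take().
object TakeScanGrowthSketch {
  def main(args: Array[String]) {
    val partitionSizes = Seq(40, 0, 35, 500, 500)  // hypothetical rows in each partition
    val totalParts = partitionSizes.length
    val num = 300                                  // rows requested from take()
    var collected = 0
    var partsScanned = 0
    while (collected < num && partsScanned < totalParts) {
      var numPartsToTry = 1
      if (partsScanned > 0) {
        // Nothing found yet -> try all remaining partitions; otherwise interpolate the
        // number of partitions still needed and overestimate it by 50%.
        numPartsToTry =
          if (collected == 0) totalParts - 1
          else (1.5 * num * partsScanned / collected).toInt
      }
      numPartsToTry = math.max(0, numPartsToTry)
      val p = partsScanned until math.min(partsScanned + numPartsToTry, totalParts)
      for (i <- p) collected += math.min(partitionSizes(i), num - collected)
      println(s"scanned partitions $p, collected $collected of $num rows")
      partsScanned += numPartsToTry
    }
  }
}
```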
b/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala index c8b78bf00a83c..3628b1b078de8 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala @@ -30,7 +30,6 @@ import scala.io.Source import org.apache.spark._ import org.apache.spark.rdd.RDD import org.apache.spark.executor.TaskMetrics -import org.apache.spark.scheduler.cluster.TaskInfo // Used to record runtime information for each job, including RDD graph // tasks' start/stop shuffle information and information from outside diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/Pool.scala b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala similarity index 96% rename from core/src/main/scala/org/apache/spark/scheduler/cluster/Pool.scala rename to core/src/main/scala/org/apache/spark/scheduler/Pool.scala index 35b32600da6e0..c9a66b3a75bf4 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/Pool.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala @@ -15,13 +15,13 @@ * limitations under the License. */ -package org.apache.spark.scheduler.cluster +package org.apache.spark.scheduler import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashMap import org.apache.spark.Logging -import org.apache.spark.scheduler.cluster.SchedulingMode.SchedulingMode +import org.apache.spark.scheduler.SchedulingMode.SchedulingMode /** * An Schedulable entity that represent collection of Pools or TaskSetManagers diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/Schedulable.scala b/core/src/main/scala/org/apache/spark/scheduler/Schedulable.scala similarity index 93% rename from core/src/main/scala/org/apache/spark/scheduler/cluster/Schedulable.scala rename to core/src/main/scala/org/apache/spark/scheduler/Schedulable.scala index f4726450ec239..857adaef5a0bb 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/Schedulable.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Schedulable.scala @@ -15,9 +15,9 @@ * limitations under the License. */ -package org.apache.spark.scheduler.cluster +package org.apache.spark.scheduler -import org.apache.spark.scheduler.cluster.SchedulingMode.SchedulingMode +import org.apache.spark.scheduler.SchedulingMode.SchedulingMode import scala.collection.mutable.ArrayBuffer /** diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SchedulableBuilder.scala b/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala similarity index 99% rename from core/src/main/scala/org/apache/spark/scheduler/cluster/SchedulableBuilder.scala rename to core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala index 114617c51a3c0..4e25086ec91e0 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/SchedulableBuilder.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.scheduler.cluster +package org.apache.spark.scheduler import java.io.{FileInputStream, InputStream} import java.util.{NoSuchElementException, Properties} diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SchedulingAlgorithm.scala b/core/src/main/scala/org/apache/spark/scheduler/SchedulingAlgorithm.scala similarity index 98% rename from core/src/main/scala/org/apache/spark/scheduler/cluster/SchedulingAlgorithm.scala rename to core/src/main/scala/org/apache/spark/scheduler/SchedulingAlgorithm.scala index cbeed4731a79c..3418640b8c59e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/SchedulingAlgorithm.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SchedulingAlgorithm.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.scheduler.cluster +package org.apache.spark.scheduler /** * An interface for sort algorithm diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SchedulingMode.scala b/core/src/main/scala/org/apache/spark/scheduler/SchedulingMode.scala similarity index 96% rename from core/src/main/scala/org/apache/spark/scheduler/cluster/SchedulingMode.scala rename to core/src/main/scala/org/apache/spark/scheduler/SchedulingMode.scala index 34811389a02f7..0a786deb161ae 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/SchedulingMode.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SchedulingMode.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.scheduler.cluster +package org.apache.spark.scheduler /** * "FAIR" and "FIFO" determines which policy is used diff --git a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala index c3cf4b8907ebb..62b521ad45d39 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala @@ -18,7 +18,6 @@ package org.apache.spark.scheduler import java.util.Properties -import org.apache.spark.scheduler.cluster.TaskInfo import org.apache.spark.util.{Utils, Distribution} import org.apache.spark.{Logging, SparkContext, TaskEndReason} import org.apache.spark.executor.TaskMetrics diff --git a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala index 72cb1c9ce8a2f..b6f11969e575e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala @@ -17,8 +17,8 @@ package org.apache.spark.scheduler -import org.apache.spark.scheduler.cluster.TaskInfo import scala.collection._ + import org.apache.spark.executor.TaskMetrics case class StageInfo( diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/TaskDescription.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala similarity index 97% rename from core/src/main/scala/org/apache/spark/scheduler/cluster/TaskDescription.scala rename to core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala index 309ac2f6c9d05..5190d234d4ee5 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/TaskDescription.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala @@ -15,7 +15,7 @@ * limitations under the License. 
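The renames in this stretch move Pool, Schedulable, SchedulableBuilder, SchedulingAlgorithm and SchedulingMode out of org.apache.spark.scheduler.cluster into org.apache.spark.scheduler, which the local scheduler also imports further down in this patch; only package declarations and imports change. SchedulingMode itself is the FAIR/FIFO switch mentioned in its header comment. The snippet below is a generic sketch of that enumeration-driven pattern; every name in it is a stand-in, not the actual Spark implementation.

```scala
// Generic sketch of an enumeration-driven policy switch in the style of SchedulingMode.
object SchedulingModeSketch extends Enumeration {
  type SchedulingModeSketch = Value
  val FAIR, FIFO = Value
}

import SchedulingModeSketch._

trait TaskOrdering { def describe: String }

object TaskOrdering {
  // A pool would pick its comparison strategy from the configured mode roughly like this.
  def forMode(mode: SchedulingModeSketch): TaskOrdering = mode match {
    case FAIR => new TaskOrdering { val describe = "weight/minimum-share based ordering" }
    case FIFO => new TaskOrdering { val describe = "job priority, then stage id" }
  }
}
```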
*/ -package org.apache.spark.scheduler.cluster +package org.apache.spark.scheduler import java.nio.ByteBuffer import org.apache.spark.util.SerializableBuffer diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/TaskInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala similarity index 97% rename from core/src/main/scala/org/apache/spark/scheduler/cluster/TaskInfo.scala rename to core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala index 9685fb1a67b7a..7c2a422affbbf 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/TaskInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.scheduler.cluster +package org.apache.spark.scheduler import org.apache.spark.util.Utils diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/TaskLocality.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskLocality.scala similarity index 96% rename from core/src/main/scala/org/apache/spark/scheduler/cluster/TaskLocality.scala rename to core/src/main/scala/org/apache/spark/scheduler/TaskLocality.scala index 5d4130e14a8ad..47b0f387aa0c5 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/TaskLocality.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskLocality.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.scheduler.cluster +package org.apache.spark.scheduler private[spark] object TaskLocality diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala index 63be8ba3f58ee..7c2a9f03d7ad8 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala @@ -17,10 +17,11 @@ package org.apache.spark.scheduler -import org.apache.spark.scheduler.cluster.Pool -import org.apache.spark.scheduler.cluster.SchedulingMode.SchedulingMode +import org.apache.spark.scheduler.SchedulingMode.SchedulingMode + /** * Low-level task scheduler interface, implemented by both ClusterScheduler and LocalScheduler. + * Each TaskScheduler schedules tasks for a single SparkContext. * These schedulers get sets of tasks submitted to them from the DAGScheduler for each stage, * and are responsible for sending the tasks to the cluster, running them, retrying if there * are failures, and mitigating stragglers.
They return events to the DAGScheduler through diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerListener.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerListener.scala index 83be051c1a809..593fa9fb93a55 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerListener.scala @@ -17,7 +17,6 @@ package org.apache.spark.scheduler -import org.apache.spark.scheduler.cluster.TaskInfo import scala.collection.mutable.Map import org.apache.spark.TaskEndReason diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala similarity index 95% rename from core/src/main/scala/org/apache/spark/scheduler/cluster/TaskSetManager.scala rename to core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 648a3ef922c90..f192b0b7a41e3 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -15,12 +15,11 @@ * limitations under the License. */ -package org.apache.spark.scheduler.cluster +package org.apache.spark.scheduler import java.nio.ByteBuffer import org.apache.spark.TaskState.TaskState -import org.apache.spark.scheduler.TaskSet /** * Tracks and schedules the tasks within a single TaskSet. This class keeps track of the status of diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterScheduler.scala index 919acce828d9c..a6dee604b717c 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterScheduler.scala @@ -26,7 +26,7 @@ import scala.collection.mutable.HashSet import org.apache.spark._ import org.apache.spark.TaskState.TaskState import org.apache.spark.scheduler._ -import org.apache.spark.scheduler.cluster.SchedulingMode.SchedulingMode +import org.apache.spark.scheduler.SchedulingMode.SchedulingMode import java.nio.ByteBuffer import java.util.concurrent.atomic.AtomicLong import java.util.{TimerTask, Timer} diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterTaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterTaskSetManager.scala index 0ac3d7bcfdd2c..411e49b021b2e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterTaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterTaskSetManager.scala @@ -25,15 +25,12 @@ import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import scala.math.max import scala.math.min +import scala.Some -import org.apache.spark.{FetchFailed, Logging, Resubmitted, SparkEnv, Success, TaskEndReason, TaskState} -import org.apache.spark.{ExceptionFailure, SparkException, TaskResultTooBigFailure} +import org.apache.spark.{ExceptionFailure, FetchFailed, Logging, Resubmitted, SparkEnv, + SparkException, Success, TaskEndReason, TaskResultTooBigFailure, TaskState} import org.apache.spark.TaskState.TaskState import org.apache.spark.scheduler._ -import scala.Some -import org.apache.spark.FetchFailed -import org.apache.spark.ExceptionFailure -import org.apache.spark.TaskResultTooBigFailure import org.apache.spark.util.{SystemClock, Clock} diff --git 
a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneClusterMessage.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneClusterMessage.scala index 9c36d221f69de..c0b836bf1a021 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneClusterMessage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneClusterMessage.scala @@ -20,6 +20,7 @@ package org.apache.spark.scheduler.cluster import java.nio.ByteBuffer import org.apache.spark.TaskState.TaskState +import org.apache.spark.scheduler.TaskDescription import org.apache.spark.util.{Utils, SerializableBuffer} diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala index 49f668eb325a1..b6f0ec961a69c 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala @@ -28,6 +28,7 @@ import akka.pattern.ask import akka.remote.{AssociationErrorEvent, DisassociatedEvent, RemotingLifecycleEvent} import org.apache.spark.{SparkException, Logging, TaskState} +import org.apache.spark.scheduler.TaskDescription import org.apache.spark.scheduler.cluster.StandaloneClusterMessages._ import org.apache.spark.util.Utils diff --git a/core/src/main/scala/org/apache/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala similarity index 98% rename from core/src/main/scala/org/apache/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala rename to core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala index babe875fa1c0b..bf4040fafce46 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala @@ -15,22 +15,22 @@ * limitations under the License. 
*/ -package org.apache.spark.scheduler.mesos +package org.apache.spark.scheduler.cluster.mesos -import com.google.protobuf.ByteString +import java.io.File +import java.util.{ArrayList => JArrayList, List => JList} +import java.util.Collections + +import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} +import scala.collection.JavaConversions._ +import com.google.protobuf.ByteString import org.apache.mesos.{Scheduler => MScheduler} import org.apache.mesos._ import org.apache.mesos.Protos.{TaskInfo => MesosTaskInfo, TaskState => MesosTaskState, _} -import org.apache.spark.{SparkException, Logging, SparkContext} -import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} -import scala.collection.JavaConversions._ -import java.io.File -import org.apache.spark.scheduler.cluster._ -import java.util.{ArrayList => JArrayList, List => JList} -import java.util.Collections -import org.apache.spark.TaskState +import org.apache.spark.{SparkException, Logging, SparkContext, TaskState} +import org.apache.spark.scheduler.cluster.{ClusterScheduler, StandaloneSchedulerBackend} /** * A SchedulerBackend that runs tasks on Mesos, but uses "coarse-grained" tasks, where it holds diff --git a/core/src/main/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala similarity index 97% rename from core/src/main/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackend.scala rename to core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index 541f86e3381db..50cbc2ca92e7e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -15,22 +15,24 @@ * limitations under the License. 
*/ -package org.apache.spark.scheduler.mesos +package org.apache.spark.scheduler.cluster.mesos -import com.google.protobuf.ByteString +import java.io.File +import java.util.{ArrayList => JArrayList, List => JList} +import java.util.Collections + +import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} +import scala.collection.JavaConversions._ +import com.google.protobuf.ByteString import org.apache.mesos.{Scheduler => MScheduler} import org.apache.mesos._ import org.apache.mesos.Protos.{TaskInfo => MesosTaskInfo, TaskState => MesosTaskState, _} -import org.apache.spark.{SparkException, Logging, SparkContext} -import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} -import scala.collection.JavaConversions._ -import java.io.File -import org.apache.spark.scheduler.cluster._ -import java.util.{ArrayList => JArrayList, List => JList} -import java.util.Collections -import org.apache.spark.TaskState +import org.apache.spark.{Logging, SparkException, SparkContext, TaskState} +import org.apache.spark.scheduler.TaskDescription +import org.apache.spark.scheduler.cluster.{ClusterScheduler, ExecutorExited, ExecutorLossReason} +import org.apache.spark.scheduler.cluster.{SchedulerBackend, SlaveLost, WorkerOffer} import org.apache.spark.util.Utils /** diff --git a/core/src/main/scala/org/apache/spark/scheduler/local/LocalScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/local/LocalScheduler.scala index 8cb4d1396fad2..e29438f4ed64e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/local/LocalScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/local/LocalScheduler.scala @@ -31,8 +31,7 @@ import org.apache.spark._ import org.apache.spark.TaskState.TaskState import org.apache.spark.executor.ExecutorURLClassLoader import org.apache.spark.scheduler._ -import org.apache.spark.scheduler.cluster._ -import org.apache.spark.scheduler.cluster.SchedulingMode.SchedulingMode +import org.apache.spark.scheduler.SchedulingMode.SchedulingMode import akka.actor._ import org.apache.spark.util.Utils diff --git a/core/src/main/scala/org/apache/spark/scheduler/local/LocalTaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/local/LocalTaskSetManager.scala index e52cb998bdf42..a2fda4c124586 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/local/LocalTaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/local/LocalTaskSetManager.scala @@ -23,8 +23,8 @@ import scala.collection.mutable.HashMap import org.apache.spark.{ExceptionFailure, Logging, SparkEnv, Success, TaskState} import org.apache.spark.TaskState.TaskState -import org.apache.spark.scheduler.{Task, TaskResult, TaskSet} -import org.apache.spark.scheduler.cluster.{Schedulable, TaskDescription, TaskInfo, TaskLocality, TaskSetManager} +import org.apache.spark.scheduler.{Schedulable, Task, TaskDescription, TaskInfo, TaskLocality, + TaskResult, TaskSet, TaskSetManager} private[spark] class LocalTaskSetManager(sched: LocalScheduler, val taskSet: TaskSet) diff --git a/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala b/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala index 3b3b2342fa213..77a39c71ed2c1 100644 --- a/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala +++ b/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala @@ -30,10 +30,10 @@ import org.apache.spark.util.{SizeEstimator, Utils} private class MemoryStore(blockManager: BlockManager, maxMemory: Long) extends BlockStore(blockManager) { - case class Entry(value: Any, 
size: Long, deserialized: Boolean, var dropPending: Boolean = false) + case class Entry(value: Any, size: Long, deserialized: Boolean) private val entries = new LinkedHashMap[String, Entry](32, 0.75f, true) - private var currentMemory = 0L + @volatile private var currentMemory = 0L // Object used to ensure that only one thread is putting blocks and if necessary, dropping // blocks from the memory store. private val putLock = new Object() @@ -110,9 +110,8 @@ private class MemoryStore(blockManager: BlockManager, maxMemory: Long) override def remove(blockId: String): Boolean = { entries.synchronized { - val entry = entries.get(blockId) + val entry = entries.remove(blockId) if (entry != null) { - entries.remove(blockId) currentMemory -= entry.size logInfo("Block %s of size %d dropped from memory (free %d)".format( blockId, entry.size, freeMemory)) @@ -126,6 +125,7 @@ private class MemoryStore(blockManager: BlockManager, maxMemory: Long) override def clear() { entries.synchronized { entries.clear() + currentMemory = 0 } logInfo("MemoryStore cleared") } @@ -160,8 +160,10 @@ private class MemoryStore(blockManager: BlockManager, maxMemory: Long) putLock.synchronized { if (ensureFreeSpace(blockId, size)) { val entry = new Entry(value, size, deserialized) - entries.synchronized { entries.put(blockId, entry) } - currentMemory += size + entries.synchronized { + entries.put(blockId, entry) + currentMemory += size + } if (deserialized) { logInfo("Block %s stored as values to memory (estimated size %s, free %s)".format( blockId, Utils.bytesToString(size), Utils.bytesToString(freeMemory))) diff --git a/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala b/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala index 3ec9760ed0306..453394dfda1ba 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala @@ -21,7 +21,7 @@ import scala.util.Random import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ -import org.apache.spark.scheduler.cluster.SchedulingMode +import org.apache.spark.scheduler.SchedulingMode /** diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsUI.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsUI.scala index d1868dcf78628..42e9be6e19254 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsUI.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsUI.scala @@ -26,8 +26,8 @@ import org.eclipse.jetty.server.Handler import org.apache.spark.{ExceptionFailure, Logging, SparkContext} import org.apache.spark.executor.TaskMetrics -import org.apache.spark.scheduler.cluster.TaskInfo import org.apache.spark.scheduler.{SparkListenerTaskStart, SparkListenerTaskEnd, SparkListener} +import org.apache.spark.scheduler.TaskInfo import org.apache.spark.ui.JettyUtils._ import org.apache.spark.ui.Page.Executors import org.apache.spark.ui.UIUtils diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/IndexPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/IndexPage.scala index 3b428effafad5..b39c0e9769d48 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/IndexPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/IndexPage.scala @@ -21,7 +21,7 @@ import javax.servlet.http.HttpServletRequest import scala.xml.{NodeSeq, Node} -import org.apache.spark.scheduler.cluster.SchedulingMode +import org.apache.spark.scheduler.SchedulingMode import org.apache.spark.ui.Page._ import org.apache.spark.ui.UIUtils._ 
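The MemoryStore hunks above tighten the bookkeeping around currentMemory: the field becomes @volatile, it is now updated inside the same entries.synchronized blocks that mutate the map, remove() uses the entry handed back by LinkedHashMap.remove instead of a separate get-then-remove pair, and clear() resets the counter. A condensed sketch of that pattern follows; the class name, method names and Entry shape are assumptions for illustration, not the real MemoryStore.

```scala
import java.util.LinkedHashMap

// A @volatile byte counter that is only ever mutated while holding the map's lock.
class SizedStoreSketch {
  private case class Entry(value: Any, size: Long)
  private val entries = new LinkedHashMap[String, Entry](32, 0.75f, true)
  @volatile private var currentMemory = 0L

  def put(id: String, value: Any, size: Long) {
    entries.synchronized {
      entries.put(id, Entry(value, size))
      currentMemory += size          // updated under the same lock as the map itself
    }
  }

  def remove(id: String): Boolean = entries.synchronized {
    val entry = entries.remove(id)   // remove() returns the old entry, so no second lookup
    if (entry != null) { currentMemory -= entry.size; true } else false
  }

  def clear() {
    entries.synchronized {
      entries.clear()
      currentMemory = 0              // keep the counter consistent with the emptied map
    }
  }

  def usedMemory: Long = currentMemory  // safe to read without the lock thanks to @volatile
}
```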
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala index 5d46f38a2a4b9..eb3b4e8522804 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala @@ -21,10 +21,8 @@ import scala.Seq import scala.collection.mutable.{ListBuffer, HashMap, HashSet} import org.apache.spark.{ExceptionFailure, SparkContext, Success} -import org.apache.spark.scheduler._ -import org.apache.spark.scheduler.cluster.TaskInfo import org.apache.spark.executor.TaskMetrics -import collection.mutable +import org.apache.spark.scheduler._ /** * Tracks task-level information to be displayed in the UI. diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressUI.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressUI.scala index 54e273fd8bee7..c1ee2f3d00d66 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressUI.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressUI.scala @@ -31,8 +31,9 @@ import scala.collection.mutable.{HashSet, ListBuffer, HashMap, ArrayBuffer} import org.apache.spark.ui.JettyUtils._ import org.apache.spark.{ExceptionFailure, SparkContext, Success} import org.apache.spark.scheduler._ -import org.apache.spark.scheduler.cluster.SchedulingMode -import org.apache.spark.scheduler.cluster.SchedulingMode.SchedulingMode +import collection.mutable +import org.apache.spark.scheduler.SchedulingMode +import org.apache.spark.scheduler.SchedulingMode.SchedulingMode import org.apache.spark.util.Utils /** Web UI showing progress status of all jobs in the given SparkContext. */ diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/PoolTable.scala b/core/src/main/scala/org/apache/spark/ui/jobs/PoolTable.scala index b3d3666944b4c..06810d8dbc292 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/PoolTable.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/PoolTable.scala @@ -21,8 +21,7 @@ import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import scala.xml.Node -import org.apache.spark.scheduler.Stage -import org.apache.spark.scheduler.cluster.Schedulable +import org.apache.spark.scheduler.{Schedulable, Stage} import org.apache.spark.ui.UIUtils /** Table showing list of pools */ diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index a9969ab1c0870..163a3746ea00e 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -23,12 +23,12 @@ import javax.servlet.http.HttpServletRequest import scala.xml.Node +import org.apache.spark.{ExceptionFailure} +import org.apache.spark.executor.TaskMetrics import org.apache.spark.ui.UIUtils._ import org.apache.spark.ui.Page._ import org.apache.spark.util.{Utils, Distribution} -import org.apache.spark.{ExceptionFailure} -import org.apache.spark.scheduler.cluster.TaskInfo -import org.apache.spark.executor.TaskMetrics +import org.apache.spark.scheduler.TaskInfo /** Page showing statistics and task list for a given stage */ private[spark] class StagePage(parent: JobProgressUI) { diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala index 32776eaa25ae3..07db8622da471 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala +++ 
b/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala @@ -22,8 +22,7 @@ import java.util.Date import scala.xml.Node import scala.collection.mutable.HashSet -import org.apache.spark.scheduler.cluster.{SchedulingMode, TaskInfo} -import org.apache.spark.scheduler.Stage +import org.apache.spark.scheduler.{SchedulingMode, Stage, TaskInfo} import org.apache.spark.ui.UIUtils import org.apache.spark.util.Utils diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index c1df5e151e8ac..6d1bc5e296e06 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -25,7 +25,6 @@ import org.apache.spark.SparkContext._ import org.apache.spark.rdd._ import scala.collection.parallel.mutable import org.apache.spark._ -import org.apache.spark.rdd.CoalescedRDDPartition class RDDSuite extends FunSuite with SharedSparkContext { @@ -321,6 +320,44 @@ class RDDSuite extends FunSuite with SharedSparkContext { for (i <- 0 until sample.size) assert(sample(i) === checkSample(i)) } + test("take") { + var nums = sc.makeRDD(Range(1, 1000), 1) + assert(nums.take(0).size === 0) + assert(nums.take(1) === Array(1)) + assert(nums.take(3) === Array(1, 2, 3)) + assert(nums.take(500) === (1 to 500).toArray) + assert(nums.take(501) === (1 to 501).toArray) + assert(nums.take(999) === (1 to 999).toArray) + assert(nums.take(1000) === (1 to 999).toArray) + + nums = sc.makeRDD(Range(1, 1000), 2) + assert(nums.take(0).size === 0) + assert(nums.take(1) === Array(1)) + assert(nums.take(3) === Array(1, 2, 3)) + assert(nums.take(500) === (1 to 500).toArray) + assert(nums.take(501) === (1 to 501).toArray) + assert(nums.take(999) === (1 to 999).toArray) + assert(nums.take(1000) === (1 to 999).toArray) + + nums = sc.makeRDD(Range(1, 1000), 100) + assert(nums.take(0).size === 0) + assert(nums.take(1) === Array(1)) + assert(nums.take(3) === Array(1, 2, 3)) + assert(nums.take(500) === (1 to 500).toArray) + assert(nums.take(501) === (1 to 501).toArray) + assert(nums.take(999) === (1 to 999).toArray) + assert(nums.take(1000) === (1 to 999).toArray) + + nums = sc.makeRDD(Range(1, 1000), 1000) + assert(nums.take(0).size === 0) + assert(nums.take(1) === Array(1)) + assert(nums.take(3) === Array(1, 2, 3)) + assert(nums.take(500) === (1 to 500).toArray) + assert(nums.take(501) === (1 to 501).toArray) + assert(nums.take(999) === (1 to 999).toArray) + assert(nums.take(1000) === (1 to 999).toArray) + } + test("top with predefined ordering") { val nums = Array.range(1, 100000) val ints = sc.makeRDD(scala.util.Random.shuffle(nums), 2) diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 94f66c94c685a..9ed591e494a16 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -32,9 +32,9 @@ import org.apache.spark.{Dependency, ShuffleDependency, OneToOneDependency} import org.apache.spark.{FetchFailed, Success, TaskEndReason} import org.apache.spark.storage.{BlockManagerId, BlockManagerMaster} -import org.apache.spark.scheduler.cluster.Pool -import org.apache.spark.scheduler.cluster.SchedulingMode -import org.apache.spark.scheduler.cluster.SchedulingMode.SchedulingMode +import org.apache.spark.scheduler.Pool +import org.apache.spark.scheduler.SchedulingMode +import 
org.apache.spark.scheduler.SchedulingMode.SchedulingMode /** * Tests for DAGScheduler. These tests directly call the event processing functions in DAGScheduler diff --git a/docs/_config.yml b/docs/_config.yml index ad851673a5ed3..02067f9750f48 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -3,8 +3,8 @@ markdown: kramdown # These allow the documentation to be updated with nerw releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 0.8.0-SNAPSHOT -SPARK_VERSION_SHORT: 0.8.0 +SPARK_VERSION: 0.9.0-incubating-SNAPSHOT +SPARK_VERSION_SHORT: 0.9.0-SNAPSHOT SCALA_VERSION: 2.10 MESOS_VERSION: 0.13.0 SPARK_ISSUE_TRACKER_URL: https://spark-project.atlassian.net diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index 6b7d202a88174..1190ed47f6bcc 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -23,6 +23,7 @@ import logging import os +import pipes import random import shutil import subprocess @@ -36,6 +37,9 @@ from boto.ec2.blockdevicemapping import BlockDeviceMapping, EBSBlockDeviceType from boto import ec2 +class UsageError(Exception): + pass + # A URL prefix from which to fetch AMI information AMI_PREFIX = "https://raw.github.com/mesos/spark-ec2/v2/ami-list" @@ -103,11 +107,7 @@ def parse_args(): parser.print_help() sys.exit(1) (action, cluster_name) = args - if opts.identity_file == None and action in ['launch', 'login', 'start']: - print >> stderr, ("ERROR: The -i or --identity-file argument is " + - "required for " + action) - sys.exit(1) - + # Boto config check # http://boto.cloudhackers.com/en/latest/boto_config_tut.html home_dir = os.getenv('HOME') @@ -390,10 +390,18 @@ def get_existing_cluster(conn, opts, cluster_name, die_on_error=True): def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): master = master_nodes[0].public_dns_name if deploy_ssh_key: - print "Copying SSH key %s to master..." % opts.identity_file - ssh(master, opts, 'mkdir -p ~/.ssh') - scp(master, opts, opts.identity_file, '~/.ssh/id_rsa') - ssh(master, opts, 'chmod 600 ~/.ssh/id_rsa') + print "Generating cluster's SSH key on master..." + key_setup = """ + [ -f ~/.ssh/id_rsa ] || + (ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa && + cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys) + """ + ssh(master, opts, key_setup) + dot_ssh_tar = ssh_read(master, opts, ['tar', 'c', '.ssh']) + print "Transferring cluster's SSH key to slaves..." 
+ for slave in slave_nodes: + print slave.public_dns_name + ssh_write(slave.public_dns_name, opts, ['tar', 'x'], dot_ssh_tar) modules = ['spark', 'shark', 'ephemeral-hdfs', 'persistent-hdfs', 'mapreduce', 'spark-standalone'] @@ -535,18 +543,33 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): dest.write(text) dest.close() # rsync the whole directory over to the master machine - command = (("rsync -rv -e 'ssh -o StrictHostKeyChecking=no -i %s' " + - "'%s/' '%s@%s:/'") % (opts.identity_file, tmp_dir, opts.user, active_master)) - subprocess.check_call(command, shell=True) + command = [ + 'rsync', '-rv', + '-e', stringify_command(ssh_command(opts)), + "%s/" % tmp_dir, + "%s@%s:/" % (opts.user, active_master) + ] + subprocess.check_call(command) # Remove the temp directory we created above shutil.rmtree(tmp_dir) -# Copy a file to a given host through scp, throwing an exception if scp fails -def scp(host, opts, local_file, dest_file): - subprocess.check_call( - "scp -q -o StrictHostKeyChecking=no -i %s '%s' '%s@%s:%s'" % - (opts.identity_file, local_file, opts.user, host, dest_file), shell=True) +def stringify_command(parts): + if isinstance(parts, str): + return parts + else: + return ' '.join(map(pipes.quote, parts)) + + +def ssh_args(opts): + parts = ['-o', 'StrictHostKeyChecking=no'] + if opts.identity_file is not None: + parts += ['-i', opts.identity_file] + return parts + + +def ssh_command(opts): + return ['ssh'] + ssh_args(opts) # Run a command on a host through ssh, retrying up to two times @@ -556,18 +579,42 @@ def ssh(host, opts, command): while True: try: return subprocess.check_call( - "ssh -t -o StrictHostKeyChecking=no -i %s %s@%s '%s'" % - (opts.identity_file, opts.user, host, command), shell=True) + ssh_command(opts) + ['-t', '%s@%s' % (opts.user, host), stringify_command(command)]) except subprocess.CalledProcessError as e: if (tries > 2): - raise e - print "Couldn't connect to host {0}, waiting 30 seconds".format(e) + # If this was an ssh failure, provide the user with hints. + if e.returncode == 255: + raise UsageError("Failed to SSH to remote host {0}.\nPlease check that you have provided the correct --identity-file and --key-pair parameters and try again.".format(host)) + else: + raise e + print >> stderr, "Error executing remote command, retrying after 30 seconds: {0}".format(e) time.sleep(30) tries = tries + 1 +def ssh_read(host, opts, command): + return subprocess.check_output( + ssh_command(opts) + ['%s@%s' % (opts.user, host), stringify_command(command)]) +def ssh_write(host, opts, command, input): + tries = 0 + while True: + proc = subprocess.Popen( + ssh_command(opts) + ['%s@%s' % (opts.user, host), stringify_command(command)], + stdin=subprocess.PIPE) + proc.stdin.write(input) + proc.stdin.close() + status = proc.wait() + if status == 0: + break + elif (tries > 2): + raise RuntimeError("ssh_write failed with error %s" % proc.returncode) + else: + print >> stderr, "Error {0} while executing remote command, retrying after 30 seconds".format(status) + time.sleep(30) + tries = tries + 1 + # Gets a list of zones to launch instances in def get_zones(conn, opts): @@ -586,7 +633,7 @@ def get_partition(total, num_partitions, current_partitions): return num_slaves_this_zone -def main(): +def real_main(): (opts, action, cluster_name) = parse_args() try: conn = ec2.connect_to_region(opts.region) @@ -669,11 +716,11 @@ def main(): conn, opts, cluster_name) master = master_nodes[0].public_dns_name print "Logging into master " + master + "..." 
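The spark_ec2.py changes above replace interpolated shell strings with argument lists: ssh_args()/ssh_command() build the base ssh argv (adding -i only when --identity-file was given), stringify_command() quotes via pipes.quote only when a single display string is needed, rsync is invoked with a list instead of shell=True, and ssh() now raises UsageError with a hint when ssh exits with status 255. The same build-an-argv design choice is sketched below in Scala, to keep this patch's examples in one language; the user, host and identity-file path are placeholder values.

```scala
import scala.sys.process._

// Build the command as an argument vector so spaces in the key path need no shell quoting.
object SshArgvSketch {
  def sshCommand(identityFile: Option[String]): Seq[String] =
    Seq("ssh", "-o", "StrictHostKeyChecking=no") ++
      identityFile.toSeq.flatMap(f => Seq("-i", f))

  def main(args: Array[String]) {
    val cmd = sshCommand(Some("/path/with spaces/my-key.pem")) ++
      Seq("-t", "ec2-user@ec2-0-0-0-0.compute-1.amazonaws.com", "echo hello")
    cmd.foreach(println)   // each element is passed to exec() as-is, no quoting needed
    // Process(cmd).!      // uncomment to actually run the command
  }
}
```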
- proxy_opt = "" + proxy_opt = [] if opts.proxy_port != None: - proxy_opt = "-D " + opts.proxy_port - subprocess.check_call("ssh -o StrictHostKeyChecking=no -i %s %s %s@%s" % - (opts.identity_file, proxy_opt, opts.user, master), shell=True) + proxy_opt = ['-D', opts.proxy_port] + subprocess.check_call( + ssh_command(opts) + proxy_opt + ['-t', "%s@%s" % (opts.user, master)]) elif action == "get-master": (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) @@ -715,6 +762,13 @@ def main(): sys.exit(1) +def main(): + try: + real_main() + except UsageError, e: + print >> stderr, "\nError:\n", e + + if __name__ == "__main__": logging.basicConfig() main() diff --git a/examples/pom.xml b/examples/pom.xml index ca06a9ad8dee7..3c0a8d06edf32 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.8.0-SNAPSHOT + 0.9.0-incubating-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index f4190148b18f4..4cff5e370017f 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.8.0-SNAPSHOT + 0.9.0-incubating-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index b9e872255ec63..844ba74252d0f 100644 --- a/pom.xml +++ b/pom.xml @@ -25,7 +25,7 @@ org.apache.spark spark-parent - 0.8.0-SNAPSHOT + 0.9.0-incubating-SNAPSHOT pom Spark Project Parent POM http://spark.incubator.apache.org/ @@ -552,7 +552,6 @@ true -unchecked - -optimise -deprecation diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index b5e65a1856903..67d03f987f31d 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -79,9 +79,10 @@ object SparkBuild extends Build { def sharedSettings = Defaults.defaultSettings ++ Seq( organization := "org.apache.spark", - version := "0.8.0-SNAPSHOT", + version := "0.9.0-incubating-SNAPSHOT", scalaVersion := "2.10.2", - scalacOptions := Seq("-unchecked", "-optimize", "-deprecation", "-target:" + SCALAC_JVM_VERSION), + scalacOptions := Seq("-Xmax-classfile-name", "120", "-unchecked", "-deprecation", + "-target:" + SCALAC_JVM_VERSION), javacOptions := Seq("-target", JAVAC_JVM_VERSION, "-source", JAVAC_JVM_VERSION), unmanagedJars in Compile <<= baseDirectory map { base => (base / "lib" ** "*.jar").classpath }, retrieveManaged := true, diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py index dc205b306f0a9..a475959090297 100644 --- a/python/pyspark/shell.py +++ b/python/pyspark/shell.py @@ -35,7 +35,7 @@ ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ - /__ / .__/\_,_/_/ /_/\_\ version 0.8.0 + /__ / .__/\_,_/_/ /_/\_\ version 0.9.0-SNAPSHOT /_/ """ print "Using Python version %s (%s, %s)" % ( diff --git a/repl-bin/pom.xml b/repl-bin/pom.xml index 3685561501d67..05aadc7bdf2f3 100644 --- a/repl-bin/pom.xml +++ b/repl-bin/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.8.0-SNAPSHOT + 0.9.0-incubating-SNAPSHOT ../pom.xml diff --git a/repl/pom.xml b/repl/pom.xml index a7b5e1f3c7a7a..d4b1ea10be536 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.8.0-SNAPSHOT + 0.9.0-incubating-SNAPSHOT ../pom.xml diff --git a/repl/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala b/repl/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala index 31596cc02fa29..21b1ba305d110 100644 --- a/repl/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala +++ b/repl/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala @@ -24,7 +24,7 @@ trait SparkILoopInit { ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ 
_ `/ __/ '_/ - /___/ .__/\_,_/_/ /_/\_\ version 0.8.0 + /___/ .__/\_,_/_/ /_/\_\ version 0.9.0-SNAPSHOT /_/ """) import Properties._ diff --git a/streaming/pom.xml b/streaming/pom.xml index 8367256004e43..c74e76d526e7c 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.8.0-SNAPSHOT + 0.9.0-incubating-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 0933c75a7fd69..2456457722406 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 0.8.0-SNAPSHOT + 0.9.0-incubating-SNAPSHOT ../pom.xml diff --git a/yarn/pom.xml b/yarn/pom.xml index 47e27ee41cf80..7f852d3540d8e 100644 --- a/yarn/pom.xml +++ b/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 0.8.0-SNAPSHOT + 0.9.0-incubating-SNAPSHOT ../pom.xml diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 844c707834ecf..33620101067b8 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -224,8 +224,8 @@ class Client(conf: Configuration, args: ClientArguments) extends YarnClientImpl // Add Xmx for am memory JAVA_OPTS += "-Xmx" + amMemory + "m " - JAVA_OPTS += " -Djava.io.tmpdir=" + new Path(Environment.PWD.$(), - YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR) + JAVA_OPTS += " -Djava.io.tmpdir=" + + new Path(Environment.PWD.$(), YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR) + " " // Commenting it out for now - so that people can refer to the properties if required. Remove it once cpuset version is pushed out. @@ -241,6 +241,7 @@ class Client(conf: Configuration, args: ClientArguments) extends YarnClientImpl JAVA_OPTS += " -XX:CMSIncrementalDutyCycleMin=0 " JAVA_OPTS += " -XX:CMSIncrementalDutyCycle=10 " } + if (env.isDefinedAt("SPARK_JAVA_OPTS")) { JAVA_OPTS += env("SPARK_JAVA_OPTS") + " " } diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/WorkerRunnable.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/WorkerRunnable.scala index 6229167cb4426..a60e8a300786e 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/WorkerRunnable.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/WorkerRunnable.scala @@ -77,8 +77,9 @@ class WorkerRunnable(container: Container, conf: Configuration, masterAddress: S JAVA_OPTS += env("SPARK_JAVA_OPTS") + " " } - JAVA_OPTS += " -Djava.io.tmpdir=" + new Path(Environment.PWD.$(), - YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR) + JAVA_OPTS += " -Djava.io.tmpdir=" + + new Path(Environment.PWD.$(), YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR) + " " + // Commenting it out for now - so that people can refer to the properties if required. Remove it once cpuset version is pushed out. // The context is, default gc for server class machines end up using all cores to do gc - hence if there are multiple containers in same
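In both YARN hunks above, the java.io.tmpdir option is now assembled across two lines and gains a trailing " ", presumably so that whatever is appended to JAVA_OPTS next cannot fuse onto the tmpdir path. A tiny self-contained illustration of why that separator matters; the paths and the follow-up flag are made up.

```scala
// JAVA_OPTS is built by repeated string appends, so a missing separator merges two flags.
object JavaOptsSpacingSketch {
  def main(args: Array[String]) {
    var withoutSpace = ""
    withoutSpace += " -Djava.io.tmpdir=" + "/yarn/local/container_01/tmp"
    withoutSpace += "-XX:+UseConcMarkSweepGC"
    var withSpace = ""
    withSpace += " -Djava.io.tmpdir=" + "/yarn/local/container_01/tmp" + " "
    withSpace += "-XX:+UseConcMarkSweepGC"
    println(withoutSpace) // ... tmpdir=/yarn/local/container_01/tmp-XX:+UseConcMarkSweepGC (broken)
    println(withSpace)    // ... tmpdir=/yarn/local/container_01/tmp -XX:+UseConcMarkSweepGC (intended)
  }
}
```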