From 48978abfa4d8f2cf79a4b053cc8bc7254cc2d61b Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 15 Mar 2016 09:44:48 -0700 Subject: [PATCH] [SPARK-13576][BUILD] Don't create assembly for examples. As part of the goal to stop creating assemblies in Spark, this change modifies the mvn and sbt builds to not create an assembly for examples. Instead, dependencies are copied to the build directory (under target/scala-xx/jars), and in the final archive, into the "examples/jars" directory. To avoid having to deal too much with Windows batch files, I made examples run through the launcher library; the spark-submit launcher now has a special mode to run examples, which adds all the necessary jars to the spark-submit command line, and replaces the bash and batch scripts that were used to run examples. The scripts are now just thin wrappers around spark-submit; another advantage is that now all spark-submit options are supported. There are a few glitches: in the mvn build, a lot of duplicated dependencies get copied, because they are promoted to "compile" scope due to extra dependencies in the examples module (such as HBase). In the sbt build, all dependencies are copied, because there doesn't seem to be an easy way to filter things. I plan to clean some of this up when the rest of the tasks are finished. When the main assembly is replaced with jars, we can remove duplicate jars from the examples directory during packaging. Tested by running SparkPi in: maven build, sbt build, dist created by make-distribution.sh. Finally: note that running the "assembly" target in sbt doesn't build the examples anymore. You need to run "package" for that. Author: Marcelo Vanzin Closes #11452 from vanzin/SPARK-13576. 
--- bin/run-example | 55 +----------- bin/run-example.cmd | 7 +- bin/run-example2.cmd | 85 ------------------- dev/make-distribution.sh | 5 +- examples/pom.xml | 54 ++++++------ .../launcher/SparkSubmitCommandBuilder.java | 68 +++++++++++++-- .../SparkSubmitCommandBuilderSuite.java | 18 ++++ pom.xml | 3 + project/SparkBuild.scala | 41 ++++++++- 9 files changed, 157 insertions(+), 179 deletions(-) delete mode 100644 bin/run-example2.cmd diff --git a/bin/run-example b/bin/run-example index e1b0d5789bed6..dd0e3c4120260 100755 --- a/bin/run-example +++ b/bin/run-example @@ -21,56 +21,5 @@ if [ -z "${SPARK_HOME}" ]; then export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" fi -EXAMPLES_DIR="${SPARK_HOME}"/examples - -. "${SPARK_HOME}"/bin/load-spark-env.sh - -if [ -n "$1" ]; then - EXAMPLE_CLASS="$1" - shift -else - echo "Usage: ./bin/run-example [example-args]" 1>&2 - echo " - set MASTER=XX to use a specific master" 1>&2 - echo " - can use abbreviated example class name relative to com.apache.spark.examples" 1>&2 - echo " (e.g. SparkPi, mllib.LinearRegression, streaming.KinesisWordCountASL)" 1>&2 - exit 1 -fi - -if [ -f "${SPARK_HOME}/RELEASE" ]; then - JAR_PATH="${SPARK_HOME}/lib" -else - JAR_PATH="${EXAMPLES_DIR}/target/scala-${SPARK_SCALA_VERSION}" -fi - -JAR_COUNT=0 - -for f in "${JAR_PATH}"/spark-examples-*hadoop*.jar; do - if [[ ! -e "$f" ]]; then - echo "Failed to find Spark examples assembly in ${SPARK_HOME}/lib or ${SPARK_HOME}/examples/target" 1>&2 - echo "You need to build Spark before running this program" 1>&2 - exit 1 - fi - SPARK_EXAMPLES_JAR="$f" - JAR_COUNT=$((JAR_COUNT+1)) -done - -if [ "$JAR_COUNT" -gt "1" ]; then - echo "Found multiple Spark examples assembly jars in ${JAR_PATH}" 1>&2 - ls "${JAR_PATH}"/spark-examples-*hadoop*.jar 1>&2 - echo "Please remove all but one jar." 1>&2 - exit 1 -fi - -export SPARK_EXAMPLES_JAR - -EXAMPLE_MASTER=${MASTER:-"local[*]"} - -if [[ ! 
$EXAMPLE_CLASS == org.apache.spark.examples* ]]; then - EXAMPLE_CLASS="org.apache.spark.examples.$EXAMPLE_CLASS" -fi - -exec "${SPARK_HOME}"/bin/spark-submit \ - --master $EXAMPLE_MASTER \ - --class $EXAMPLE_CLASS \ - "$SPARK_EXAMPLES_JAR" \ - "$@" +export _SPARK_CMD_USAGE="Usage: ./bin/run-example [options] example-class [example args]" +exec "${SPARK_HOME}"/bin/spark-submit run-example "$@" diff --git a/bin/run-example.cmd b/bin/run-example.cmd index 64f6bc3728d07..f9b786e92b823 100644 --- a/bin/run-example.cmd +++ b/bin/run-example.cmd @@ -17,7 +17,6 @@ rem See the License for the specific language governing permissions and rem limitations under the License. rem -rem This is the entry point for running a Spark example. To avoid polluting -rem the environment, it just launches a new cmd to do the real work. - -cmd /V /E /C "%~dp0run-example2.cmd" %* +set SPARK_HOME=%~dp0.. +set _SPARK_CMD_USAGE=Usage: ./bin/run-example [options] example-class [example args] +cmd /V /E /C "%~dp0spark-submit.cmd" run-example %* diff --git a/bin/run-example2.cmd b/bin/run-example2.cmd deleted file mode 100644 index fada43581d184..0000000000000 --- a/bin/run-example2.cmd +++ /dev/null @@ -1,85 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-rem See the License for the specific language governing permissions and -rem limitations under the License. -rem - -set SCALA_VERSION=2.10 - -rem Figure out where the Spark framework is installed -set SPARK_HOME=%~dp0.. - -call "%SPARK_HOME%\bin\load-spark-env.cmd" - -rem Test that an argument was given -if not "x%1"=="x" goto arg_given - echo Usage: run-example ^ [example-args] - echo - set MASTER=XX to use a specific master - echo - can use abbreviated example class name relative to com.apache.spark.examples - echo (e.g. SparkPi, mllib.LinearRegression, streaming.KinesisWordCountASL) - goto exit -:arg_given - -set EXAMPLES_DIR=%SPARK_HOME%\examples - -rem Figure out the JAR file that our examples were packaged into. -set SPARK_EXAMPLES_JAR= -if exist "%SPARK_HOME%\RELEASE" ( - for %%d in ("%SPARK_HOME%\lib\spark-examples*.jar") do ( - set SPARK_EXAMPLES_JAR=%%d - ) -) else ( - for %%d in ("%EXAMPLES_DIR%\target\scala-%SCALA_VERSION%\spark-examples*.jar") do ( - set SPARK_EXAMPLES_JAR=%%d - ) -) -if "x%SPARK_EXAMPLES_JAR%"=="x" ( - echo Failed to find Spark examples assembly JAR. - echo You need to build Spark before running this program. - goto exit -) - -rem Set master from MASTER environment variable if given -if "x%MASTER%"=="x" ( - set EXAMPLE_MASTER=local[*] -) else ( - set EXAMPLE_MASTER=%MASTER% -) - -rem If the EXAMPLE_CLASS does not start with org.apache.spark.examples, add that -set EXAMPLE_CLASS=%1 -set PREFIX=%EXAMPLE_CLASS:~0,25% -if not %PREFIX%==org.apache.spark.examples ( - set EXAMPLE_CLASS=org.apache.spark.examples.%EXAMPLE_CLASS% -) - -rem Get the tail of the argument list, to skip the first one. This is surprisingly -rem complicated on Windows. 
-set "ARGS=" -:top -shift -if "%~1" neq "" ( - set ARGS=%ARGS% "%~1" - goto :top -) -if defined ARGS set ARGS=%ARGS:~1% - -call "%SPARK_HOME%\bin\spark-submit.cmd" ^ - --master %EXAMPLE_MASTER% ^ - --class %EXAMPLE_CLASS% ^ - "%SPARK_EXAMPLES_JAR%" %ARGS% - -:exit diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index ac4e9b90f0177..dbdd42ff9e087 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -166,11 +166,14 @@ echo "Build flags: $@" >> "$DISTDIR/RELEASE" # Copy jars cp "$SPARK_HOME"/assembly/target/scala*/*assembly*hadoop*.jar "$DISTDIR/lib/" -cp "$SPARK_HOME"/examples/target/scala*/spark-examples*.jar "$DISTDIR/lib/" # This will fail if the -Pyarn profile is not provided # In this case, silence the error and ignore the return code of this command cp "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar "$DISTDIR/lib/" &> /dev/null || : +# Copy examples and dependencies +mkdir -p "$DISTDIR/examples/jars" +cp "$SPARK_HOME"/examples/target/scala*/jars/* "$DISTDIR/examples/jars" + # Copy example sources (needed for python and SQL) mkdir -p "$DISTDIR/examples/src/main" cp -r "$SPARK_HOME"/examples/src/main "$DISTDIR/examples/src/" diff --git a/examples/pom.xml b/examples/pom.xml index 92bb373c7382d..1aa730c0dcdac 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -322,36 +322,36 @@ org.apache.maven.plugins - maven-shade-plugin + maven-jar-plugin + + + prepare-test-jar + none + + test-jar + + + - false - ${project.build.directory}/scala-${scala.binary.version}/spark-examples-${project.version}-hadoop${hadoop.version}.jar - - - *:* - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - reference.conf - - - log4j.properties - - + ${jars.target.dir} + + org.apache.maven.plugins + maven-dependency-plugin + + + package + + copy-dependencies + + + runtime + ${jars.target.dir} + + + + diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java 
b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java index b2dd6ac4c3982..56e4107c5a0c7 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java +++ b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java @@ -30,7 +30,8 @@ * driver-side options and special parsing behavior needed for the special-casing certain internal * Spark applications. *

- * This class has also some special features to aid launching pyspark. + * This class has also some special features to aid launching shells (pyspark and sparkR) and also + * examples. */ class SparkSubmitCommandBuilder extends AbstractCommandBuilder { @@ -62,6 +63,17 @@ class SparkSubmitCommandBuilder extends AbstractCommandBuilder { */ static final String SPARKR_SHELL_RESOURCE = "sparkr-shell"; + /** + * Name of app resource used to identify examples. When running examples, args[0] should be + * this name. The app resource will identify the example class to run. + */ + static final String RUN_EXAMPLE = "run-example"; + + /** + * Prefix for example class names. + */ + static final String EXAMPLE_CLASS_PREFIX = "org.apache.spark.examples."; + /** * This map must match the class names for available special classes, since this modifies the way * command line parsing works. This maps the class name to the resource to use when calling @@ -78,6 +90,7 @@ class SparkSubmitCommandBuilder extends AbstractCommandBuilder { final List sparkArgs; private final boolean printInfo; + private final boolean isExample; /** * Controls whether mixing spark-submit arguments with app arguments is allowed. 
This is needed @@ -89,10 +102,13 @@ class SparkSubmitCommandBuilder extends AbstractCommandBuilder { SparkSubmitCommandBuilder() { this.sparkArgs = new ArrayList<>(); this.printInfo = false; + this.isExample = false; } SparkSubmitCommandBuilder(List args) { - this.sparkArgs = new ArrayList<>(); + this.allowsMixedArguments = false; + + boolean isExample = false; List submitArgs = args; if (args.size() > 0 && args.get(0).equals(PYSPARK_SHELL)) { this.allowsMixedArguments = true; @@ -102,10 +118,14 @@ class SparkSubmitCommandBuilder extends AbstractCommandBuilder { this.allowsMixedArguments = true; appResource = SPARKR_SHELL_RESOURCE; submitArgs = args.subList(1, args.size()); - } else { - this.allowsMixedArguments = false; + } else if (args.size() > 0 && args.get(0).equals(RUN_EXAMPLE)) { + isExample = true; + submitArgs = args.subList(1, args.size()); } + this.sparkArgs = new ArrayList<>(); + this.isExample = isExample; + OptionParser parser = new OptionParser(); parser.parse(submitArgs); this.printInfo = parser.infoRequested; @@ -155,6 +175,10 @@ List buildSparkSubmitArgs() { args.add(propertiesFile); } + if (isExample) { + jars.addAll(findExamplesJars()); + } + if (!jars.isEmpty()) { args.add(parser.JARS); args.add(join(",", jars)); @@ -170,6 +194,9 @@ List buildSparkSubmitArgs() { args.add(join(",", pyFiles)); } + if (!printInfo) { + checkArgument(!isExample || mainClass != null, "Missing example class name."); + } if (mainClass != null) { args.add(parser.CLASS); args.add(mainClass); @@ -308,6 +335,25 @@ private boolean isThriftServer(String mainClass) { mainClass.equals("org.apache.spark.sql.hive.thriftserver.HiveThriftServer2")); } + private List findExamplesJars() { + List examplesJars = new ArrayList<>(); + String sparkHome = getSparkHome(); + + File jarsDir; + if (new File(sparkHome, "RELEASE").isFile()) { + jarsDir = new File(sparkHome, "examples/jars"); + } else { + jarsDir = new File(sparkHome, + String.format("examples/target/scala-%s/jars", 
getScalaVersion())); + } + checkState(jarsDir.isDirectory(), "Examples jars directory '%s' does not exist.", + jarsDir.getAbsolutePath()); + + for (File f: jarsDir.listFiles()) { + examplesJars.add(f.getAbsolutePath()); + } + return examplesJars; + } private class OptionParser extends SparkSubmitOptionParser { @@ -367,6 +413,14 @@ protected boolean handleUnknown(String opt) { if (allowsMixedArguments) { appArgs.add(opt); return true; + } else if (isExample) { + String className = opt; + if (!className.startsWith(EXAMPLE_CLASS_PREFIX)) { + className = EXAMPLE_CLASS_PREFIX + className; + } + mainClass = className; + appResource = "spark-internal"; + return false; } else { checkArgument(!opt.startsWith("-"), "Unrecognized option: %s", opt); sparkArgs.add(opt); @@ -376,8 +430,10 @@ protected boolean handleUnknown(String opt) { @Override protected void handleExtraArgs(List extra) { - for (String arg : extra) { - sparkArgs.add(arg); + if (isExample) { + appArgs.addAll(extra); + } else { + sparkArgs.addAll(extra); } } diff --git a/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java b/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java index 00f967122bd70..b7f4f2efc5d84 100644 --- a/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java +++ b/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java @@ -151,6 +151,24 @@ public void testPySparkFallback() throws Exception { assertEquals("arg1", cmd.get(cmd.size() - 1)); } + @Test + public void testExamplesRunner() throws Exception { + List sparkSubmitArgs = Arrays.asList( + SparkSubmitCommandBuilder.RUN_EXAMPLE, + parser.MASTER + "=foo", + parser.DEPLOY_MODE + "=bar", + "SparkPi", + "42"); + + Map env = new HashMap(); + List cmd = buildCommand(sparkSubmitArgs, env); + assertEquals("foo", findArgValue(cmd, parser.MASTER)); + assertEquals("bar", findArgValue(cmd, parser.DEPLOY_MODE)); + 
assertEquals(SparkSubmitCommandBuilder.EXAMPLE_CLASS_PREFIX + "SparkPi", + findArgValue(cmd, parser.CLASS)); + assertEquals("42", cmd.get(cmd.size() - 1)); + } + private void testCmdBuilder(boolean isDriver, boolean useDefaultPropertyFile) throws Exception { String deployMode = isDriver ? "client" : "cluster"; diff --git a/pom.xml b/pom.xml index 0faa691c5e78b..92a32e7797bbc 100644 --- a/pom.xml +++ b/pom.xml @@ -178,6 +178,9 @@ ${java.home} + + ${project.build.directory}/scala-${scala.binary.version}/jars +