Merge branch 'issue#7' into 'master'
Fixing Issue#7

move parameters to be set into conf/env.sh

See merge request !11
kiszk committed Jun 2, 2015
2 parents 14b9f97 + e442c46 commit 7609228
Showing 70 changed files with 851 additions and 555 deletions.
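The change applies one pattern across all benchmarks: tunable parameters move out of each bin/config.sh into a per-benchmark conf/env.sh, and config.sh emits a spark-submit flag only for variables the user actually set. A minimal sketch of the flow, assuming config.sh sources the per-benchmark env.sh (the exact sourcing wiring is not shown in this diff):

    # conf/env.sh -- per-benchmark tunables; an unset variable contributes no flag
    SPARK_STORAGE_MEMORYFRACTION=0.5
    SPARK_EXECUTOR_MEMORY=6g

    # bin/config.sh (sketch) -- guard each option on its env.sh variable
    . "$(dirname "$0")/../conf/env.sh"   # assumed wiring; the real sourcing may differ

    SPARK_OPT=
    if [ -n "$SPARK_STORAGE_MEMORYFRACTION" ]; then
        SPARK_OPT="${SPARK_OPT} --conf spark.storage.memoryFraction=${SPARK_STORAGE_MEMORYFRACTION}"
    fi
    if [ -n "$SPARK_EXECUTOR_MEMORY" ]; then
        SPARK_OPT="${SPARK_OPT} --conf spark.executor.memory=${SPARK_EXECUTOR_MEMORY}"
    fi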
59 changes: 34 additions & 25 deletions ConnectedComponent/bin/config.sh
@@ -17,33 +17,42 @@ if [ ${COMPRESS_GLOBAL} -eq 1 ]; then
OUTPUT_HDFS=${OUTPUT_HDFS}-comp
fi

# application parameters
numV=50000
numPar=200
mu=4.0
sigma=1.3
NUM_TRIALS=1

# Either stand alone or yarn cluster
APP_MASTER=${SPARK_MASTER}
#APP_MASTER=yarn-cluster
nexe=60
dmem=1g
ecore=1
emem=6g
[ -n "$EXECUTOR_GLOBAL_MEM" ] && emem=$EXECUTOR_GLOBAL_MEM
memoryFraction=0.5
[ -n "$MEM_FRACTION_GLOBAL" ] && memoryFraction=${MEM_FRACTION_GLOBAL}
rdd_compression=false
spark_ser=KryoSerializer
rddcodec=lzf
SPARK_OPT="--conf spark.storage.memoryFraction=${memoryFraction} \
--conf spark.executor.memory=${emem} \
--conf spark.serializer=org.apache.spark.serializer.${spark_ser} \
--conf spark.rdd.compress=${rdd_compression} \
--conf spark.io.compression.codec=${rddcodec} \
--conf spark.default.parallelism=${num_task} "
#YARN_OPT="--num-executors $nexe --driver-memory $dmem --executor-memory $emem --executor-cores $ecore"

SPARK_OPT=
if [ ! -z "$SPARK_STORAGE_MEMORYFRACTION" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.storage.memoryFraction=${SPARK_STORAGE_MEMORYFRACTION}"
fi
if [ ! -z "$SPARK_EXECUTOR_MEMORY" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.executor.memory=${SPARK_EXECUTOR_MEMORY}"
fi
if [ ! -z "$SPARK_SERIALIZER" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.serializer=${SPARK_SERIALIZER}"
fi
if [ ! -z "$SPARK_RDD_COMPRESS" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.rdd.compress=${SPARK_RDD_COMPRESS}"
fi
if [ ! -z "$SPARK_IO_COMPRESSION_CODEC" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.io.compression.codec=${SPARK_IO_COMPRESSION_CODEC}"
fi
if [ ! -z "$SPARK_DEFAULT_PARALLELISM" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.default.parallelism=${SPARK_DEFAULT_PARALLELISM}"
fi

YARN_OPT=
if [ "$MASTER" = "yarn" ]; then
if [ ! -z "$SPARK_EXECUTOR_INSTANCES" ]; then
YARN_OPT="${YARN_OPT} --num-executors ${SPARK_EXECUTOR_INSTANCES}"
fi
if [ ! -z "$SPARK_EXECUTOR_CORES" ]; then
YARN_OPT="${YARN_OPT} --executor-cores ${SPARK_EXECUTOR_CORES}"
fi
if [ ! -z "$SPARK_DRIVER_MEMORY" ]; then
YARN_OPT="${YARN_OPT} --driver-memory ${SPARK_DRIVER_MEMORY}"
fi
fi
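
Because every flag is guarded, an unset variable simply drops out. With only SPARK_STORAGE_MEMORYFRACTION=0.5 set, as in the new conf/env.sh further down, the submit in bin/run.sh resolves to roughly this (illustrative; standalone master assumed):

    ${SPARK_HOME}/bin/spark-submit --class $CLASS --master ${SPARK_MASTER} \
        --conf spark.storage.memoryFraction=0.5 \
        $JAR ${OPTION}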


#input benreport
function print_config(){
2 changes: 1 addition & 1 deletion ConnectedComponent/bin/gen_data.sh
@@ -13,7 +13,7 @@ JAR="${DIR}/../common/DataGen/target/scala-2.10/datagen_2.10-1.0.jar"
CLASS="src.main.scala.GraphDataGen"
OPTION="${INOUT_SCHEME}${INPUT_HDFS} ${numV} ${numPar} ${mu} ${sigma}"

START_TS=`ssh ${master} "date +%F-%T"`
START_TS=`get_start_ts`

setup
START_TIME=`timestamp`
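
get_start_ts replaces the inline ssh call and must be invoked with command substitution so the timestamp, not the function name, lands in START_TS. Its definition is not part of this diff; given the line it supersedes, a plausible shared helper (an assumption) is:

    # hypothetical helper in the shared scripts; mirrors the removed inline call
    function get_start_ts() {
        ssh ${master} "date +%F-%T"
    }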
2 changes: 1 addition & 1 deletion ConnectedComponent/bin/run.sh
@@ -25,7 +25,7 @@ for((i=0;i<${NUM_TRIALS};i++)); do

${RM} -r ${OUTPUT_HDFS}
purge_data "${MC_LIST}"
START_TS=`ssh ${master} "date +%F-%T"`
START_TS=`get_start_ts`
START_TIME=`timestamp`
exec ${SPARK_HOME}/bin/spark-submit --class $CLASS --master ${APP_MASTER} ${YARN_OPT} ${SPARK_OPT} $JAR ${OPTION} 2>&1|tee ${BENCH_NUM}/${APP}_run_${START_TS}.dat
END_TIME=`timestamp`
7 changes: 7 additions & 0 deletions ConnectedComponent/conf/env.sh
@@ -0,0 +1,7 @@
# application parameters
numV=50000
numPar=-1
mu=4.0
sigma=1.3

SPARK_STORAGE_MEMORYFRACTION=0.5
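
Any other guarded option can be enabled by adding a line here. Note that SPARK_SERIALIZER is now passed through verbatim, so it needs the full class name that the old config.sh used to prepend. A hypothetical extension:

    SPARK_EXECUTOR_MEMORY=6g
    SPARK_SERIALIZER=org.apache.spark.serializer.KryoSerializer
    SPARK_DEFAULT_PARALLELISM=200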
81 changes: 33 additions & 48 deletions DecisionTree/bin/config.sh
@@ -13,56 +13,41 @@ INPUT_HDFS=${DATA_HDFS}/${APP}/Input
OUTPUT_HDFS_Classification=${DATA_HDFS}/${APP}/Output-Classification
OUTPUT_HDFS_Regression=${DATA_HDFS}/${APP}/Output-Regression



# for gen_data.sh; 200M data size = 1 million points
NUM_OF_EXAMPLES=5000 #00000
NUM_OF_PARTITIONS=120 #0
memoryFraction=0.01
[ -n "$MEM_FRACTION_GLOBAL" ] && memoryFraction=${MEM_FRACTION_GLOBAL}
NUM_OF_FEATURES=6

# for run.sh
NUM_OF_CLASS_C=10
impurityC="gini"
maxDepthC=5
maxBinsC=100
modeC="Classification"
#${NUM_OF_CLASS_C} ${impurityC} ${maxDepthC} ${maxBinsC} ${modeC}

NUM_OF_CLASS_R=10
impurityR="variance"
maxDepthR=5
maxBinsR=100
modeR="Regression"
#${NUM_OF_CLASS_R} ${impurityR} ${maxDepthR} ${maxBinsR} ${modeR}

MAX_ITERATION=3
NUM_TRIALS=1


# either stand alone or yarn cluster
APP_MASTER=${SPARK_MASTER}
#APP_MASTER=yarn-cluster
num_task=20
nexe=60
dmem=1024m
ecore=1
emem=1024m
[ -n "$EXECUTOR_GLOBAL_MEM" ] && emem=$EXECUTOR_GLOBAL_MEM
memoryFraction=0.79
[ -n "$MEM_FRACTION_GLOBAL" ] && memoryFraction=${MEM_FRACTION_GLOBAL}
rdd_compression=false
spark_ser=KryoSerializer
rddcodec=lzf
SPARK_OPT="--conf spark.storage.memoryFraction=${memoryFraction} \
--conf spark.executor.memory=${emem} \
--conf spark.serializer=org.apache.spark.serializer.${spark_ser} \
--conf spark.rdd.compress=${rdd_compression} \
--conf spark.io.compression.codec=${rddcodec} \
--conf spark.default.parallelism=${num_task} "
#YARN_OPT="--num-executors $nexe --driver-memory $dmem \
#--executor-memory $emem --executor-cores $ecore"

SPARK_OPT=
if [ ! -z "$SPARK_STORAGE_MEMORYFRACTION" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.storage.memoryFraction=${SPARK_STORAGE_MEMORYFRACTION}"
fi
if [ ! -z "$SPARK_EXECUTOR_MEMORY" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.executor.memory=${SPARK_EXECUTOR_MEMORY}"
fi
if [ ! -z "$SPARK_SERIALIZER" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.serializer=${SPARK_SERIALIZER}"
fi
if [ ! -z "$SPARK_RDD_COMPRESS" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.rdd.compress=${SPARK_RDD_COMPRESS}"
fi
if [ ! -z "$SPARK_IO_COMPRESSION_CODEC" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.io.compression.codec=${SPARK_IO_COMPRESSION_CODEC}"
fi
if [ ! -z "$SPARK_DEFAULT_PARALLELISM" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.default.parallelism=${SPARK_DEFAULT_PARALLELISM}"
fi

YARN_OPT=
if [ "$MASTER" = "yarn" ]; then
if [ ! -z "$SPARK_EXECUTOR_INSTANCES" ]; then
YARN_OPT="${YARN_OPT} --num-executors ${SPARK_EXECUTOR_INSTANCES}"
fi
if [ ! -z "$SPARK_EXECUTOR_CORES" ]; then
YARN_OPT="${YARN_OPT} --executor-cores ${SPARK_EXECUTOR_CORES}"
fi
if [ ! -z "$SPARK_DRIVER_MEMORY" ]; then
YARN_OPT="${YARN_OPT} --driver-memory ${SPARK_DRIVER_MEMORY}"
fi
fi
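
On YARN (MASTER=yarn), executor count, cores, and driver memory go through spark-submit's native flags rather than --conf. For example, with these hypothetical settings in conf/env.sh, chosen to match the old hard-coded defaults:

    SPARK_EXECUTOR_INSTANCES=60
    SPARK_EXECUTOR_CORES=1
    SPARK_DRIVER_MEMORY=1024m

YARN_OPT would expand to "--num-executors 60 --executor-cores 1 --driver-memory 1024m".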


#input benreport
5 changes: 2 additions & 3 deletions DecisionTree/bin/gen_data.sh
@@ -1,6 +1,5 @@
#!/bin/bash


# configure
bin=`dirname "$0"`
bin=`cd "$bin"; pwd`
@@ -21,10 +20,10 @@ JAR="${MllibJar}"
CLASS="org.apache.spark.mllib.util.SVMDataGenerator"
OPTION=" ${SPARK_MASTER} ${INOUT_SCHEME}${INPUT_HDFS} ${NUM_OF_EXAMPLES} ${NUM_OF_FEATURES} ${NUM_OF_PARTITIONS} "

START_TS=`ssh ${master} "date +%F-%T"`
START_TS=`get_start_ts`
setup
START_TIME=`timestamp`
exec ${SPARK_HOME}/bin/spark-submit --class $CLASS $JAR ${OPTION} 2>&1|tee ${BENCH_NUM}/DecisionTree_gendata_${START_TS}.dat
exec ${SPARK_HOME}/bin/spark-submit --class $CLASS --master ${APP_MASTER} ${YARN_OPT} ${SPARK_OPT} $JAR ${OPTION} 2>&1|tee ${BENCH_NUM}/DecisionTree_gendata_${START_TS}.dat
END_TIME=`timestamp`

SIZE=`${DU} -s ${INPUT_HDFS} | awk '{ print $1 }'`
4 changes: 2 additions & 2 deletions DecisionTree/bin/run.sh
@@ -22,7 +22,7 @@ for((i=0;i<${NUM_TRIALS};i++)); do
${RM} -r ${OUTPUT_HDFS_Classification}
purge_data "${MC_LIST}"
OPTION=" ${INOUT_SCHEME}${INPUT_HDFS} ${INOUT_SCHEME}${OUTPUT_HDFS_Classification} ${NUM_OF_CLASS_C} ${impurityC} ${maxDepthC} ${maxBinsC} ${modeC}"
START_TS=`ssh ${master} "date +%F-%T"`
START_TS=`get_start_ts`
START_TIME=`timestamp`
exec ${SPARK_HOME}/bin/spark-submit --class $CLASS --master ${SPARK_MASTER} --conf spark.storage.memoryFraction=${memoryFraction} $JAR ${OPTION} 2>&1|tee ${BENCH_NUM}/DecisionTree_run_${START_TS}.dat
END_TIME=`timestamp`
@@ -41,7 +41,7 @@ if [ 1 -eq 0 ]; then
purge_data "${MC_LIST}"
OPTION=" ${INOUT_SCHEME}${INPUT_HDFS} ${INOUT_SCHEME}${OUTPUT_HDFS_Regression} ${NUM_OF_CLASS_R} ${impurityR} ${maxDepthR} ${maxBinsR} ${modeR} "
START_TIME=`timestamp`
START_TS=`ssh ${master} "date +%F-%T"`
START_TS=`get_start_ts`
exec ${SPARK_HOME}/bin/spark-submit --class $CLASS --master ${SPARK_MASTER} $JAR ${OPTION} 2>&1|tee ${BENCH_NUM}/DecisionTree_run_${START_TS}.dat
END_TIME=`timestamp`
gen_report "DecisionTree-regression" ${START_TIME} ${END_TIME} ${SIZE} ${START_TS}>> ${BENCH_REPORT}
25 changes: 25 additions & 0 deletions DecisionTree/conf/env.sh
@@ -0,0 +1,25 @@
# for gen_data.sh; 200M data size = 1 million points
NUM_OF_EXAMPLES=5000 #00000
NUM_OF_PARTITIONS=120 #0
memoryFraction=0.01
[ -n "$MEM_FRACTION_GLOBAL" ] && memoryFraction=${MEM_FRACTION_GLOBAL}
NUM_OF_FEATURES=6

# for run.sh
NUM_OF_CLASS_C=10
impurityC="gini"
maxDepthC=5
maxBinsC=100
modeC="Classification"
#${NUM_OF_CLASS_C} ${impurityC} ${maxDepthC} ${maxBinsC} ${modeC}

NUM_OF_CLASS_R=10
impurityR="variance"
maxDepthR=5
maxBinsR=100
modeR="Regression"
#${NUM_OF_CLASS_R} ${impurityR} ${maxDepthR} ${maxBinsR} ${modeR}

MAX_ITERATION=3

SPARK_STORAGE_MEMORYFRACTION=0.79
65 changes: 32 additions & 33 deletions KMeans/bin/config.sh
@@ -17,42 +17,41 @@ if [ ${COMPRESS_GLOBAL} -eq 1 ]; then
OUTPUT_HDFS=${OUTPUT_HDFS}-comp
fi

# for prepare
NUM_OF_POINTS=1000
NUM_OF_CLUSTERS=10
DIMENSIONS=20
SCALING=0.6
NUM_OF_PARTITION=10
#NUM_OF_SAMPLES=20000000
#SAMPLES_PER_INPUTFILE=4000000
#SAMPLES_PER_INPUTFILE=6000000
MAX_ITERATION=5
NUM_RUN=1
NUM_TRIALS=1


# either stand alone or yarn cluster
# Either stand alone or yarn cluster
APP_MASTER=${SPARK_MASTER}
#APP_MASTER=yarn-cluster
numPar=${NUM_OF_PARTITION}
nexe=10
dmem=1g
emem=1g
[ -n "$EXECUTOR_GLOBAL_MEM" ] && emem=$EXECUTOR_GLOBAL_MEM
ecore=6
memoryFraction=0.48
#0.001 rdd=0
[ -n "$MEM_FRACTION_GLOBAL" ] && memoryFraction=${MEM_FRACTION_GLOBAL}

rdd_compression=false
spark_ser=KryoSerializer
rddcodec=lzf
SPARK_OPT="--conf spark.storage.memoryFraction=${memoryFraction} --conf spark.executor.memory=${emem} --conf spark.serializer=org.apache.spark.serializer.${spark_ser} --conf spark.rdd.compress=${rdd_compression} --conf spark.io.compression.codec=${rddcodec} --conf spark.default.parallelism=${numPar}"

SPARK_OPT=
if [ ! -z "$SPARK_STORAGE_MEMORYFRACTION" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.storage.memoryFraction=${SPARK_STORAGE_MEMORYFRACTION}"
fi
if [ ! -z "$SPARK_EXECUTOR_MEMORY" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.executor.memory=${SPARK_EXECUTOR_MEMORY}"
fi
if [ ! -z "$SPARK_SERIALIZER" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.serializer=${SPARK_SERIALIZER}"
fi
if [ ! -z "$SPARK_RDD_COMPRESS" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.rdd.compress=${SPARK_RDD_COMPRESS}"
fi
if [ ! -z "$SPARK_IO_COMPRESSION_CODEC" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.io.compression.codec=${SPARK_IO_COMPRESSION_CODEC}"
fi
if [ ! -z "$SPARK_DEFAULT_PARALLELISM" ]; then
SPARK_OPT="${SPARK_OPT} --conf spark.default.parallelism=${SPARK_DEFAULT_PARALLELISM}"
fi

YARN_OPT=""
#YARN_OPT="--num-executors $nexe --driver-memory $dmem --executor-memory $emem --executor-cores $ecore"

YARN_OPT=
if [ "$MASTER" = "yarn" ]; then
if [ ! -z "$SPARK_EXECUTOR_INSTANCES" ]; then
YARN_OPT="${YARN_OPT} --num-executors ${SPARK_EXECUTOR_INSTANCES}"
fi
if [ ! -z "$SPARK_EXECUTOR_CORES" ]; then
YARN_OPT="${YARN_OPT} --executor-cores ${SPARK_EXECUTOR_CORES}"
fi
if [ ! -z "$SPARK_DRIVER_MEMORY" ]; then
YARN_OPT="${YARN_OPT} --driver-memory ${SPARK_DRIVER_MEMORY}"
fi
fi


function print_config(){
7 changes: 4 additions & 3 deletions KMeans/bin/gen_data.sh
@@ -17,16 +17,17 @@ CLASS="kmeans_min.src.main.scala.KmeansDataGen"
OPTION="${NUM_OF_POINTS} ${NUM_OF_CLUSTERS} ${DIMENSIONS} ${SCALING} ${NUM_OF_PARTITION} ${INOUT_SCHEME}${INPUT_HDFS}"


START_TS=`ssh ${master} "date +%F-%T"`
START_TS=`get_start_ts`
setup
START_TIME=`timestamp`
echo "${SPARK_HOME}/bin/spark-submit --class $CLASS --master ${APP_MASTER} ${YARN_OPT} ${SPARK_OPT} $JAR ${OPTION} 2>&1|tee ${BENCH_NUM}/TEMP_gendata_${START_TS}.dat"
exec ${SPARK_HOME}/bin/spark-submit --class $CLASS --master ${APP_MASTER} ${YARN_OPT} ${SPARK_OPT} $JAR ${OPTION} 2>&1|tee ${BENCH_NUM}/TEMP_gendata_${START_TS}.dat

END_TIME=`timestamp`
SIZE=`${DU} -s ${INPUT_HDFS} | awk '{ print $1 }'`

gen_report "${APP}-gendata" ${START_TIME} ${END_TIME} ${SIZE} ${START_TS}>> ${BENCH_REPORT}
print_config ${BENCH_REPORT}
#gen_report "${APP}-gendata" ${START_TIME} ${END_TIME} ${SIZE} ${START_TS}>> ${BENCH_REPORT}
#print_config ${BENCH_REPORT}
teardown

exit 0
2 changes: 1 addition & 1 deletion KMeans/bin/run.sh
@@ -26,7 +26,7 @@ setup
for((i=0;i<${NUM_TRIALS};i++)); do
${RM} -r ${OUTPUT_HDFS}
purge_data "${MC_LIST}"
START_TS=`ssh ${master} "date +%F-%T"`
START_TS=`get_start_ts`
START_TIME=`timestamp`

exec ${SPARK_HOME}/bin/spark-submit --class $CLASS --master ${APP_MASTER} ${YARN_OPT} ${SPARK_OPT} $JAR ${OPTION} 2>&1|tee ${BENCH_NUM}/${APP}_run_${START_TS}.dat
13 changes: 13 additions & 0 deletions KMeans/conf/env.sh
@@ -0,0 +1,13 @@
# for prepare
NUM_OF_POINTS=1000
NUM_OF_CLUSTERS=10
DIMENSIONS=20
SCALING=0.6
NUM_OF_PARTITION=10
#NUM_OF_SAMPLES=20000000
#SAMPLES_PER_INPUTFILE=4000000
#SAMPLES_PER_INPUTFILE=6000000
MAX_ITERATION=5
NUM_RUN=1

SPARK_STORAGE_MEMORYFRACTION=0.48
