forked from intel/ipex-llm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun.example.sh
executable file
·234 lines (214 loc) · 8.5 KB
/
run.example.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#!/bin/bash
#
# Licensed to Intel Corporation under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# Intel Corporation licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Name this script was invoked as (used in the --help examples) and the
# directory it was launched from.
ME=$(basename "$0")
CURRENT=$(pwd)

# BigDL release to run, and where its jar-with-dependencies lives in the local
# Maven repository for the Spark 1.x and Spark 2.x builds.
BIGDL_VERSION="0.2.0-SNAPSHOT"
BIGDL1_JAR="${HOME}/.m2/repository/com/intel/analytics/bigdl/bigdl/${BIGDL_VERSION}/bigdl-${BIGDL_VERSION}-jar-with-dependencies.jar"
BIGDL2_JAR="${HOME}/.m2/repository/com/intel/analytics/bigdl/bigdl-SPARK_2.0/${BIGDL_VERSION}/bigdl-SPARK_2.0-${BIGDL_VERSION}-jar-with-dependencies.jar"

# Spark distributions the script knows how to download.
SPARK1_DIR="spark-1.6.3-bin-hadoop2.6"
SPARK1_LINK="https://www.apache.org/dist/spark/spark-1.6.3/${SPARK1_DIR}.tgz"
SPARK2_DIR="spark-2.0.2-bin-hadoop2.7"
SPARK2_LINK="https://www.apache.org/dist/spark/spark-2.0.2/${SPARK2_DIR}.tgz"

# Active selection; Spark 1.x unless --spark spark_2.x is given.
SPARK_DIR="${SPARK1_DIR}"
SPARK_LINK="${SPARK1_LINK}"
BIGDL_JAR="${BIGDL1_JAR}"

# Local data set directories, one per model family.
DATA_DIR_MNIST="mnist"
DATA_DIR_CIFAR="cifar-10-batches-bin"
DATA_DIR_IMAGENET="imagenet"

# Training defaults; each can be overridden from the command line.
MODEL_DIR="model"
LEARNING_RATE="0.01"
MAX_EPOCH="90"
#######################################
# Parse the command line into the script's global settings.
# Globals read:    ME, SPARK2_DIR, SPARK2_LINK, BIGDL2_JAR
# Globals written: SPARK_DIR, SPARK_LINK, BIGDL_JAR, MODEL, CLASS, SPARK_URL,
#                  CORES, MEMORY, NODES, BATCH_SIZE, TRAINED_MODEL,
#                  LEARNING_RATE, HDFS_DATA_DIR, MAX_EPOCH
# Arguments:       the script's command-line arguments
# Exits:           0 after printing the examples for -h/--help,
#                  1 when the command line cannot be parsed
#######################################
parse_args() {
  local options
  # getopt reports unknown options / missing values through its exit status;
  # stop instead of continuing with a partially parsed command line.
  if ! options=$(getopt -o p:m:c:s:o:r:n:b:t:l:f:e:h -l spark:,model:,class:,spark-url:,cores:,memory:,nodes:,batch-size:,trained-model:,learning-rate:,hdfs-data-dir:,max-epoch:,help -- "$@"); then
    echo "ERROR: invalid command line, run '$ME --help' for examples" >&2
    exit 1
  fi
  # Re-load the normalised option list produced by getopt as positional params.
  eval set -- "$options"
  while true; do
    case "$1" in
      -p | --spark)
        # Only spark_2.x switches the selection; any other value keeps the
        # Spark 1.x defaults.
        if [[ "$2" == "spark_2.x" ]]; then
          SPARK_DIR=$SPARK2_DIR
          SPARK_LINK=$SPARK2_LINK
          BIGDL_JAR=$BIGDL2_JAR
        fi
        shift 2
        ;;
      -m | --model) MODEL=$2; shift 2 ;;
      -c | --class) CLASS=$2; shift 2 ;;
      -s | --spark-url) SPARK_URL=$2; shift 2 ;;
      -o | --cores) CORES=$2; shift 2 ;;
      -r | --memory) MEMORY=$2; shift 2 ;;
      -n | --nodes) NODES=$2; shift 2 ;;
      -b | --batch-size) BATCH_SIZE=$2; shift 2 ;;
      -t | --trained-model) TRAINED_MODEL=$2; shift 2 ;;
      -l | --learning-rate) LEARNING_RATE=$2; shift 2 ;;
      -f | --hdfs-data-dir) HDFS_DATA_DIR=$2; shift 2 ;;
      -e | --max-epoch) MAX_EPOCH=$2; shift 2 ;;
      -h | --help)
        echo "Example:"
        echo "train lenet: $ME --model lenet --spark-url spark://10.0.0.1:7077 --cores 32 --memory 200g --nodes 4 --batch-size 512"
        echo "train vgg: $ME --model vgg --spark-url spark://10.0.0.1:7077 --cores 32 --memory 200g --nodes 4 --batch-size 512"
        echo "train inception-v1: $ME --model inception-v1 --spark-url spark://10.0.0.1:7077 --cores 32 --memory 200g --nodes 16 --batch-size 2048 --learning-rate 0.0898 --hdfs-data-dir hdfs://10.0.0.1:9000/imagenet"
        echo "run performance: $ME --model perf --spark-url spark://10.0.0.1:7077 --cores 32 --memory 200g --nodes 4"
        exit 0
        ;;
      --)
        shift
        break
        ;;
      *)
        # Without this arm an unrecognised word would never be shifted away
        # and the loop would spin forever.
        echo "ERROR: unexpected argument '$1'" >&2
        exit 1
        ;;
    esac
  done
}
parse_args "$@"
# Make sure the selected Spark distribution is unpacked in the working
# directory, downloading it on first use.
if [[ -d "$SPARK_DIR" ]]; then
  echo "Using existing spark dir $SPARK_DIR"
else
  echo "Downloading $SPARK_DIR from $SPARK_LINK ..."
  # Stop on failure: the job is later launched through
  # ./$SPARK_DIR/bin/spark-submit, which cannot exist without this step.
  if ! { wget "$SPARK_LINK" && tar -xzf "$SPARK_DIR.tgz"; }; then
    echo "ERROR: failed to download or unpack $SPARK_DIR" >&2
    exit 1
  fi
fi
# Fetch the BigDL jar-with-dependencies into the local Maven repository when it
# is not already there.
if [[ ! -f "$BIGDL_JAR" ]]; then
  # The two builds differ only in their Maven artifact id.
  if [[ "$BIGDL_JAR" == "$BIGDL2_JAR" ]]; then
    bigdl_artifact=bigdl-SPARK_2.0
  else
    bigdl_artifact=bigdl
  fi
  mvn dependency:get \
    -DremoteRepositories=https://oss.sonatype.org/content/groups/public/ \
    -DgroupId=com.intel.analytics.bigdl \
    -DartifactId="$bigdl_artifact" \
    -Dversion="$BIGDL_VERSION" \
    -Dclassifier=jar-with-dependencies \
    -Dtransitive=false
  # spark-submit is handed exactly this path, so verify the file rather than
  # trusting the Maven exit status alone.
  if [[ ! -f "$BIGDL_JAR" ]]; then
    echo "ERROR: BigDL jar not found at $BIGDL_JAR after running mvn dependency:get" >&2
    exit 1
  fi
fi
# Accept only an exact, supported model name.  A substring match is not enough:
# a value such as "xlenet" would pass it and then match none of the per-model
# branches below, so the script would silently do nothing.
case "$MODEL" in
  lenet | vgg | inception-v1 | perf) ;;
  *)
    echo "ERROR: model must be one of lenet, vgg, inception-v1 or perf" >&2
    exit 1
    ;;
esac
#######################################
# Validate the batch settings required by the inception-v1 model.
# Globals:  BATCH_SIZE, NODES, CORES (read)
# Outputs:  an error message on stderr when validation fails
# Returns:  0 when BATCH_SIZE is a positive multiple of NODES*CORES,
#           1 otherwise
#######################################
check_inception_batch_size() {
  local value
  if [[ -z "$BATCH_SIZE" ]]; then
    echo "ERROR: inceptionv1 model need to set batch_size" >&2
    return 1
  fi
  # Reject empty, zero or non-numeric values up front; otherwise the modulo
  # below fails with a division by zero and the check is silently skipped.
  for value in "$BATCH_SIZE" "$NODES" "$CORES"; do
    if ! [[ "$value" =~ ^[1-9][0-9]*$ ]]; then
      echo "ERROR: batch_size, nodes and cores must be positive integers" >&2
      return 1
    fi
  done
  if ((BATCH_SIZE % (NODES * CORES) != 0)); then
    echo "ERROR: batch_size must be a multiple of 'nodes*cores'" >&2
    return 1
  fi
  return 0
}

# Pick the checkpoint directory name and make the input data available for the
# selected model.  The perf model needs neither and is not handled here.
if [[ "$MODEL" == "lenet" ]]; then
  MODEL_DIR=lenet_model
  DATA_DIR=$DATA_DIR_MNIST
  if [[ -d "$DATA_DIR" ]]; then
    echo "Using existing data dir $DATA_DIR"
  else
    echo "Downloading mnist from http://yann.lecun.com/exdb/mnist/ ..."
    mkdir "$DATA_DIR" || exit 1
    for mnist_file in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte; do
      # The subshell keeps the script's own working directory unchanged.
      if ! (cd "$DATA_DIR" && wget "http://yann.lecun.com/exdb/mnist/$mnist_file.gz" && gunzip "$mnist_file.gz"); then
        # A half-filled directory would be taken for a complete data set on
        # the next run, so tell the user to clear it.
        echo "ERROR: failed to download $mnist_file.gz; remove $DATA_DIR before retrying" >&2
        exit 1
      fi
    done
  fi
elif [[ "$MODEL" == "vgg" ]]; then
  MODEL_DIR=vgg_model
  DATA_DIR=$DATA_DIR_CIFAR
  if [[ -d "$DATA_DIR" ]]; then
    echo "Using existing data dir $DATA_DIR"
  else
    echo "Downloading cifar-10 from https://www.cs.toronto.edu/~kriz/cifar.html ..."
    if ! { wget "https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz" && tar -xzf cifar-10-binary.tar.gz; }; then
      echo "ERROR: failed to download or unpack cifar-10-binary.tar.gz" >&2
      exit 1
    fi
  fi
elif [[ "$MODEL" == "inception-v1" ]]; then
  MODEL_DIR=inception_model
  check_inception_batch_size || exit 1
  DATA_DIR=$DATA_DIR_IMAGENET
  if [[ -d "$DATA_DIR/train" && -d "$DATA_DIR/val" ]]; then
    # A local copy of the data set exists: push it to HDFS.
    echo "Using existing data dir $DATA_DIR"
    if [[ -z "$HDFS_DATA_DIR" ]]; then
      echo "ERROR: hdfs-data-dir must be set to upload $DATA_DIR to HDFS" >&2
      exit 1
    fi
    # Host name out of hdfs://host:port/path (third '/'-separated field,
    # with the port stripped).
    HDFS_HOST=$(printf '%s\n' "$HDFS_DATA_DIR" | awk -F '/' '{print $3}' | awk -F ':' '{print $1}')
    HADOOP_HOME=$HOME/hadoop-2.7.3
    # Check for and unpack into the location that is actually used below;
    # testing ./hadoop-2.7.3 only works when the script is run from $HOME.
    if [[ ! -d "$HADOOP_HOME" ]]; then
      if ! { wget http://apache.claz.org/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz && tar -xzf hadoop-2.7.3.tar.gz -C "$HOME"; }; then
        echo "ERROR: failed to download or unpack hadoop-2.7.3" >&2
        exit 1
      fi
      printf '%s\n' \
        '<configuration>' \
        '<property>' \
        '<name>fs.defaultFS</name>' \
        "<value>hdfs://$HDFS_HOST:9000</value>" \
        '</property>' \
        '</configuration>' >"$HADOOP_HOME/etc/hadoop/core-site.xml"
    fi
    # Upload results are deliberately not treated as fatal, matching the
    # previous flow of this script.
    "$HADOOP_HOME/bin/hadoop" fs -mkdir -p "$HDFS_DATA_DIR"
    "$HADOOP_HOME/bin/hadoop" fs -copyFromLocal "$DATA_DIR"/* "$HDFS_DATA_DIR"
  else
    echo "Using HDFS data input: $HDFS_DATA_DIR"
  fi
fi
# Ensure the model output (checkpoint) directory exists before the job starts.
# Fall back to the script's default name "model" so an empty value is never
# mistaken for an existing directory.
: "${MODEL_DIR:=model}"
if [[ -d "$MODEL_DIR" ]]; then
  echo "Using existing model output dir $MODEL_DIR"
else
  echo "Making new dir for model output $MODEL_DIR"
  if ! mkdir -p "$MODEL_DIR"; then
    echo "ERROR: cannot create model output dir $MODEL_DIR" >&2
    exit 1
  fi
fi
#######################################
# Map the requested job class onto one of the two supported values.
# Arguments: $1 - value given with -c/--class (may be empty)
# Outputs:   "train" or "test" on stdout; a warning on stderr when a
#            non-empty unsupported value is replaced by "train"
#######################################
resolve_class() {
  case "$1" in
    train | test)
      printf '%s\n' "$1"
      ;;
    "")
      printf 'train\n'
      ;;
    *)
      # Exact match only: a value like "training" must not slip through, as
      # the dispatch below runs the Test job for anything that is not exactly
      # "train".
      echo "WARNING: unknown class '$1', using train" >&2
      printf 'train\n'
      ;;
  esac
}
CLASS=$(resolve_class "$CLASS")
# Go back to the directory the script was started from (captured in CURRENT)
# so the relative Spark, data and model paths resolve.
cd "${CURRENT:-.}" || exit 1
#######################################
# Check that every named variable holds a non-empty value.
# Arguments: names of the variables to check
# Outputs:   an error message on stderr for the first empty one
# Returns:   0 when all are set, 1 otherwise
#######################################
require_settings() {
  local var_name
  for var_name in "$@"; do
    if [[ -z "${!var_name}" ]]; then
      echo "ERROR: $var_name is not set; pass the matching command-line option (see --help)" >&2
      return 1
    fi
  done
  return 0
}

#######################################
# Build the spark-submit resource options shared by every job.
# Globals:  SPARK_URL, CORES, MEMORY, NODES (read); SPARK_ARGS (written)
# Outputs:  an error message on stderr when a setting is missing or invalid
# Returns:  0 on success, 1 otherwise
#######################################
build_submit_args() {
  require_settings SPARK_URL CORES MEMORY NODES || return 1
  # CORES * NODES is evaluated arithmetically, so both must be real numbers.
  if ! [[ "$CORES" =~ ^[1-9][0-9]*$ && "$NODES" =~ ^[1-9][0-9]*$ ]]; then
    echo "ERROR: cores and nodes must be positive integers" >&2
    return 1
  fi
  SPARK_ARGS=(
    --master "$SPARK_URL"
    --total-executor-cores "$((CORES * NODES))"
    --executor-cores "$CORES"
    --driver-cores "$CORES"
    --driver-memory "$MEMORY"
    --executor-memory "$MEMORY"
    --num-executors "$NODES"
  )
}

#######################################
# Launch a BigDL main class through the selected Spark distribution.
# Globals:   SPARK_DIR, SPARK_ARGS, BIGDL_JAR (read)
# Arguments: $1 - fully qualified main class; remaining - application options
# Returns:   the exit status of spark-submit
#######################################
submit_job() {
  local main_class=$1
  shift
  "./$SPARK_DIR/bin/spark-submit" "${SPARK_ARGS[@]}" --class "$main_class" "$BIGDL_JAR" "$@"
}

# Dispatch on model and job class.  The five launches previously repeated the
# same resource options (in two different orders); they now share one list.
if [[ "$MODEL" == "lenet" || "$MODEL" == "vgg" ]]; then
  build_submit_args || exit 1
  require_settings BATCH_SIZE || exit 1
  if [[ "$CLASS" == "train" ]]; then
    submit_job "com.intel.analytics.bigdl.models.$MODEL.Train" \
      -f "$DATA_DIR/" -b "$BATCH_SIZE" --maxEpoch "$MAX_EPOCH" --overWrite --checkpoint "$MODEL_DIR"
  else
    require_settings TRAINED_MODEL || exit 1
    submit_job "com.intel.analytics.bigdl.models.$MODEL.Test" \
      -f "$DATA_DIR/" --model "$TRAINED_MODEL" -b "$BATCH_SIZE"
  fi
elif [[ "$MODEL" == "inception-v1" ]]; then
  build_submit_args || exit 1
  require_settings BATCH_SIZE || exit 1
  # Only the inception jobs also put the BigDL jar on the driver class path.
  SPARK_ARGS+=(--driver-class-path "$BIGDL_JAR")
  if [[ "$CLASS" == "train" ]]; then
    require_settings HDFS_DATA_DIR || exit 1
    submit_job com.intel.analytics.bigdl.models.inception.TrainInceptionV1 \
      --batchSize "$BATCH_SIZE" --maxEpoch "$MAX_EPOCH" --overWrite --learningRate "$LEARNING_RATE" -f "$HDFS_DATA_DIR" --checkpoint "$MODEL_DIR"
  else
    # NOTE(review): unlike the lenet/vgg test job, no --model/TRAINED_MODEL is
    # passed here - confirm that is intended.
    submit_job com.intel.analytics.bigdl.models.inception.Test \
      --batchSize "$BATCH_SIZE" -f "$HDFS_DATA_DIR/val"
  fi
elif [[ "$MODEL" == "perf" ]]; then
  build_submit_args || exit 1
  submit_job com.intel.analytics.bigdl.models.utils.DistriOptimizerPerf \
    -m inception_v1 --maxEpoch "$MAX_EPOCH"
fi