Skip to content

Commit

Permalink
Merge pull request PaddlePaddle#1227 from helinwang/k8s_aws
Browse files Browse the repository at this point in the history
paddle on aws with kubernetes tutorial now works
  • Loading branch information
helinwang authored Jan 26, 2017
2 parents b1f09f2 + 17867fb commit f1a8f7a
Show file tree
Hide file tree
Showing 12 changed files with 331 additions and 362 deletions.
559 changes: 255 additions & 304 deletions doc/howto/usage/k8s/k8s_aws_en.md

Large diffs are not rendered by default.

Binary file modified doc/howto/usage/k8s/src/add_security_group.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified doc/howto/usage/k8s/src/create_efs.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
43 changes: 0 additions & 43 deletions doc/howto/usage/k8s/src/job.yaml

This file was deleted.

7 changes: 7 additions & 0 deletions doc/howto/usage/k8s/src/k8s_data/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Image that prepares the quick_start demo data (see get_data.sh).
FROM alpine

# coreutils provides GNU split (`--number=l/N`), which get_data.sh relies on.
# --no-cache fetches the package index on the fly so it is not left in the layer.
RUN apk upgrade --no-cache && apk add --no-cache coreutils
# COPY is preferred over ADD for plain local files: no implicit archive
# extraction or URL handling.
COPY quick_start /quick_start
COPY get_data.sh /bin/
RUN chmod +x /bin/get_data.sh
ENTRYPOINT ["/bin/get_data.sh"]
6 changes: 6 additions & 0 deletions doc/howto/usage/k8s/src/k8s_data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
To build the PaddlePaddle data preparation image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run the following commands:

```
cp -r ../../../../../../demo/quick_start .
docker build . -t prepare-data-image-name
```
26 changes: 26 additions & 0 deletions doc/howto/usage/k8s/src/k8s_data/get_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/sh
#
# Prepare the quick_start demo data for distributed training.
#
# Copies /quick_start into $OUT_DIR, downloads the preprocessed data set into
# $OUT_DIR/0/data, splits train.txt into $SPLIT_COUNT line-aligned chunks and
# creates one $OUT_DIR/<i>/data directory per chunk, each with its own
# train.txt.
#
# Environment:
#   OUT_DIR      directory that receives the prepared data
#   SPLIT_COUNT  number of chunks to split train.txt into

set -e

# Fail early with a clear message instead of writing into "/" or handing an
# empty chunk count to split.
out_dir=${OUT_DIR:?OUT_DIR must be set}
split_count=${SPLIT_COUNT:?SPLIT_COUNT must be set}

mkdir -p "$out_dir"
cp -r /quick_start "$out_dir/"

mkdir -p "$out_dir/0/data"
cd "$out_dir/0/data"
wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
tar zxvf preprocessed_data.tar.gz
rm preprocessed_data.tar.gz

# l/N splits into N chunks without breaking lines; -d -a 5 names the chunks
# train.00000, train.00001, ...
split -d --number=l/"$split_count" -a 5 train.txt train.
mv train.00000 train.txt

cd "$out_dir"
# Shell arithmetic instead of expr: expr exits with status 1 when the result
# is 0, which would abort the script under `set -e` when SPLIT_COUNT is 1.
end=$((split_count - 1))
for i in $(seq 1 "$end"); do
    mkdir -p "$i/data"
    cp -r 0/data/* "$i/data"
    # Chunk i becomes this directory's train.txt.
    mv "$i/data/train.$(printf %05d "$i")" "$i/data/train.txt"
done
6 changes: 6 additions & 0 deletions doc/howto/usage/k8s/src/k8s_train/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Training image: the PaddlePaddle CPU image plus the job start-up scripts.
FROM paddledev/paddle:cpu-latest

COPY start.sh /root/
COPY start_paddle.py /root/
RUN chmod +x /root/start.sh
# Exec-form arguments are passed verbatim, so the option must be exactly "-c";
# with a leading space bash would treat " -c" as a script filename.
CMD ["bash", "-c", "/root/start.sh"]
5 changes: 5 additions & 0 deletions doc/howto/usage/k8s/src/k8s_train/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
To build the PaddlePaddle training image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run the following command:

```
docker build . -t train-image-name
```
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
#!/bin/sh

# Copy the job's training configuration under /root and run start_paddle.py.
# -e: abort on the first failing command; -u: treat unset variables as errors.
set -eu

# Training config location: $JOB_PATH/$JOB_NAME/$TRAIN_CONFIG_DIR.
jobconfig=${JOB_PATH}"/"${JOB_NAME}"/"${TRAIN_CONFIG_DIR}
cd /root
# NOTE(review): the config is copied twice below (whole directory, then its
# contents), and --ports_num_for_sparse, --trainer_count and --config each
# appear twice in the command. This looks like old and new revisions shown
# together; confirm which lines are intended.
cp -rf $jobconfig .
cd $TRAIN_CONFIG_DIR

cp -rf $jobconfig/* .

# Port counts and trainer count are read from the environment
# (CONF_PADDLE_PORTS_NUM, CONF_PADDLE_PORTS_NUM_SPARSE, TRAINER_COUNT).
python /root/start_paddle.py \
--dot_period=10 \
--ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM \
--ports_num=$CONF_PADDLE_PORTS_NUM \
--ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM_SPARSE \
--log_period=50 \
--num_passes=10 \
--trainer_count=4 \
--trainer_count=$TRAINER_COUNT \
--saving_period=1 \
--local=0 \
--config=./trainer_config.py \
--config=trainer_config.lr.py \
--use_gpu=0
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
API = "/api/v1/namespaces/"
JOBSELECTOR = "labelSelector=job-name="
JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
JOB_PATH_DATA = JOB_PATH + "/data"
JOB_PATH_OUTPUT = JOB_PATH + "/output"
JOBNAME = os.getenv("JOB_NAME")
NAMESPACE = os.getenv("JOB_NAMESPACE")
Expand All @@ -33,6 +32,8 @@
PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")

tokenpath = '/var/run/secrets/kubernetes.io/serviceaccount/token'


def refine_unknown_args(cmd_args):
'''
Expand Down Expand Up @@ -64,6 +65,7 @@ def isPodAllRunning(podlist):
for pod in podlist["items"]:
if pod["status"]["phase"] == "Running":
running += 1
print "waiting for pods running, require:", require, "running:", running
if require == running:
return True
return False
Expand All @@ -79,8 +81,17 @@ def getPodList():

pod = API + NAMESPACE + "/pods?"
job = JOBNAME
return requests.get(apiserver + pod + JOBSELECTOR + job,
verify=False).json()
if os.path.isfile(tokenpath):
tokenfile = open(tokenpath, mode='r')
token = tokenfile.read()
Bearer = "Bearer " + token
headers = {"Authorization": Bearer}
return requests.get(apiserver + pod + JOBSELECTOR + job,
headers=headers,
verify=False).json()
else:
return requests.get(apiserver + pod + JOBSELECTOR + job,
verify=False).json()


def getIdMap(podlist):
Expand Down Expand Up @@ -122,8 +133,8 @@ def startPaddle(idMap={}, train_args_dict=None):
if not os.path.exists(JOB_PATH_OUTPUT):
os.makedirs(JOB_PATH_OUTPUT)
os.mkdir(logDir)
copyCommand = 'cp -rf ' + JOB_PATH_DATA + \
"/" + str(trainerId) + " ./data"
copyCommand = 'cp -rf ' + JOB_PATH + \
"/" + str(trainerId) + "/data/*" + " ./data/"
os.system(copyCommand)
startPserver = 'nohup paddle pserver' + \
" --port=" + str(PADDLE_PORT) + \
Expand All @@ -136,9 +147,9 @@ def startPaddle(idMap={}, train_args_dict=None):
print startPserver
os.system(startPserver)
# wait until pservers completely start
time.sleep(10)
startTrainer = program + args + " > " + \
logDir + "/train.log 2>&1 < /dev/null"
time.sleep(20)
startTrainer = program + args + " 2>&1 | tee " + \
logDir + "/train.log"
print startTrainer
os.system(startTrainer)

Expand All @@ -152,7 +163,7 @@ def startPaddle(idMap={}, train_args_dict=None):
podlist = getPodList()
# need to wait until all pods are running
while not isPodAllRunning(podlist):
time.sleep(10)
time.sleep(20)
podlist = getPodList()
idMap = getIdMap(podlist)
startPaddle(idMap, train_args_dict)
Binary file added doc/howto/usage/k8s/src/worker_security_group.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit f1a8f7a

Please sign in to comment.