forked from huggingface/transformers
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add setup for TPU CI to run every hour. (huggingface#6219)
* Add setup for TPU CI to run every hour. * Re-organize config.yml Co-authored-by: Lysandre <[email protected]>
- Loading branch information
1 parent
6695450
commit 1b8a7ff
Showing
6 changed files
with
253 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,67 @@ | ||
version: 2 | ||
version: 2.1 | ||
# Orbs that provide the gcp-gke/* and go/* steps used by the TPU jobs. | ||
# NOTE(review): the pinned versions were restored from a rendering that had | ||
# obfuscated everything after "circleci/" -- confirm them against the upstream config. | ||
orbs: | ||
gcp-gke: circleci/gcp-gke@1.0.4 | ||
go: circleci/go@1.3.0 | ||
|
||
# TPU REFERENCES | ||
references: | ||
# Reusable step: clone GoogleCloudPlatform/ml-testing-accelerators and check out a pinned commit. | ||
checkout_ml_testing: &checkout_ml_testing | ||
run: | ||
name: Checkout ml-testing-accelerators | ||
command: | | ||
git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git | ||
cd ml-testing-accelerators | ||
# Fetch one specific commit into a local branch named "stable", then switch to it. | ||
git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable | ||
git checkout stable | ||
# Reusable step: build the TPU test image from docker/transformers-pytorch-tpu and push it, | ||
# tagged with the CircleCI workflow job id. | ||
build_push_docker: &build_push_docker | ||
run: | ||
name: Configure Docker | ||
command: | | ||
gcloud --quiet auth configure-docker | ||
cd docker/transformers-pytorch-tpu | ||
# Build args common to every build; GITHUB_REF is only added for pull-request builds, | ||
# otherwise the Dockerfile's own default ref is used. | ||
build_args=(--build-arg "TEST_IMAGE=1") | ||
if [ -n "$CIRCLE_PR_NUMBER" ]; then build_args+=(--build-arg "GITHUB_REF=pull/$CIRCLE_PR_NUMBER/head"); fi | ||
# The trailing "." is the build context; docker build requires it in both cases. | ||
docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile "${build_args[@]}" . | ||
docker push "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" | ||
# Reusable step: render the Kubernetes job from the jsonnet template, create it with kubectl, | ||
# poll until it succeeds, fails, or the checks run out, print its logs, delete the pushed | ||
# image, and exit with the resulting status code (0 = succeeded, 1 = failed, 2 = not finished). | ||
deploy_cluster: &deploy_cluster | ||
run: | ||
name: Deploy the job on the kubernetes cluster | ||
command: | | ||
go get github.com/google/go-jsonnet/cmd/jsonnet && \ | ||
export PATH=$PATH:$HOME/go/bin && \ | ||
kubectl create -f docker/transformers-pytorch-tpu/dataset.yaml || true && \ | ||
# Capture the output of "kubectl create", then strip the "job.batch/" prefix and the | ||
# " created" suffix so that only the job name remains. | ||
job_name=$(jsonnet -J ml-testing-accelerators/ docker/transformers-pytorch-tpu/bert-base-cased.jsonnet --ext-str image=$GCR_IMAGE_PATH --ext-str image-tag=$CIRCLE_WORKFLOW_JOB_ID | kubectl create -f -) && \ | ||
job_name=${job_name#job.batch/} && \ | ||
job_name=${job_name% created} && \ | ||
echo "Waiting on kubernetes job: $job_name" && \ | ||
i=0 && \ | ||
# 30 checks spaced 30s apart = 900s total. | ||
max_checks=30 && \ | ||
status_code=2 && \ | ||
# Check on the job periodically. Set the status code depending on what | ||
# happened to the job in Kubernetes. If we try max_checks times and | ||
# still the job hasn't finished, give up and return the starting | ||
# non-zero status code. | ||
while [ $i -lt $max_checks ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \ | ||
echo "Done waiting. Job status code: $status_code" && \ | ||
# Allow time for logs to flush. | ||
sleep 60 && \ | ||
echo "JOB_NAME: $job_name" && \ | ||
gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID && \ | ||
echo "Done with log retrieval attempt." && \ | ||
gcloud container images delete "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" --force-delete-tags && \ | ||
exit $status_code | ||
# Reusable step: list and delete Kubernetes jobs whose AGE column contains an hour or day unit. | ||
delete_gke_jobs: &delete_gke_jobs | ||
run: | ||
name: Delete GKE Jobs | ||
command: | | ||
# Match jobs whose age matches patterns like '1h' or '1d', i.e. any job | ||
# that has been around longer than 1hr. First print all columns for | ||
# matches, then execute the delete. | ||
kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $0}' | ||
stale_jobs=$(kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $1}') | ||
# "kubectl delete job" with no names exits with an error, so only call it when | ||
# something matched. $stale_jobs is left unquoted on purpose: one word per job name. | ||
if [ -n "$stale_jobs" ]; then kubectl delete job $stale_jobs; else echo "No stale jobs to delete."; fi | ||
jobs: | ||
run_tests_torch_and_tf: | ||
working_directory: ~/transformers | ||
|
@@ -50,7 +113,6 @@ jobs: | |
- store_artifacts: | ||
path: ~/transformers/output.txt | ||
destination: test_output.txt | ||
|
||
run_tests_tf: | ||
working_directory: ~/transformers | ||
docker: | ||
|
@@ -193,6 +255,35 @@ jobs: | |
- checkout | ||
- run: pip install requests | ||
- run: python ./utils/link_tester.py | ||
|
||
# TPU JOBS | ||
# Checks out this repository and ml-testing-accelerators, authenticates against the GKE | ||
# cluster, builds and pushes the test image, then deploys the test job and waits for it. | ||
run_examples_tpu: | ||
docker: | ||
- image: circleci/python:3.6 | ||
environment: | ||
OMP_NUM_THREADS: 1 | ||
resource_class: xlarge | ||
parallelism: 1 | ||
steps: | ||
- checkout | ||
- go/install | ||
- *checkout_ml_testing | ||
- gcp-gke/install | ||
- gcp-gke/update-kubeconfig-with-credentials: | ||
cluster: $GKE_CLUSTER | ||
perform-login: true | ||
# A remote Docker engine is set up before the docker build/push step below. | ||
- setup_remote_docker | ||
- *build_push_docker | ||
- *deploy_cluster | ||
# Authenticates against the GKE cluster and runs the delete_gke_jobs step to remove old jobs. | ||
cleanup-gke-jobs: | ||
docker: | ||
- image: circleci/python:3.6 | ||
steps: | ||
- gcp-gke/install | ||
- gcp-gke/update-kubeconfig-with-credentials: | ||
cluster: $GKE_CLUSTER | ||
perform-login: true | ||
- *delete_gke_jobs | ||
workflow_filters: &workflow_filters | ||
filters: | ||
branches: | ||
|
@@ -211,3 +302,15 @@ workflows: | |
- run_tests_tf | ||
- build_doc | ||
- deploy_doc: *workflow_filters | ||
# Scheduled workflow that cleans up old GKE jobs and runs the TPU example tests on master. | ||
tpu_testing_jobs: | ||
triggers: | ||
- schedule: | ||
# Runs once a day at 08:00 (minute 0, hour 8); CircleCI evaluates cron schedules in UTC. | ||
# An hourly schedule would be "0 * * * *". | ||
cron: "0 8 * * *" | ||
filters: | ||
branches: | ||
only: | ||
- master | ||
jobs: | ||
- cleanup-gke-jobs | ||
- run_examples_tpu |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# Image used to run the transformers TPU example tests: Google Cloud SDK base image plus a | ||
# conda environment named "container". | ||
FROM google/cloud-sdk:slim | ||
|
||
# Build args. | ||
# GITHUB_REF is the git ref of huggingface/transformers that gets fetched and installed | ||
# further down; it defaults to the master branch. | ||
ARG GITHUB_REF=refs/heads/master | ||
|
||
# TODO: This Dockerfile installs pytorch/xla 3.6 wheels. There are also 3.7 | ||
# wheels available; see below. | ||
ENV PYTHON_VERSION=3.6 | ||
|
||
RUN apt-get update && apt-get install -y --no-install-recommends \ | ||
build-essential \ | ||
cmake \ | ||
git \ | ||
curl \ | ||
ca-certificates | ||
|
||
# Install conda and python. | ||
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385 | ||
RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \ | ||
chmod +x ~/miniconda.sh && \ | ||
~/miniconda.sh -b && \ | ||
rm ~/miniconda.sh | ||
|
||
# Put the conda binaries first on PATH for all following instructions. | ||
ENV PATH=/root/miniconda3/bin:$PATH | ||
|
||
RUN conda create -y --name container python=$PYTHON_VERSION | ||
|
||
# Run the rest of commands within the new conda env. | ||
# Use absolute path to appease Codefactor. | ||
SHELL ["/root/miniconda3/bin/conda", "run", "-n", "container", "/bin/bash", "-c"] | ||
RUN conda install -y python=$PYTHON_VERSION mkl | ||
|
||
# Remove any installed torch, then download, install and delete the nightly torch, torch_xla | ||
# and torchvision wheels from gs://tpu-pytorch/wheels, and finally install libomp5. | ||
# NOTE(review): the wheel names are single-quoted, and bash does not expand parameters | ||
# inside single quotes -- confirm that ${PYTHON_VERSION/./} is really expanded when these | ||
# commands run through the "conda run ... /bin/bash -c" SHELL. | ||
RUN pip uninstall -y torch && \ | ||
# Python 3.7 wheels are available. Replace cp36-cp36m with cp37-cp37m | ||
gsutil cp 'gs://tpu-pytorch/wheels/torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \ | ||
gsutil cp 'gs://tpu-pytorch/wheels/torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \ | ||
gsutil cp 'gs://tpu-pytorch/wheels/torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \ | ||
pip install 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \ | ||
pip install 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \ | ||
pip install 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \ | ||
rm 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \ | ||
rm 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \ | ||
rm 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \ | ||
apt-get install -y libomp5 | ||
|
||
# Point the dynamic loader at the libraries of the "container" conda env. The path must be | ||
# absolute; without the leading "/" it is resolved relative to the current working directory. | ||
ENV LD_LIBRARY_PATH=/root/miniconda3/envs/container/lib | ||
|
||
|
||
# Install huggingface/transformers at the current PR, plus dependencies. | ||
# GITHUB_REF is fetched into a local branch named "CI", which is then checked out. | ||
RUN git clone https://github.com/huggingface/transformers.git && \ | ||
cd transformers && \ | ||
git fetch origin $GITHUB_REF:CI && \ | ||
git checkout CI && \ | ||
cd .. && \ | ||
pip install ./transformers && \ | ||
pip install -r ./transformers/examples/requirements.txt && \ | ||
pip install pytest | ||
|
||
# Import both packages and print their versions as a build-time check. | ||
RUN python -c "import torch_xla; print(torch_xla.__version__)" | ||
RUN python -c "import transformers as trf; print(trf.__version__)" | ||
# NOTE(review): presumably needed so that the entrypoint's "source ~/.bashrc" makes | ||
# "conda activate" available -- confirm. | ||
RUN conda init bash | ||
COPY docker-entrypoint.sh /usr/local/bin/ | ||
RUN chmod +x /usr/local/bin/docker-entrypoint.sh | ||
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] | ||
CMD ["bash"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# Test definition that runs transformers/examples/test_xla_examples.py with pytest. | ||
# Requires the external variables 'image' and 'image-tag' to be supplied at render time. | ||
local base = import 'templates/base.libsonnet'; | ||
local tpus = import 'templates/tpus.libsonnet'; | ||
local utils = import "templates/utils.libsonnet"; | ||
local volumes = import "templates/volumes.libsonnet"; | ||
|
||
# Overrides on top of base.BaseTest for this test. | ||
local bertBaseCased = base.BaseTest { | ||
frameworkPrefix: "hf", | ||
modelName: "bert-base-cased", | ||
mode: "example", | ||
configMaps: [], | ||
|
||
timeout: 3600, # 1 hour, in seconds | ||
|
||
# Container image and tag, taken from the external variables. | ||
image: std.extVar('image'), | ||
imageTag: std.extVar('image-tag'), | ||
|
||
tpuSettings+: { | ||
softwareVersion: "pytorch-nightly", | ||
}, | ||
accelerator: tpus.v3_8, | ||
|
||
# Mount the "huggingface-cluster-disk" volume at /datasets. | ||
volumeMap+: { | ||
datasets: volumes.PersistentVolumeSpec { | ||
name: "huggingface-cluster-disk", | ||
mountPath: "/datasets", | ||
}, | ||
}, | ||
# The script saves pytest's exit code, prints a closing message, and then ends with a | ||
# test on the saved code so the script's final status reflects the pytest result. | ||
command: utils.scriptCommand( | ||
||| | ||
python -m pytest -s transformers/examples/test_xla_examples.py -v | ||
test_exit_code=$? | ||
echo "\nFinished running commands.\n" | ||
test $test_exit_code -eq 0 | ||
||| | ||
), | ||
}; | ||
|
||
# The value of this file is the test's oneshotJob field. | ||
bertBaseCased.oneshotJob |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# Read-only PersistentVolume backed by the GCE persistent disk "huggingface-cluster-disk", | ||
# pre-bound (via claimRef) to the claim defined in the second document below. | ||
apiVersion: v1 | ||
kind: PersistentVolume | ||
metadata: | ||
name: huggingface-cluster-disk | ||
spec: | ||
storageClassName: "" | ||
capacity: | ||
storage: 500Gi | ||
accessModes: | ||
- ReadOnlyMany | ||
claimRef: | ||
namespace: default | ||
name: huggingface-cluster-disk-claim | ||
gcePersistentDisk: | ||
pdName: huggingface-cluster-disk | ||
fsType: ext4 | ||
readOnly: true | ||
--- | ||
# Claim with the name the PersistentVolume's claimRef points at. | ||
apiVersion: v1 | ||
kind: PersistentVolumeClaim | ||
metadata: | ||
name: huggingface-cluster-disk-claim | ||
spec: | ||
# Specify "" as the storageClassName so it matches the PersistentVolume's StorageClass. | ||
# A nil storageClassName value uses the default StorageClass. For details, see | ||
# https://kubernetes.io/docs/concepts/storage/persistent-volumes/#class-1 | ||
storageClassName: "" | ||
accessModes: | ||
- ReadOnlyMany | ||
resources: | ||
requests: | ||
storage: 1Ki |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
#!/bin/bash | ||
# Container entrypoint: load the shell rc file, activate the "container" conda env, export | ||
# XRT_TPU_CONFIG from the TPU endpoint variable, then replace this shell with the command | ||
# passed as arguments. | ||
source ~/.bashrc | ||
echo "running docker-entrypoint.sh" | ||
conda activate container | ||
echo "$KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS" | ||
echo "printed TPU info" | ||
# ${VAR:7} drops the first 7 characters of the endpoint value | ||
# (presumably a "grpc://" scheme prefix -- confirm). | ||
export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}" | ||
# Run the requested command with its arguments passed through unchanged. | ||
exec "$@" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters