forked from apache/airflow
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDockerfile.ci
411 lines (359 loc) · 17.8 KB
/
Dockerfile.ci
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# WARNING: THIS DOCKERFILE IS NOT INTENDED FOR PRODUCTION USE OR DEPLOYMENT.
#
ARG PYTHON_BASE_IMAGE="python:3.7-slim-buster"
FROM ${PYTHON_BASE_IMAGE} as main
# Nolog bash flag is currently ignored - but you can replace it with other flags (for example
# xtrace - to show commands executed)
SHELL ["/bin/bash", "-o", "pipefail", "-o", "errexit", "-o", "nounset", "-o", "nolog", "-c"]
ARG PYTHON_BASE_IMAGE
ARG AIRFLOW_IMAGE_REPOSITORY="https://github.com/apache/airflow"
# By increasing this number we can do force build of all dependencies
ARG DEPENDENCIES_EPOCH_NUMBER="6"
# Make sure noninteractive debian install is used and language variables set
ENV PYTHON_BASE_IMAGE=${PYTHON_BASE_IMAGE} \
DEBIAN_FRONTEND=noninteractive LANGUAGE=C.UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
LC_CTYPE=C.UTF-8 LC_MESSAGES=C.UTF-8 \
DEPENDENCIES_EPOCH_NUMBER=${DEPENDENCIES_EPOCH_NUMBER} \
INSTALL_MYSQL_CLIENT="true" \
INSTALL_MSSQL_CLIENT="true" \
INSTALL_POSTGRES_CLIENT="true"
RUN echo "Base image version: ${PYTHON_BASE_IMAGE}"
ARG ADDITIONAL_DEV_APT_DEPS=""
ARG DEV_APT_COMMAND="\
curl --silent --fail --location https://deb.nodesource.com/setup_14.x | bash - \
&& curl --silent --fail https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add - >/dev/null 2>&1 \
&& echo 'deb https://dl.yarnpkg.com/debian/ stable main' > /etc/apt/sources.list.d/yarn.list"
ARG ADDITIONAL_DEV_APT_COMMAND=""
ARG ADDITIONAL_DEV_ENV_VARS=""
ENV DEV_APT_COMMAND=${DEV_APT_COMMAND} \
ADDITIONAL_DEV_APT_DEPS=${ADDITIONAL_DEV_APT_DEPS} \
ADDITIONAL_DEV_APT_COMMAND=${ADDITIONAL_DEV_APT_COMMAND}
# Install basic and additional apt dependencies
RUN apt-get update \
&& apt-get install --no-install-recommends -yqq apt-utils >/dev/null 2>&1 \
&& apt-get install -y --no-install-recommends curl gnupg2 \
&& mkdir -pv /usr/share/man/man1 \
&& mkdir -pv /usr/share/man/man7 \
&& export ${ADDITIONAL_DEV_ENV_VARS?} \
&& bash -o pipefail -o errexit -o nounset -o nolog -c "${DEV_APT_COMMAND}" \
&& bash -o pipefail -o errexit -o nounset -o nolog -c "${ADDITIONAL_DEV_APT_COMMAND}" \
&& apt-get update \
&& apt-get install -y --no-install-recommends \
apt-utils \
build-essential \
dirmngr \
dumb-init \
freetds-bin \
freetds-dev \
git \
graphviz \
gosu \
libffi-dev \
libldap2-dev \
libkrb5-dev \
libpq-dev \
libsasl2-2 \
libsasl2-dev \
libsasl2-modules \
libssl-dev \
libenchant-dev \
locales \
netcat \
nodejs \
rsync \
sasl2-bin \
sudo \
unixodbc \
unixodbc-dev \
yarn \
${ADDITIONAL_DEV_APT_DEPS} \
&& apt-get autoremove -yqq --purge \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Only copy mysql/mssql installation scripts for now - so that changing the other
# scripts which are needed much later will not invalidate the docker layer here.
COPY scripts/docker/install_mysql.sh scripts/docker/install_mssql.sh scripts/docker/install_postgres.sh /scripts/docker/
# We run scripts with bash here to make sure we can execute the scripts. Changing to +x might have an
# unexpected result - the cache for Dockerfiles might get invalidated in case the host system
# had different umask set and group x bit was not set. In Azure the bit might be not set at all.
# That also protects against AUFS Docker backen dproblem where changing the executable bit required sync
RUN bash /scripts/docker/install_mysql.sh prod \
&& bash /scripts/docker/install_mysql.sh dev \
&& bash /scripts/docker/install_mssql.sh \
&& bash /scripts/docker/install_postgres.sh dev \
&& adduser --gecos "First Last,RoomNumber,WorkPhone,HomePhone" --disabled-password \
--quiet "airflow" --home "/home/airflow" \
&& echo -e "airflow\nairflow" | passwd airflow 2>&1 \
&& echo "airflow ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/airflow \
&& chmod 0440 /etc/sudoers.d/airflow
ARG RUNTIME_APT_DEPS="\
libgcc-8-dev \
apt-transport-https \
bash-completion \
ca-certificates \
software-properties-common \
krb5-user \
krb5-user \
ldap-utils \
less \
lsb-release \
net-tools \
openssh-client \
openssh-server \
postgresql-client \
sqlite3 \
tmux \
unzip \
vim \
xxd"
# Install Helm
ARG HELM_VERSION="v3.6.3"
RUN SYSTEM=$(uname -s | tr '[:upper:]' '[:lower:]') \
&& HELM_URL="https://get.helm.sh/helm-${HELM_VERSION}-${SYSTEM}-amd64.tar.gz" \
&& curl --silent --location "${HELM_URL}" | tar -xz -O "${SYSTEM}"-amd64/helm > /usr/local/bin/helm \
&& chmod +x /usr/local/bin/helm
ARG ADDITIONAL_RUNTIME_APT_DEPS=""
ARG RUNTIME_APT_COMMAND=""
ARG ADDITIONAL_RUNTIME_APT_COMMAND=""
ARG ADDITIONAL_DEV_APT_ENV=""
ARG ADDITIONAL_RUNTIME_APT_ENV=""
ARG DOCKER_CLI_VERSION=19.03.9
ARG HOME=/root
ARG AIRFLOW_HOME=/root/airflow
ARG AIRFLOW_SOURCES=/opt/airflow
ENV RUNTIME_APT_DEP=${RUNTIME_APT_DEPS} \
ADDITIONAL_RUNTIME_APT_DEPS=${ADDITIONAL_RUNTIME_APT_DEPS} \
RUNTIME_APT_COMMAND=${RUNTIME_APT_COMMAND} \
ADDITIONAL_RUNTIME_APT_COMMAND=${ADDITIONAL_RUNTIME_APT_COMMAND}\
DOCKER_CLI_VERSION=${DOCKER_CLI_VERSION} \
HOME=${HOME} \
AIRFLOW_HOME=${AIRFLOW_HOME} \
AIRFLOW_SOURCES=${AIRFLOW_SOURCES}
# Note missing man directories on debian-buster
# https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=863199
RUN mkdir -pv /usr/share/man/man1 \
&& mkdir -pv /usr/share/man/man7 \
&& export ${ADDITIONAL_DEV_APT_ENV?} \
&& export ${ADDITIONAL_RUNTIME_APT_ENV?} \
&& bash -o pipefail -o errexit -o nounset -o nolog -c "${RUNTIME_APT_COMMAND}" \
&& bash -o pipefail -o errexit -o nounset -o nolog -c "${ADDITIONAL_RUNTIME_APT_COMMAND}" \
&& apt-get update \
&& apt-get install --no-install-recommends -y \
${RUNTIME_APT_DEPS} \
${ADDITIONAL_RUNTIME_APT_DEPS} \
&& apt-get autoremove -yqq --purge \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* \
&& curl --silent "https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_CLI_VERSION}.tgz" \
| tar -C /usr/bin --strip-components=1 -xvzf - docker/docker
WORKDIR ${AIRFLOW_SOURCES}
RUN mkdir -pv ${AIRFLOW_HOME} && \
mkdir -pv ${AIRFLOW_HOME}/dags && \
mkdir -pv ${AIRFLOW_HOME}/logs
ARG AIRFLOW_REPO=apache/airflow
ARG AIRFLOW_BRANCH=main
# Airflow Extras installed
ARG AIRFLOW_EXTRAS="all"
ARG ADDITIONAL_AIRFLOW_EXTRAS=""
# Allows to override constraints source
ARG CONSTRAINTS_GITHUB_REPOSITORY="apache/airflow"
ARG AIRFLOW_CONSTRAINTS="constraints"
ARG AIRFLOW_CONSTRAINTS_REFERENCE=""
ARG AIRFLOW_CONSTRAINTS_LOCATION=""
ARG DEFAULT_CONSTRAINTS_BRANCH="constraints-main"
# By changing the epoch we can force reinstalling Airflow and pip all dependencies
# It can also be overwritten manually by setting the AIRFLOW_CI_BUILD_EPOCH environment variable.
ARG AIRFLOW_CI_BUILD_EPOCH="3"
ARG AIRFLOW_PRE_CACHED_PIP_PACKAGES="true"
# By default in the image, we are installing all providers when installing from sources
ARG INSTALL_PROVIDERS_FROM_SOURCES="true"
ARG INSTALL_FROM_PYPI="true"
ARG AIRFLOW_PIP_VERSION=21.3.1
# Setup PIP
# By default PIP install run without cache to make image smaller
ARG PIP_NO_CACHE_DIR="true"
# By default PIP has progress bar but you can disable it.
ARG PIP_PROGRESS_BAR="on"
# Optimizing installation of Cassandra driver (in case there are no prebuilt wheels which is the
# case as of 20.04.2021 with Python 3.9
# Speeds up building the image - cassandra driver without CYTHON saves around 10 minutes
ARG CASS_DRIVER_NO_CYTHON="1"
# Build cassandra driver on multiple CPUs
ARG CASS_DRIVER_BUILD_CONCURRENCY="8"
ARG AIRFLOW_VERSION="2.3.0.dev"
ENV AIRFLOW_REPO=${AIRFLOW_REPO}\
AIRFLOW_BRANCH=${AIRFLOW_BRANCH} \
AIRFLOW_EXTRAS=${AIRFLOW_EXTRAS}${ADDITIONAL_AIRFLOW_EXTRAS:+,}${ADDITIONAL_AIRFLOW_EXTRAS} \
CONSTRAINTS_GITHUB_REPOSITORY=${CONSTRAINTS_GITHUB_REPOSITORY} \
AIRFLOW_CONSTRAINTS=${AIRFLOW_CONSTRAINTS} \
AIRFLOW_CONSTRAINTS_REFERENCE=${AIRFLOW_CONSTRAINTS_REFERENCE} \
AIRFLOW_CONSTRAINTS_LOCATION=${AIRFLOW_CONSTRAINTS_LOCATION} \
DEFAULT_CONSTRAINTS_BRANCH=${DEFAULT_CONSTRAINTS_BRANCH} \
AIRFLOW_CI_BUILD_EPOCH=${AIRFLOW_CI_BUILD_EPOCH} \
AIRFLOW_PRE_CACHED_PIP_PACKAGES=${AIRFLOW_PRE_CACHED_PIP_PACKAGES} \
INSTALL_PROVIDERS_FROM_SOURCES=${INSTALL_PROVIDERS_FROM_SOURCES} \
INSTALL_FROM_PYPI=${INSTALL_FROM_PYPI} \
AIRFLOW_VERSION=${AIRFLOW_VERSION} \
AIRFLOW_PIP_VERSION=${AIRFLOW_PIP_VERSION} \
# In the CI image we always:
# * install MySQL, MsSQL
# * install airflow from current sources, not from PyPI package
# * install airflow without `--user` flag
# * install airflow in editable mode
# * install always current version of airflow
INSTALL_MYSQL_CLIENT="true" \
INSTALL_MSSQL_CLIENT="true" \
INSTALL_POSTGRES_CLIENT="true" \
AIRFLOW_INSTALLATION_METHOD="." \
AIRFLOW_INSTALL_EDITABLE_FLAG="--editable" \
AIRFLOW_VERSION_SPECIFICATION="" \
PIP_NO_CACHE_DIR=${PIP_NO_CACHE_DIR} \
PIP_PROGRESS_BAR=${PIP_PROGRESS_BAR} \
CASS_DRIVER_BUILD_CONCURRENCY=${CASS_DRIVER_BUILD_CONCURRENCY} \
CASS_DRIVER_NO_CYTHON=${CASS_DRIVER_NO_CYTHON}
RUN echo "Airflow version: ${AIRFLOW_VERSION}"
# Those are additional constraints that are needed for some extras but we do not want to
# force them on the main Airflow package. Those limitations are:
# * certifi<2021.0.0: required by snowflake provider
# * dill<0.3.3 required by apache-beam
# * google-ads<14.0.1 required to prevent updating google-python-api>=2.0.0
ARG EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS="dill<0.3.3 certifi<2021.0.0 google-ads<14.0.1"
ARG UPGRADE_TO_NEWER_DEPENDENCIES="false"
ENV EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS=${EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS} \
UPGRADE_TO_NEWER_DEPENDENCIES=${UPGRADE_TO_NEWER_DEPENDENCIES}
# Copy all scripts required for installation - changing any of those should lead to
# rebuilding from here
COPY scripts/docker/install_pip_version.sh scripts/docker/install_airflow_dependencies_from_branch_tip.sh \
scripts/docker/common.sh \
/scripts/docker/
# We are first creating a venv where all python packages and .so binaries needed by those are
# installed.
# In case of CI builds we want to pre-install main version of airflow dependencies so that
# We do not have to always reinstall it from the scratch.
# And is automatically reinstalled from the scratch every time patch release of python gets released
# The Airflow (and providers in case INSTALL_PROVIDERS_FROM_SOURCES is "false")
# are uninstalled, only dependencies remain.
# the cache is only used when "upgrade to newer dependencies" is not set to automatically
# account for removed dependencies (we do not install them in the first place)
RUN echo -e "\n\e[32mThe 'Running pip as the root user' warnings below are not valid but we can't disable them :(\e[0m\n"; \
echo -e "\n\e[34mSee https://github.com/pypa/pip/issues/10556 for details.\e[0m\n" ; \
bash /scripts/docker/install_pip_version.sh; \
if [[ ${AIRFLOW_PRE_CACHED_PIP_PACKAGES} == "true" && \
${UPGRADE_TO_NEWER_DEPENDENCIES} == "false" ]]; then \
bash /scripts/docker/install_airflow_dependencies_from_branch_tip.sh; \
fi
# The PATH is needed for PIPX to find the tools installed
ENV PATH="/root/.local/bin:${PATH}"
COPY scripts/docker/install_pipx_tools.sh /scripts/docker/
# Install useful command line tools in their own virtualenv so that they do not clash with
# dependencies installed in Airflow
RUN bash /scripts/docker/install_pipx_tools.sh
# Copy package.json and yarn.lock to install node modules
# this way even if other static check files change, node modules will not need to be installed
# we want to keep node_modules so we can do this step separately from compiling assets
COPY airflow/www/package.json airflow/www/yarn.lock ${AIRFLOW_SOURCES}/airflow/www/
COPY scripts/docker/prepare_node_modules.sh /scripts/docker/
# Package JS/css for production
RUN bash /scripts/docker/prepare_node_modules.sh
# Copy all the needed www/ for assets compilation. Done as two separate COPY
# commands so as otherwise it copies the _contents_ of static/ in to www/
COPY airflow/www/webpack.config.js ${AIRFLOW_SOURCES}/airflow/www/
COPY airflow/www/static ${AIRFLOW_SOURCES}/airflow/www/static/
COPY scripts/docker/compile_www_assets.sh /scripts/docker/
# Build artifacts without removing temporary artifacts (we will need them for incremental changes)
# in build mode
RUN REMOVE_ARTIFACTS="false" BUILD_TYPE="build" bash /scripts/docker/compile_www_assets.sh
# Airflow sources change frequently but dependency configuration won't change that often
# We copy setup.py and other files needed to perform setup of dependencies
# So in case setup.py changes we can install latest dependencies required.
COPY setup.py ${AIRFLOW_SOURCES}/setup.py
COPY setup.cfg ${AIRFLOW_SOURCES}/setup.cfg
COPY airflow/__init__.py ${AIRFLOW_SOURCES}/airflow/__init__.py
COPY scripts/docker/install_airflow.sh /scripts/docker/
# The goal of this line is to install the dependencies from the most current setup.py from sources
# This will be usually incremental small set of packages in CI optimized build, so it will be very fast
# In non-CI optimized build this will install all dependencies before installing sources.
# Usually we will install versions based on the dependencies in setup.py and upgraded only if needed.
# But in cron job we will install latest versions matching setup.py to see if there is no breaking change
# and push the constraints if everything is successful
RUN if [[ ${INSTALL_FROM_PYPI} == "true" ]]; then \
bash /scripts/docker/install_airflow.sh; \
fi
COPY scripts/in_container/entrypoint_ci.sh /entrypoint
RUN chmod a+x /entrypoint
COPY scripts/docker/install_pip_version.sh scripts/docker/install_additional_dependencies.sh /scripts/docker/
# Additional python deps to install
ARG ADDITIONAL_PYTHON_DEPS=""
RUN bash /scripts/docker/install_pip_version.sh; \
if [[ -n "${ADDITIONAL_PYTHON_DEPS}" ]]; then \
bash /scripts/docker/install_additional_dependencies.sh; \
fi
# Install autocomplete for airflow
RUN if command -v airflow; then \
register-python-argcomplete airflow >> ~/.bashrc ; \
fi
# Install autocomplete for Kubectl
RUN echo "source /etc/bash_completion" >> ~/.bashrc
# We can copy everything here. The Context is filtered by dockerignore. This makes sure we are not
# copying over stuff that is accidentally generated or that we do not need (such as egg-info)
# if you want to add something that is missing and you expect to see it in the image you can
# add it with ! in .dockerignore next to the airflow, test etc. directories there
COPY . ${AIRFLOW_SOURCES}/
WORKDIR ${AIRFLOW_SOURCES}
ARG BUILD_ID
ARG COMMIT_SHA
ARG AIRFLOW_IMAGE_DATE_CREATED
ENV PATH="/files/bin/:/opt/airflow/scripts/in_container/bin/:${PATH}" \
GUNICORN_CMD_ARGS="--worker-tmp-dir /dev/shm/" \
BUILD_ID=${BUILD_ID} \
COMMIT_SHA=${COMMIT_SHA}
# This one is to workaround https://github.com/apache/airflow/issues/17546
# issue with /usr/lib/x86_64-linux-gnu/libstdc++.so.6: cannot allocate memory in static TLS block
# We do not yet a more "correct" solution to the problem but in order to avoid raising new issues
# by users of the prod image, we implement the workaround now.
# The side effect of this is slightly (in the range of 100s of milliseconds) slower load for any
# binary started and a little memory used for Heap allocated by initialization of libstdc++
# This overhead is not happening for binaries that already link dynamically libstdc++
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libstdc++.so.6"
# Link dumb-init for backwards compatibility (so that older images also work)
RUN ln -sf /usr/bin/dumb-init /usr/local/bin/dumb-init
EXPOSE 8080
LABEL org.apache.airflow.distro="debian" \
org.apache.airflow.module="airflow" \
org.apache.airflow.component="airflow" \
org.apache.airflow.image="airflow-ci" \
org.apache.airflow.version="${AIRFLOW_VERSION}" \
org.apache.airflow.uid="0" \
org.apache.airflow.gid="0" \
org.apache.airflow.build-id="${BUILD_ID}" \
org.apache.airflow.commit-sha="${COMMIT_SHA}" \
org.opencontainers.image.source="${AIRFLOW_IMAGE_REPOSITORY}" \
org.opencontainers.image.created="${AIRFLOW_IMAGE_DATE_CREATED}" \
org.opencontainers.image.authors="[email protected]" \
org.opencontainers.image.url="https://airflow.apache.org" \
org.opencontainers.image.documentation="https://github.com/apache/airflow/IMAGES.rst" \
org.opencontainers.image.source="https://github.com/apache/airflow" \
org.opencontainers.image.version="${AIRFLOW_VERSION}" \
org.opencontainers.image.revision="${COMMIT_SHA}" \
org.opencontainers.image.vendor="Apache Software Foundation" \
org.opencontainers.image.licenses="Apache-2.0" \
org.opencontainers.image.ref.name="airflow-ci-image" \
org.opencontainers.image.title="Continuous Integration Airflow Image" \
org.opencontainers.image.description="Installed Apache Airflow with Continuous Integration dependencies"
ENTRYPOINT ["/usr/bin/dumb-init", "--", "/entrypoint"]
CMD []