From d67c4d9559ae766bb08e411617a278e7f1f6d294 Mon Sep 17 00:00:00 2001 From: wangyang0918 Date: Fri, 5 Mar 2021 17:29:11 +0800 Subject: [PATCH] [FLINK-21382][docs] Update documentation for standalone Flink on Kubernetes with standby JobManagers This closes #15248. --- .../standalone/kubernetes.md | 133 ++++++++++++++++- .../standalone/kubernetes.md | 135 +++++++++++++++++- 2 files changed, 261 insertions(+), 7 deletions(-) diff --git a/docs/content.zh/docs/deployment/resource-providers/standalone/kubernetes.md b/docs/content.zh/docs/deployment/resource-providers/standalone/kubernetes.md index 9731762d7703f..84c53ac616d80 100644 --- a/docs/content.zh/docs/deployment/resource-providers/standalone/kubernetes.md +++ b/docs/content.zh/docs/deployment/resource-providers/standalone/kubernetes.md @@ -218,6 +218,15 @@ data: Moreover, you have to start the JobManager and TaskManager pods with a service account which has the permissions to create, edit, delete ConfigMaps. See [how to configure service accounts for pods](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/) for more information. +When High-Availability is enabled, Flink will use its own HA-services for service discovery. +Therefore, JobManager pods should be started with their IP address instead of a Kubernetes service as its `jobmanager.rpc.address`. +Refer to the [appendix](#appendix) for full configuration. + +#### Standby JobManagers + +Usually, it is enough to only start a single JobManager pod, because Kubernetes will restart it once the pod crashes. +If you want to achieve faster recovery, configure the `replicas` in `jobmanager-session-deployment-ha.yaml` or `parallelism` in `jobmanager-application-ha.yaml` to a value greater than `1` to start standby JobManagers. 
+ ### Enabling Queryable State You can access the queryable state of TaskManager if you create a `NodePort` service for it: @@ -296,7 +305,7 @@ data: logger.netty.level = OFF ``` -`jobmanager-service.yaml` +`jobmanager-service.yaml` Optional service, which is only necessary for non-HA mode. ```yaml apiVersion: v1 kind: Service @@ -354,7 +363,7 @@ spec: ### Session cluster resource definitions -`jobmanager-session-deployment.yaml` +`jobmanager-session-deployment-non-ha.yaml` ```yaml apiVersion: apps/v1 kind: Deployment @@ -404,6 +413,64 @@ spec: path: log4j-console.properties ``` +`jobmanager-session-deployment-ha.yaml` +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flink-jobmanager +spec: + replicas: 1 # Set the value to greater than 1 to start standby JobManagers + selector: + matchLabels: + app: flink + component: jobmanager + template: + metadata: + labels: + app: flink + component: jobmanager + spec: + containers: + - name: jobmanager + image: apache/flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} + env: + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + # The following args overwrite the value of jobmanager.rpc.address configured in the configuration config map to POD_IP. 
+ args: ["jobmanager", "$(POD_IP)"] + ports: + - containerPort: 6123 + name: rpc + - containerPort: 6124 + name: blob-server + - containerPort: 8081 + name: webui + livenessProbe: + tcpSocket: + port: 6123 + initialDelaySeconds: 30 + periodSeconds: 60 + volumeMounts: + - name: flink-config-volume + mountPath: /opt/flink/conf + securityContext: + runAsUser: 9999 # refers to user _flink_ from official flink image, change if necessary + serviceAccountName: flink-service-account # Service account which has the permissions to create, edit, delete ConfigMaps + volumes: + - name: flink-config-volume + configMap: + name: flink-config + items: + - key: flink-conf.yaml + path: flink-conf.yaml + - key: log4j-console.properties + path: log4j-console.properties +``` + `taskmanager-session-deployment.yaml` ```yaml apiVersion: apps/v1 @@ -454,7 +521,7 @@ spec: ### Application cluster resource definitions -`jobmanager-application.yaml` +`jobmanager-application-non-ha.yaml` ```yaml apiVersion: batch/v1 kind: Job @@ -506,6 +573,66 @@ spec: path: /host/path/to/job/artifacts ``` +`jobmanager-application-ha.yaml` +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: flink-jobmanager +spec: + parallelism: 1 # Set the value to greater than 1 to start standby JobManagers + template: + metadata: + labels: + app: flink + component: jobmanager + spec: + restartPolicy: OnFailure + containers: + - name: jobmanager + image: apache/flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} + env: + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + # The following args overwrite the value of jobmanager.rpc.address configured in the configuration config map to POD_IP. 
+ args: ["standalone-job", "--host", "$(POD_IP)", "--job-classname", "com.job.ClassName", <optional arguments>, <job arguments>] # optional arguments: ["--job-id", "<job id>", "--fromSavepoint", "/path/to/savepoint", "--allowNonRestoredState"] + ports: + - containerPort: 6123 + name: rpc + - containerPort: 6124 + name: blob-server + - containerPort: 8081 + name: webui + livenessProbe: + tcpSocket: + port: 6123 + initialDelaySeconds: 30 + periodSeconds: 60 + volumeMounts: + - name: flink-config-volume + mountPath: /opt/flink/conf + - name: job-artifacts-volume + mountPath: /opt/flink/usrlib + securityContext: + runAsUser: 9999 # refers to user _flink_ from official flink image, change if necessary + serviceAccountName: flink-service-account # Service account which has the permissions to create, edit, delete ConfigMaps + volumes: + - name: flink-config-volume + configMap: + name: flink-config + items: + - key: flink-conf.yaml + path: flink-conf.yaml + - key: log4j-console.properties + path: log4j-console.properties + - name: job-artifacts-volume + hostPath: + path: /host/path/to/job/artifacts +``` + `taskmanager-job-deployment.yaml` ```yaml apiVersion: apps/v1 diff --git a/docs/content/docs/deployment/resource-providers/standalone/kubernetes.md b/docs/content/docs/deployment/resource-providers/standalone/kubernetes.md index c92775078780d..5fa36f61724ea 100644 --- a/docs/content/docs/deployment/resource-providers/standalone/kubernetes.md +++ b/docs/content/docs/deployment/resource-providers/standalone/kubernetes.md @@ -192,7 +192,7 @@ For high availability on Kubernetes, you can use the [existing high availability #### Kubernetes High-Availability Services -Session Mode and Application Mode clusters support using the [Kubernetes high availability service]({{< ref "docs/deployment/ha/kubernetes_ha" >}}). +Session Mode and Application Mode clusters support using the [Kubernetes high availability service]({{< ref "docs/deployment/ha/kubernetes_ha" >}}). 
You need to add the following Flink config options to [flink-configuration-configmap.yaml](#common-cluster-resource-definitions). Note The filesystem which corresponds to the scheme of your configured HA storage directory must be available to the runtime. Refer to [custom Flink image]({{< ref "docs/deployment/resource-providers/standalone/docker" >}}#advanced-customization) and [enable plugins]({{< ref "docs/deployment/resource-providers/standalone/docker" >}}#using-filesystem-plugins) for more information. @@ -218,6 +218,15 @@ data: Moreover, you have to start the JobManager and TaskManager pods with a service account which has the permissions to create, edit, delete ConfigMaps. See [how to configure service accounts for pods](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/) for more information. +When High-Availability is enabled, Flink will use its own HA-services for service discovery. +Therefore, each JobManager pod should be started with its own IP address, instead of a Kubernetes service, as its `jobmanager.rpc.address`. +Refer to the [appendix](#appendix) for full configuration. + +#### Standby JobManagers + +Usually, it is enough to only start a single JobManager pod, because Kubernetes will restart it once the pod crashes. +If you want to achieve faster recovery, configure the `replicas` in `jobmanager-session-deployment-ha.yaml` or `parallelism` in `jobmanager-application-ha.yaml` to a value greater than `1` to start standby JobManagers. + ### Enabling Queryable State You can access the queryable state of TaskManager if you create a `NodePort` service for it: @@ -296,7 +305,7 @@ data: logger.netty.level = OFF ``` -`jobmanager-service.yaml` +`jobmanager-service.yaml` Optional service, which is only necessary for non-HA mode. 
```yaml apiVersion: v1 kind: Service @@ -354,7 +363,7 @@ spec: ### Session cluster resource definitions -`jobmanager-session-deployment.yaml` +`jobmanager-session-deployment-non-ha.yaml` ```yaml apiVersion: apps/v1 kind: Deployment @@ -404,6 +413,64 @@ spec: path: log4j-console.properties ``` +`jobmanager-session-deployment-ha.yaml` +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flink-jobmanager +spec: + replicas: 1 # Set the value to greater than 1 to start standby JobManagers + selector: + matchLabels: + app: flink + component: jobmanager + template: + metadata: + labels: + app: flink + component: jobmanager + spec: + containers: + - name: jobmanager + image: apache/flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} + env: + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + # The following args overwrite the value of jobmanager.rpc.address configured in the configuration config map to POD_IP. 
+ args: ["jobmanager", "$(POD_IP)"] + ports: + - containerPort: 6123 + name: rpc + - containerPort: 6124 + name: blob-server + - containerPort: 8081 + name: webui + livenessProbe: + tcpSocket: + port: 6123 + initialDelaySeconds: 30 + periodSeconds: 60 + volumeMounts: + - name: flink-config-volume + mountPath: /opt/flink/conf + securityContext: + runAsUser: 9999 # refers to user _flink_ from official flink image, change if necessary + serviceAccountName: flink-service-account # Service account which has the permissions to create, edit, delete ConfigMaps + volumes: + - name: flink-config-volume + configMap: + name: flink-config + items: + - key: flink-conf.yaml + path: flink-conf.yaml + - key: log4j-console.properties + path: log4j-console.properties +``` + `taskmanager-session-deployment.yaml` ```yaml apiVersion: apps/v1 @@ -454,7 +521,7 @@ spec: ### Application cluster resource definitions -`jobmanager-application.yaml` +`jobmanager-application-non-ha.yaml` ```yaml apiVersion: batch/v1 kind: Job @@ -506,6 +573,66 @@ spec: path: /host/path/to/job/artifacts ``` +`jobmanager-application-ha.yaml` +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: flink-jobmanager +spec: + parallelism: 1 # Set the value to greater than 1 to start standby JobManagers + template: + metadata: + labels: + app: flink + component: jobmanager + spec: + restartPolicy: OnFailure + containers: + - name: jobmanager + image: apache/flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} + env: + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + # The following args overwrite the value of jobmanager.rpc.address configured in the configuration config map to POD_IP. 
+ args: ["standalone-job", "--host", "$(POD_IP)", "--job-classname", "com.job.ClassName", <optional arguments>, <job arguments>] # optional arguments: ["--job-id", "<job id>", "--fromSavepoint", "/path/to/savepoint", "--allowNonRestoredState"] + ports: + - containerPort: 6123 + name: rpc + - containerPort: 6124 + name: blob-server + - containerPort: 8081 + name: webui + livenessProbe: + tcpSocket: + port: 6123 + initialDelaySeconds: 30 + periodSeconds: 60 + volumeMounts: + - name: flink-config-volume + mountPath: /opt/flink/conf + - name: job-artifacts-volume + mountPath: /opt/flink/usrlib + securityContext: + runAsUser: 9999 # refers to user _flink_ from official flink image, change if necessary + serviceAccountName: flink-service-account # Service account which has the permissions to create, edit, delete ConfigMaps + volumes: + - name: flink-config-volume + configMap: + name: flink-config + items: + - key: flink-conf.yaml + path: flink-conf.yaml + - key: log4j-console.properties + path: log4j-console.properties + - name: job-artifacts-volume + hostPath: + path: /host/path/to/job/artifacts +``` + `taskmanager-job-deployment.yaml` ```yaml apiVersion: apps/v1