Skip to content

Commit

Permalink
[tf][testnet] enable eks autoscaler
Browse files Browse the repository at this point in the history
  • Loading branch information
rustielin authored and aptos-bot committed May 5, 2022
1 parent ef8090b commit 768f544
Show file tree
Hide file tree
Showing 5 changed files with 285 additions and 1 deletion.
176 changes: 176 additions & 0 deletions terraform/helm/k8s-metrics/templates/autoscaler.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
{{- if .Values.autoscaler.enabled }}
# Service account used by the cluster-autoscaler Deployment in kube-system.
# Its annotations are taken from the chart value
# `autoscaler.serviceAccount.annotations`.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: cluster-autoscaler
  namespace: kube-system
  labels:
    k8s-addon: cluster-autoscaler.addons.k8s.io
    k8s-app: cluster-autoscaler
  annotations:
    {{- toYaml .Values.autoscaler.serviceAccount.annotations | nindent 4 }}

---
# Cluster-scoped permissions granted to the cluster-autoscaler service account
# (bound by the ClusterRoleBinding of the same name).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: cluster-autoscaler
  labels:
    k8s-addon: cluster-autoscaler.addons.k8s.io
    k8s-app: cluster-autoscaler
rules:
  # Core API group: events and endpoints.
  - apiGroups:
      - ""
    resources:
      - events
      - endpoints
    verbs:
      - create
      - patch
  # Pod eviction and pod status updates.
  - apiGroups:
      - ""
    resources:
      - pods/eviction
    verbs:
      - create
  - apiGroups:
      - ""
    resources:
      - pods/status
    verbs:
      - update
  # Read/update restricted to the endpoints object named "cluster-autoscaler".
  - apiGroups:
      - ""
    resources:
      - endpoints
    resourceNames:
      - cluster-autoscaler
    verbs:
      - get
      - update
  # Nodes: read plus update.
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - watch
      - list
      - get
      - update
  # Read-only access to core workload and storage objects.
  - apiGroups:
      - ""
    resources:
      - namespaces
      - pods
      - services
      - replicationcontrollers
      - persistentvolumeclaims
      - persistentvolumes
    verbs:
      - watch
      - list
      - get
  - apiGroups:
      - extensions
    resources:
      - replicasets
      - daemonsets
    verbs:
      - watch
      - list
      - get
  - apiGroups:
      - policy
    resources:
      - poddisruptionbudgets
    verbs:
      - watch
      - list
  - apiGroups:
      - apps
    resources:
      - statefulsets
      - replicasets
      - daemonsets
    verbs:
      - watch
      - list
      - get
  - apiGroups:
      - storage.k8s.io
    resources:
      - storageclasses
      - csinodes
      - csidrivers
      - csistoragecapacities
    verbs:
      - watch
      - list
      - get
  - apiGroups:
      - batch
      - extensions
    resources:
      - jobs
    verbs:
      - get
      - list
      - watch
      - patch
  # Leases: any lease may be created; get/update is restricted to the lease
  # named "cluster-autoscaler".
  - apiGroups:
      - coordination.k8s.io
    resources:
      - leases
    verbs:
      - create
  - apiGroups:
      - coordination.k8s.io
    resources:
      - leases
    resourceNames:
      - cluster-autoscaler
    verbs:
      - get
      - update
---
# Namespaced (kube-system) permissions on ConfigMaps for the cluster-autoscaler
# service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: cluster-autoscaler
  namespace: kube-system
  labels:
    k8s-addon: cluster-autoscaler.addons.k8s.io
    k8s-app: cluster-autoscaler
rules:
  # Any ConfigMap in the namespace may be created, listed and watched.
  - apiGroups:
      - ""
    resources:
      - configmaps
    verbs:
      - create
      - list
      - watch
  # Delete/get/update/watch is limited to the two named ConfigMaps below.
  - apiGroups:
      - ""
    resources:
      - configmaps
    resourceNames:
      - cluster-autoscaler-status
      - cluster-autoscaler-priority-expander
    verbs:
      - delete
      - get
      - update
      - watch

---
# Grants the cluster-autoscaler ClusterRole to the kube-system
# cluster-autoscaler service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: cluster-autoscaler
  labels:
    k8s-addon: cluster-autoscaler.addons.k8s.io
    k8s-app: cluster-autoscaler
subjects:
  - kind: ServiceAccount
    name: cluster-autoscaler
    namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-autoscaler

---
# Grants the kube-system cluster-autoscaler Role to the kube-system
# cluster-autoscaler service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: cluster-autoscaler
  namespace: kube-system
  labels:
    k8s-addon: cluster-autoscaler.addons.k8s.io
    k8s-app: cluster-autoscaler
subjects:
  - kind: ServiceAccount
    name: cluster-autoscaler
    namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: cluster-autoscaler

---
# Single-replica cluster-autoscaler Deployment in kube-system. It runs with the
# AWS cloud provider and discovers node groups through ASG tags that include
# the chart value `autoscaler.clusterName`.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: cluster-autoscaler
  namespace: kube-system
  labels:
    app: cluster-autoscaler
spec:
  replicas: 1
  selector:
    matchLabels:
      app: cluster-autoscaler
  template:
    metadata:
      labels:
        app: cluster-autoscaler
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8085"
        cluster-autoscaler.kubernetes.io/safe-to-evict: "false"
    spec:
      priorityClassName: system-cluster-critical
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
        fsGroup: 65534
      serviceAccountName: cluster-autoscaler
      containers:
        - name: cluster-autoscaler
          # Quoted so the rendered value is always parsed as a single string.
          image: "{{ .Values.autoscaler.image.repo }}:{{ .Values.autoscaler.image.tag }}"
          imagePullPolicy: "Always"
          # Resources are defined under the `autoscaler` key of the chart
          # values; there is no top-level `resources` value in this chart.
          resources:
            {{- toYaml .Values.autoscaler.resources | nindent 12 }}
          command:
            - ./cluster-autoscaler
            - --v=4
            - --stderrthreshold=info
            - --cloud-provider=aws
            - --skip-nodes-with-local-storage=false
            - --expander=least-waste
            - "--node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/{{ .Values.autoscaler.clusterName }}"
            - --balance-similar-node-groups
            - --skip-nodes-with-system-pods=false
          volumeMounts:
            - name: ssl-certs
              # The host CA bundle (/etc/ssl/certs/ca-bundle.crt on Amazon Linux
              # worker nodes, see the `ssl-certs` volume) is exposed inside the
              # container at the ca-certificates.crt path.
              mountPath: /etc/ssl/certs/ca-certificates.crt
              readOnly: true
      volumes:
        - name: ssl-certs
          hostPath:
            path: "/etc/ssl/certs/ca-bundle.crt"
{{- end }}
16 changes: 16 additions & 0 deletions terraform/helm/k8s-metrics/values.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
coredns:
  maxReplicas: 2
  minReplicas: 2

autoscaler:
  # The cluster-autoscaler manifests are rendered only when this is true.
  enabled: false
  # Cluster name interpolated into the ASG auto-discovery tag
  # (k8s.io/cluster-autoscaler/<clusterName>). Must be set when enabled.
  clusterName: ""
  image:
    repo: k8s.gcr.io/autoscaling/cluster-autoscaler
    tag: v1.21.0
  resources:
    limits:
      cpu: 100m
      memory: 600Mi
    requests:
      cpu: 100m
      memory: 600Mi
  serviceAccount:
    # Annotations added to the cluster-autoscaler ServiceAccount.
    annotations: {}
87 changes: 87 additions & 0 deletions terraform/testnet/addons.tf
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,93 @@ resource "helm_release" "metrics-server" {
maxReplicas = var.num_validators
minReplicas = var.coredns_min_replicas
}
# Values for the chart's `autoscaler` section.
autoscaler = {
  enabled     = var.enable_cluster_autoscaler
  clusterName = data.aws_eks_cluster.aptos.name
  image = {
    # EKS does not report patch version
    tag = "v${data.aws_eks_cluster.aptos.version}.0"
  }
  serviceAccount = {
    annotations = {
      # aws_iam_role.cluster-autoscaler is created with `count`, so it has no
      # instance [0] when the autoscaler is disabled. Guard the index so the
      # configuration still evaluates in that case.
      "eks.amazonaws.com/role-arn" = var.enable_cluster_autoscaler ? aws_iam_role.cluster-autoscaler[0].arn : ""
    }
  }
}
})
]
}


# access control
# Trust policy for the cluster-autoscaler IAM role: allows
# sts:AssumeRoleWithWebIdentity through the validator module's OIDC provider.
# Only generated when the cluster autoscaler is enabled.
data "aws_iam_policy_document" "cluster-autoscaler-assume-role" {
  count = var.enable_cluster_autoscaler ? 1 : 0

  statement {
    actions = ["sts:AssumeRoleWithWebIdentity"]

    principals {
      type        = "Federated"
      identifiers = ["arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/${module.validator.oidc_provider}"]
    }

    # The token subject must be the kube-system cluster-autoscaler
    # service account.
    condition {
      test     = "StringEquals"
      variable = "${module.validator.oidc_provider}:sub"
      values   = ["system:serviceaccount:kube-system:cluster-autoscaler"]
    }

    # The token audience must be STS.
    condition {
      test     = "StringEquals"
      variable = "${module.validator.oidc_provider}:aud"
      values   = ["sts.amazonaws.com"]
    }
  }
}

# Permissions policy for the cluster-autoscaler IAM role.
# Only generated when the cluster autoscaler is enabled.
data "aws_iam_policy_document" "cluster-autoscaler" {
  count = var.enable_cluster_autoscaler ? 1 : 0

  # Scaling actions, allowed only on resources tagged
  # k8s.io/cluster-autoscaler/<cluster name> = "owned".
  statement {
    sid       = "Autoscaling"
    resources = ["*"]
    actions = [
      "autoscaling:SetDesiredCapacity",
      "autoscaling:TerminateInstanceInAutoScalingGroup",
    ]

    condition {
      test     = "StringEquals"
      variable = "aws:ResourceTag/k8s.io/cluster-autoscaler/${data.aws_eks_cluster.aptos.name}"
      values   = ["owned"]
    }
  }

  # Read-only describe calls, not restricted by tag.
  statement {
    sid       = "DescribeAutoscaling"
    resources = ["*"]
    actions = [
      "autoscaling:DescribeAutoScalingInstances",
      "autoscaling:DescribeAutoScalingGroups",
      "ec2:DescribeLaunchTemplateVersions",
      "autoscaling:DescribeTags",
      "autoscaling:DescribeLaunchConfigurations",
    ]
  }
}

# IAM role referenced by the cluster-autoscaler service account annotation.
# Created only when the cluster autoscaler is enabled; the name is scoped to
# the current Terraform workspace.
resource "aws_iam_role" "cluster-autoscaler" {
  count = var.enable_cluster_autoscaler ? 1 : 0

  name                 = "aptos-testnet-${terraform.workspace}-cluster-autoscaler"
  path                 = var.iam_path
  assume_role_policy   = data.aws_iam_policy_document.cluster-autoscaler-assume-role[0].json
  permissions_boundary = var.permissions_boundary_policy
}

# Inline policy that attaches the cluster-autoscaler permissions document to
# the cluster-autoscaler IAM role. Created only when the autoscaler is enabled.
resource "aws_iam_role_policy" "cluster-autoscaler" {
  count = var.enable_cluster_autoscaler ? 1 : 0

  name   = "Helm"
  role   = aws_iam_role.cluster-autoscaler[0].name
  policy = data.aws_iam_policy_document.cluster-autoscaler[0].json
}
5 changes: 5 additions & 0 deletions terraform/testnet/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,11 @@ variable "enable_k8s_metrics_server" {
default = false
}

# Toggles the cluster-autoscaler chart values and the IAM role/policies that
# are created with `count = var.enable_cluster_autoscaler ? 1 : 0`.
variable "enable_cluster_autoscaler" {
  description = "Enable cluster autoscaler: https://docs.aws.amazon.com/eks/latest/userguide/autoscaling.html"
  # Explicit type so a non-boolean value is rejected at plan time instead of
  # failing inside the `count` conditionals.
  type    = bool
  default = false
}

variable "coredns_min_replicas" {
description = "Minimal replica numbers for core dns"
default = 2
Expand Down
2 changes: 1 addition & 1 deletion terraform/validator/aws/cluster.tf
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ resource "aws_eks_node_group" "nodes" {

# Node group size bounds. `min_size` is set exactly once: a second assignment
# of the same argument is invalid HCL. It is fixed at 1; desired and maximum
# sizes still derive from the configured pool size.
scaling_config {
  desired_size = lookup(var.node_pool_sizes, each.key, each.value.size)
  min_size     = 1
  max_size     = lookup(var.node_pool_sizes, each.key, each.value.size) * var.max_node_pool_surge
}

Expand Down

0 comments on commit 768f544

Please sign in to comment.