You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by da...@apache.org on 2023/07/06 17:26:19 UTC
[beam] branch master updated: Arc additional pools (#27369)
This is an automated email from the ASF dual-hosted git repository.
damccorm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push:
new 9e6420d9f90 Arc additional pools (#27369)
9e6420d9f90 is described below
commit 9e6420d9f905092c7636f65df964af0ee1cb5c26
Author: Vlado Djerek <20...@users.noreply.github.com>
AuthorDate: Thu Jul 6 19:26:12 2023 +0200
Arc additional pools (#27369)
* adds additional pools object
* adding disk size and removing old variables
* changed default labelset for additional pools
* updated beam.env
* formatting
* add gcloud login to readme and sync environment
---
.../gh-actions-self-hosted-runners/arc/README.md | 44 +++++++++--
.../arc/config/arc_autoscaler.tpl | 4 +-
.../arc/config/arc_deployment.tpl | 35 ++++++---
.../arc/environments/beam.env | 42 ++++++++---
.github/gh-actions-self-hosted-runners/arc/gke.tf | 53 +++++++++++--
.github/gh-actions-self-hosted-runners/arc/helm.tf | 2 +-
.../arc/kubernetes.tf | 30 ++++++--
.../arc/variables.tf | 87 +++++++++++++++-------
8 files changed, 232 insertions(+), 65 deletions(-)
diff --git a/.github/gh-actions-self-hosted-runners/arc/README.md b/.github/gh-actions-self-hosted-runners/arc/README.md
index 5be4a93b43a..e5055826d00 100644
--- a/.github/gh-actions-self-hosted-runners/arc/README.md
+++ b/.github/gh-actions-self-hosted-runners/arc/README.md
@@ -38,8 +38,15 @@ All are created in the step before
project_id = "PROJECT_ID" # google PROJECT_ID that you want to deploy in
region = "gcp_region" # GCP region for the network
zone = "europe-west3-c" # GCP zone for the nodes
-min_main_node_count = "1" # Minimal and initial node count for main pool
-max_main_node_count = "5" # Maximal node count for main pool
+main_runner = {
+ name = "main-runner" # Main runner pool name
+ machine_type = "e2-standard-16" # Main runner pool machine type
+ min_node_count = "1" # Main runner pool minimal node count
+ max_node_count = "5" # Main runner pool maximal node count
+ min_replicas = "5" # Min number of runner PODs in the main pool. Do not confuse with Nodes
+ max_replicas = "20" # Max number of runner PODs in the main pool. Do not confuse with Nodes
+ webhook_scaling = "false" # Enable webhook scaling for main pool
+}
environment = "environment_name" # Name of the environment. Used as a prefix like dev- stag- anything-
ingress_domain = "fqdn" # FQDN for webhook ingress
organization = "org" # Github Organization to use runners in
@@ -48,15 +55,40 @@ github_app_id_secret_name = "app_id_secret_name" # Google secret na
github_app_install_id_secret_name = "install_id_secret_name" # Google secret name for install_id
github_private_key_secret_name = "pem_file_secret_name" # Google secret name for pem file
deploy_webhook = "false" # Terraform to deploy the scaling webhook
-max_main_replicas = "2" # Max number of runner PODs . Do not confuse with Nodes
-min_main_replicas = "1" # Min number of runner PODs . Do not confuse with Nodes
-webhook_scaling = "false" # Enable webhook scaling. When disabled runner busy percentage is used
#state_bucket_name = "state_bucket_name" # Not used by terraform. This is just to reference what bucket is used for others
```
+If you want to create additional pools, you can use the `additional_runner_pools` variable, which is a list of objects. Example:
+```
+additional_runner_pools = [
+{
+name = "test-runner" # Pool name
+machine_type = "e2-standard-2" # Machine type for the pool
+min_node_count = 1 # Minimal node count
+max_node_count = 2 # Maximal node count
+min_replicas = 1 # Minimal replica count
+max_replicas = 2 # Maximal replica count
+webhook_scaling = true # Enable webhook based scaling
+runner_image = "gcr.io/someimage:sometag" # Image to use
+labels = ["self-hosted", "testrunner"] # Label set for runner pool. Used in `runs-on`
+enable_selector = "true" # Enables NodeSelector, forcing runners to this pool
+enable_taint = "true" # Enables Taints. Prevents other runner pods from running in this pool.
+requests = { # K8s cpu and memory requests
+ cpu = "500m" #
+ memory = "500Mi"} #
+limits = { # K8s cpu and memory limits
+ cpu = "2" #
+ memory = "2Gi"}}] #
+
+```
+
+
+
5. Make sure you set the bucket name in the comment in the environment file for documentation purposes
-6. From this directory, init terraform with:
+6. From this directory, log in to the gcloud account that you used to create the bucket, then init terraform with:
```
+gcloud auth login
+gcloud auth application-default login
terraform init -backend-config="bucket=bucket_name"
```
7. Terraform apply
diff --git a/.github/gh-actions-self-hosted-runners/arc/config/arc_autoscaler.tpl b/.github/gh-actions-self-hosted-runners/arc/config/arc_autoscaler.tpl
index 0de685c453b..f6da0aff038 100644
--- a/.github/gh-actions-self-hosted-runners/arc/config/arc_autoscaler.tpl
+++ b/.github/gh-actions-self-hosted-runners/arc/config/arc_autoscaler.tpl
@@ -19,12 +19,12 @@
apiVersion: actions.summerwind.dev/v1alpha1
kind: HorizontalRunnerAutoscaler
metadata:
- name: main-runners
+ name: ${name}
spec:
scaleDownDelaySecondsAfterScaleOut: 300
scaleTargetRef:
kind: RunnerDeployment
- name: main-runners
+ name: ${name}
minReplicas: ${min_runners}
maxReplicas: ${max_runners}
%{~ if webhook_scaling == "true" ~}
diff --git a/.github/gh-actions-self-hosted-runners/arc/config/arc_deployment.tpl b/.github/gh-actions-self-hosted-runners/arc/config/arc_deployment.tpl
index fd3803d522d..9280e4b77b3 100644
--- a/.github/gh-actions-self-hosted-runners/arc/config/arc_deployment.tpl
+++ b/.github/gh-actions-self-hosted-runners/arc/config/arc_deployment.tpl
@@ -19,21 +19,38 @@
apiVersion: actions.summerwind.dev/v1alpha1
kind: RunnerDeployment
metadata:
- name: main-runners
+ name: ${name}
spec:
template:
spec:
- image: summerwind/actions-runner:v2.304.0-ubuntu-20.04-30355f7
+ %{~ if selector == true ~}
+ nodeSelector:
+ runner-pool: ${name}
+ %{~ endif ~}
+ %{~ if taint == true ~}
+ tolerations:
+ - key: "runner-pool"
+ operator: "Equal"
+ value: ${name}
+ effect: "NoSchedule"
+ %{~ endif ~}
+ image: ${image}
organization: ${organization}
group: "${group}"
labels:
- - "ubuntu-20.04"
- - "self-hosted"
+ %{~ for label in labels ~}
+ - ${label}
+ %{~ endfor ~}
env: []
resources:
-# limits:
-# cpu: "4.0"
-# memory: "8Gi"
requests:
- cpu: "500m"
- memory: "500Mi"
+ cpu: ${requests.cpu}
+ memory: ${requests.memory}
+ limits:
+ %{~ if limits.cpu != "" ~}
+ cpu: ${limits.cpu}
+ %{~ if limits.memory != "" ~}
+ memory: ${limits.memory}
+ %{~ endif ~}
+ %{~ endif ~}
+
diff --git a/.github/gh-actions-self-hosted-runners/arc/environments/beam.env b/.github/gh-actions-self-hosted-runners/arc/environments/beam.env
index 91b336ad10b..48566e940a7 100644
--- a/.github/gh-actions-self-hosted-runners/arc/environments/beam.env
+++ b/.github/gh-actions-self-hosted-runners/arc/environments/beam.env
@@ -18,10 +18,8 @@
#
project_id = "apache-beam-testing"
-region = "us-west1"
-zone = "us-west1-b"
-min_main_node_count = "1"
-max_main_node_count = "5"
+region = "us-central1"
+zone = "us-central1-b"
environment = "beam"
ingress_domain = "action.beam.apache.org"
organization = "apache"
@@ -30,9 +28,35 @@ github_app_id_secret_name = "gh-app_id"
github_app_install_id_secret_name = "gh-app_installation_id"
github_private_key_secret_name = "gh-pem_key"
deploy_webhook = "true"
-max_main_replicas = "50"
-min_main_replicas = "5"
-webhook_scaling = "true"
runner_group = "beam"
-machine_type = "e2-standard-16"
-#state_bucket_name = "beam-arc-state"
\ No newline at end of file
+main_runner = {
+ name = "main-runner"
+ machine_type = "e2-standard-16"
+ min_node_count = "1"
+ max_node_count = "7"
+ min_replicas = "1"
+ max_replicas = "45"
+ webhook_scaling = true
+ disk_size_gb = 200
+ requests = {
+ cpu = "2"
+ memory = "3Gi"
+ }
+}
+additional_runner_pools = [{
+ name = "small-runner"
+ machine_type = "e2-standard-2"
+ min_node_count = "1"
+ max_node_count = "10"
+ min_replicas = "1"
+ max_replicas = "10"
+ webhook_scaling = "true"
+ requests = {
+ cpu = "1500m"
+ memory = "5Gi"
+ }
+ labels = ["self-hosted", "ubuntu-20.04", "small"]
+ enable_selector = true
+ enable_taint = true
+}]
+#state_bucket_name = "beam-arc-state"
diff --git a/.github/gh-actions-self-hosted-runners/arc/gke.tf b/.github/gh-actions-self-hosted-runners/arc/gke.tf
index 4bf6f6c5a99..bfb04888557 100644
--- a/.github/gh-actions-self-hosted-runners/arc/gke.tf
+++ b/.github/gh-actions-self-hosted-runners/arc/gke.tf
@@ -23,29 +23,72 @@ resource "google_container_cluster" "actions-runner-gke" {
initial_node_count = 1
network = google_compute_network.actions-runner-network.id
subnetwork = google_compute_subnetwork.actions-runner-subnetwork.id
- remove_default_node_pool = true
+ remove_default_node_pool = true
}
-resource "google_container_node_pool" "actions-runner-pool" {
+resource "google_container_node_pool" "main-actions-runner-pool" {
name = "main-pool"
cluster = google_container_cluster.actions-runner-gke.name
location = google_container_cluster.actions-runner-gke.location
autoscaling {
- min_node_count = var.min_main_node_count
- max_node_count = var.max_main_node_count
+ min_node_count = var.main_runner.min_node_count
+ max_node_count = var.main_runner.max_node_count
}
+ initial_node_count = var.main_runner.min_node_count
management {
auto_repair = "true"
auto_upgrade = "true"
}
node_config {
- machine_type = var.machine_type
+ disk_size_gb = var.main_runner.disk_size_gb
+ machine_type = var.main_runner.machine_type
oauth_scopes = [
"https://www.googleapis.com/auth/cloud-platform"
]
tags = ["actions-runner-pool"]
}
}
+
+resource "google_container_node_pool" "additional_runner_pools" {
+ for_each = {
+ for index, runner_pool in var.additional_runner_pools : runner_pool.name => runner_pool
+ }
+
+ name = each.value.name
+ cluster = google_container_cluster.actions-runner-gke.name
+ location = google_container_cluster.actions-runner-gke.location
+ autoscaling {
+ min_node_count = each.value.min_node_count
+ max_node_count = each.value.max_node_count
+ }
+ initial_node_count = each.value.min_node_count
+ management {
+ auto_repair = "true"
+ auto_upgrade = "true"
+ }
+ node_config {
+ disk_size_gb = each.value.disk_size_gb
+ machine_type = each.value.machine_type
+ oauth_scopes = [
+ "https://www.googleapis.com/auth/cloud-platform"
+ ]
+ tags = ["actions-runner-pool"]
+ labels = {
+ "runner-pool" = each.value.name
+ }
+
+ dynamic "taint" {
+ for_each = each.value.enable_taint == true ? [1] : []
+ content {
+ key = "runner-pool"
+ value = each.value.name
+ effect = "NO_SCHEDULE"
+ }
+ }
+ }
+ }
+
+
resource "google_compute_global_address" "actions-runner-ip" {
name = "${var.environment}-actions-runner-ip"
}
\ No newline at end of file
diff --git a/.github/gh-actions-self-hosted-runners/arc/helm.tf b/.github/gh-actions-self-hosted-runners/arc/helm.tf
index 8b7d528dcc1..4c2badaf323 100644
--- a/.github/gh-actions-self-hosted-runners/arc/helm.tf
+++ b/.github/gh-actions-self-hosted-runners/arc/helm.tf
@@ -30,7 +30,7 @@ resource "helm_release" "cert-manager" {
name = "installCRDs"
value = "true"
}
- depends_on = [ google_container_node_pool.actions-runner-pool ]
+ depends_on = [ google_container_node_pool.main-actions-runner-pool ]
}
resource "helm_release" "arc" {
diff --git a/.github/gh-actions-self-hosted-runners/arc/kubernetes.tf b/.github/gh-actions-self-hosted-runners/arc/kubernetes.tf
index 9622a0cab11..bafb653896d 100644
--- a/.github/gh-actions-self-hosted-runners/arc/kubernetes.tf
+++ b/.github/gh-actions-self-hosted-runners/arc/kubernetes.tf
@@ -17,17 +17,35 @@
# under the License.
#
resource "kubectl_manifest" "arc_deployment" {
- yaml_body = templatefile("config/arc_deployment.tpl", { organization = var.organization , group = var.runner_group})
+ yaml_body = templatefile("config/arc_deployment.tpl", { organization = var.organization, group = var.runner_group, name = var.main_runner.name, image = var.main_runner.runner_image, labels = var.main_runner.labels, selector = var.main_runner.enable_selector, taint = var.main_runner.enable_taint, requests = var.main_runner.requests, limits = var.main_runner.limits})
override_namespace = "arc"
- depends_on = [ helm_release.arc ]
+ depends_on = [helm_release.arc]
}
resource "kubectl_manifest" "arc_autoscaler" {
- yaml_body = templatefile("config/arc_autoscaler.tpl", { min_runners = var.min_main_replicas, max_runners = var.max_main_replicas , webhook_scaling = var.webhook_scaling})
+ yaml_body = templatefile("config/arc_autoscaler.tpl", { name = var.main_runner.name, min_runners = var.main_runner.min_replicas, max_runners = var.main_runner.max_replicas, webhook_scaling = var.main_runner.webhook_scaling })
override_namespace = "arc"
- depends_on = [ helm_release.arc ]
+ depends_on = [helm_release.arc]
}
resource "kubectl_manifest" "arc_webhook_certificate" {
- yaml_body = templatefile("config/arc_certificate.tpl", { ingress_domain = var.ingress_domain })
+ yaml_body = templatefile("config/arc_certificate.tpl", { ingress_domain = var.ingress_domain })
override_namespace = "arc"
- depends_on = [ helm_release.arc ]
+ depends_on = [helm_release.arc]
+}
+
+
+resource "kubectl_manifest" "arc_deployment_additional" {
+ for_each = {
+ for index, runner_pool in var.additional_runner_pools : runner_pool.name => runner_pool
+ }
+ yaml_body = templatefile("config/arc_deployment.tpl", { organization = var.organization, group = var.runner_group, name = each.value.name, image = each.value.runner_image, labels = each.value.labels, selector = each.value.enable_selector, taint = each.value.enable_taint , requests = each.value.requests, limits = each.value.limits})
+ override_namespace = "arc"
+ depends_on = [helm_release.arc]
+}
+resource "kubectl_manifest" "arc_autoscaler_additional" {
+ for_each = {
+ for index, runner_pool in var.additional_runner_pools : runner_pool.name => runner_pool
+ }
+ yaml_body = templatefile("config/arc_autoscaler.tpl", { name = each.value.name, min_runners = each.value.min_replicas, max_runners = each.value.max_replicas, webhook_scaling = each.value.webhook_scaling })
+ override_namespace = "arc"
+ depends_on = [helm_release.arc]
}
diff --git a/.github/gh-actions-self-hosted-runners/arc/variables.tf b/.github/gh-actions-self-hosted-runners/arc/variables.tf
index 81d6695e133..43f51938b7d 100644
--- a/.github/gh-actions-self-hosted-runners/arc/variables.tf
+++ b/.github/gh-actions-self-hosted-runners/arc/variables.tf
@@ -27,28 +27,6 @@ variable "region" {
variable "zone" {
description = "Google Zone to use for deployment"
}
-variable "min_main_node_count" {
- description = "Minimal node count for GKE"
- default = "1"
-}
-variable "max_main_node_count" {
- description = "Maximal node count for GKE"
- default = "2"
-}
-variable "max_main_replicas" {
- description = "Maximal replicas for Action Runners"
- default = "2"
-
-}
-variable "min_main_replicas" {
- description = "Minimal replicas for Action Runners"
- default = "1"
-
-}
-variable machine_type {
- description = "Machine type to use for runner Node Pool"
- default = "e2-standard-2"
-}
variable "environment" {
description = "name of environment"
default = ""
@@ -84,8 +62,63 @@ variable "runner_group" {
description = "value for the runner group label"
default = ""
}
-variable "webhook_scaling" {
- description = "Enable scaling of runners based on webhook events"
- default = "false"
-
-}
+
+variable "main_runner" {
+ type = object({
+ name = string
+ machine_type = optional(string, "e2-standard-2")
+ min_node_count = optional(number, 1)
+ max_node_count = optional(number, 1)
+ min_replicas = optional(number, 1)
+ max_replicas = optional(number, 1)
+ disk_size_gb = optional(number, 100)
+ webhook_scaling = optional(bool, false)
+ runner_image = optional(string, "summerwind/actions-runner:v2.304.0-ubuntu-20.04-30355f7")
+ labels = optional(list(string), ["self-hosted", "ubuntu-20.04","main"])
+ enable_selector = optional(bool, false)
+ enable_taint = optional(bool, false)
+ requests = optional(object({
+ cpu = string
+ memory = string
+ }), { cpu = "500m",
+ memory = "500Mi"
+ })
+ limits = optional(object({
+ cpu = optional(string)
+ memory = optional(string)
+ }), {
+ cpu = "",
+ memory = ""
+ })
+ })
+}
+variable "additional_runner_pools" {
+ type = list(object({
+ name = string
+ machine_type = optional(string, "e2-standard-2")
+ min_node_count = optional(number, 1)
+ max_node_count = optional(number, 1)
+ min_replicas = optional(number, 1)
+ max_replicas = optional(number, 1)
+ disk_size_gb = optional(number, 100)
+ webhook_scaling = optional(bool, false)
+ runner_image = optional(string, "summerwind/actions-runner:v2.304.0-ubuntu-20.04-30355f7")
+ labels = optional(list(string), ["self-hosted", "ubuntu-20.04","changeme"])
+ enable_selector = optional(bool, true)
+ enable_taint = optional(bool, true)
+ requests = optional(object({
+ cpu = string
+ memory = string
+ }), { cpu = "500m",
+ memory = "500Mi"
+ })
+ limits = optional(object({
+ cpu = optional(string)
+ memory = optional(string)
+ }), {
+ cpu = "",
+ memory = ""
+ })
+ }))
+ default = []
+}
\ No newline at end of file