You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by da...@apache.org on 2023/07/06 17:26:19 UTC

[beam] branch master updated: Arc additional pools (#27369)

This is an automated email from the ASF dual-hosted git repository.

damccorm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git


The following commit(s) were added to refs/heads/master by this push:
     new 9e6420d9f90 Arc additional pools (#27369)
9e6420d9f90 is described below

commit 9e6420d9f905092c7636f65df964af0ee1cb5c26
Author: Vlado Djerek <20...@users.noreply.github.com>
AuthorDate: Thu Jul 6 19:26:12 2023 +0200

    Arc additional pools (#27369)
    
    * adds additional pools object
    
    * adding disk size and removing old variables
    
    * changed default labelset for additional pools
    
    * updated beam.env
    
    * formatting
    
    * add gcloud login to readme and sync environment
---
 .../gh-actions-self-hosted-runners/arc/README.md   | 44 +++++++++--
 .../arc/config/arc_autoscaler.tpl                  |  4 +-
 .../arc/config/arc_deployment.tpl                  | 35 ++++++---
 .../arc/environments/beam.env                      | 42 ++++++++---
 .github/gh-actions-self-hosted-runners/arc/gke.tf  | 53 +++++++++++--
 .github/gh-actions-self-hosted-runners/arc/helm.tf |  2 +-
 .../arc/kubernetes.tf                              | 30 ++++++--
 .../arc/variables.tf                               | 87 +++++++++++++++-------
 8 files changed, 232 insertions(+), 65 deletions(-)

diff --git a/.github/gh-actions-self-hosted-runners/arc/README.md b/.github/gh-actions-self-hosted-runners/arc/README.md
index 5be4a93b43a..e5055826d00 100644
--- a/.github/gh-actions-self-hosted-runners/arc/README.md
+++ b/.github/gh-actions-self-hosted-runners/arc/README.md
@@ -38,8 +38,15 @@ All are created in the step before
 project_id = "PROJECT_ID"                                     # google PROJECT_ID that you want to deploy in
 region = "gcp_region"                                         # GCP region for the network
 zone = "europe-west3-c"                                       # GCP zone for the nodes
-min_main_node_count = "1"                                     # Minimal and initial node count for main pool
-max_main_node_count = "5"                                     # Maximal node count for main pool
+main_runner = {
+    name = "main-runner"                                      # Main runner pool name
+    machine_type = "e2-standard-16"                           # Main runner pool machine type
+    min_node_count = "1"                                      # Main runner pool minimal node count
+    max_node_count = "5"                                      # Main runner pool maximal node count
+    min_replicas = "5"                                        # Min number of runner PODs in the main pool. Do not confuse with Nodes
+    max_replicas = "20"                                       # Max number of runner PODs in the main pool. Do not confuse with Nodes
+    webhook_scaling = true                                    # Enable webhook scaling for main pool
+}
 environment = "environment_name"                              # Name of the environment. Used as a prefix like dev- stag- anything-
 ingress_domain = "fqdn"                                       # FQDN for webhook ingress
 organization = "org"                                          # Github Organization to use runners in
@@ -48,15 +55,40 @@ github_app_id_secret_name = "app_id_secret_name"              # Google secret na
 github_app_install_id_secret_name = "install_id_secret_name"  # Google secret name for install_id
 github_private_key_secret_name = "pem_file_secret_name"       # Google secret name for pem file
 deploy_webhook = "false"                                      # Terraform to deploy the scaling webhook
-max_main_replicas = "2"                                       # Max number of runner PODs . Do not confuse with Nodes
-min_main_replicas = "1"                                       # Min number of runner PODs . Do not confuse with Nodes
-webhook_scaling = "false"                                     # Enable webhook scaling. When disabled runner busy percentage is used
 #state_bucket_name = "state_bucket_name"                      # Not used by terraform. This is just to reference what bucket is used for others
 ```
+If you want to create additional pools, you can use the `additional_runner_pools` variable, which is a list of objects. Example:
+```
+additional_runner_pools = [
+{
+name = "test-runner"                      # Pool name
+machine_type = "e2-standard-2"            # Machine type for the pool
+min_node_count = 1                        # Minimal node count
+max_node_count = 2                        # Maximal node count
+min_replicas = 1                          # Minimal replica count
+max_replicas = 2                          # Maximal replica count
+webhook_scaling = true                    # Enable webhook based scaling
+runner_image = "gcr.io/someimage:sometag" # Image to use
+labels = ["self-hosted", "testrunner"]    # Label set for runner pool. Used in `runs-on`
+enable_selector = "true"                  # Enables NodeSelector, forcing runners to this pool
+enable_taint = "true"                     # Enables Taints. Prevents other runner pods from running in this pool.
+requests = {                              # K8s cpu and memory requests
+  cpu = "500m"                            #
+  memory = "500Mi"}                       #
+limits = {                                # K8s cpu and memory limits
+    cpu = "2"                             #
+    memory = "2Gi"}}]                     #
+
+```
+
+
+
 5. Make sure you set the bucket name in the comment in the environment file for documentation purposes
 
-6.  From this directory, init terraform with:
+6.  From this directory, log in to the gcloud account that you created the bucket with, then init terraform with:
 ```
+gcloud auth login
+gcloud auth application-default login
 terraform init -backend-config="bucket=bucket_name"
 ```
 7. Terraform apply
diff --git a/.github/gh-actions-self-hosted-runners/arc/config/arc_autoscaler.tpl b/.github/gh-actions-self-hosted-runners/arc/config/arc_autoscaler.tpl
index 0de685c453b..f6da0aff038 100644
--- a/.github/gh-actions-self-hosted-runners/arc/config/arc_autoscaler.tpl
+++ b/.github/gh-actions-self-hosted-runners/arc/config/arc_autoscaler.tpl
@@ -19,12 +19,12 @@
 apiVersion: actions.summerwind.dev/v1alpha1
 kind: HorizontalRunnerAutoscaler
 metadata:
-  name: main-runners
+  name: ${name}
 spec:
   scaleDownDelaySecondsAfterScaleOut: 300
   scaleTargetRef:
     kind: RunnerDeployment
-    name: main-runners
+    name: ${name}
   minReplicas: ${min_runners}
   maxReplicas: ${max_runners}
   %{~ if webhook_scaling == "true" ~}
diff --git a/.github/gh-actions-self-hosted-runners/arc/config/arc_deployment.tpl b/.github/gh-actions-self-hosted-runners/arc/config/arc_deployment.tpl
index fd3803d522d..9280e4b77b3 100644
--- a/.github/gh-actions-self-hosted-runners/arc/config/arc_deployment.tpl
+++ b/.github/gh-actions-self-hosted-runners/arc/config/arc_deployment.tpl
@@ -19,21 +19,38 @@
 apiVersion: actions.summerwind.dev/v1alpha1
 kind: RunnerDeployment
 metadata:
-  name: main-runners
+  name: ${name}
 spec:
   template:
     spec:
-      image: summerwind/actions-runner:v2.304.0-ubuntu-20.04-30355f7
+      %{~ if selector == true  ~}
+      nodeSelector:
+        runner-pool: ${name} 
+      %{~ endif ~}
+      %{~ if taint == true  ~}
+      tolerations:
+        - key: "runner-pool"
+          operator: "Equal"
+          value: ${name}
+          effect: "NoSchedule"
+      %{~ endif ~}
+      image: ${image}
       organization: ${organization}
       group: "${group}"
       labels:
-        - "ubuntu-20.04"
-        - "self-hosted"
+      %{~ for label in labels ~}
+        - ${label}
+      %{~ endfor ~}
       env: []
       resources:
-#        limits:
-#          cpu: "4.0"
-#          memory: "8Gi"
         requests:
-          cpu: "500m"
-          memory: "500Mi"
+          cpu: ${requests.cpu}
+          memory: ${requests.memory}
+        limits:
+      %{~ if limits.cpu != "" ~}
+          cpu: ${limits.cpu}
+      %{~ if limits.memory != "" ~}
+          memory: ${limits.memory}
+      %{~ endif ~}
+      %{~ endif ~}
+
diff --git a/.github/gh-actions-self-hosted-runners/arc/environments/beam.env b/.github/gh-actions-self-hosted-runners/arc/environments/beam.env
index 91b336ad10b..48566e940a7 100644
--- a/.github/gh-actions-self-hosted-runners/arc/environments/beam.env
+++ b/.github/gh-actions-self-hosted-runners/arc/environments/beam.env
@@ -18,10 +18,8 @@
 #
 
 project_id = "apache-beam-testing"
-region = "us-west1"
-zone = "us-west1-b"
-min_main_node_count = "1"
-max_main_node_count = "5"
+region = "us-central1"
+zone = "us-central1-b"
 environment = "beam"
 ingress_domain = "action.beam.apache.org" 
 organization = "apache"
@@ -30,9 +28,35 @@ github_app_id_secret_name = "gh-app_id"
 github_app_install_id_secret_name = "gh-app_installation_id"
 github_private_key_secret_name = "gh-pem_key"
 deploy_webhook = "true"
-max_main_replicas = "50"
-min_main_replicas = "5"
-webhook_scaling = "true"
 runner_group = "beam"
-machine_type = "e2-standard-16"
-#state_bucket_name = "beam-arc-state"
\ No newline at end of file
+main_runner = {
+    name = "main-runner"
+    machine_type = "e2-standard-16"
+    min_node_count = "1"
+    max_node_count = "7"
+    min_replicas = "1"
+    max_replicas = "45"
+    webhook_scaling = true
+    disk_size_gb = 200
+    requests = {
+        cpu = "2"
+        memory = "3Gi"
+    }
+}
+additional_runner_pools = [{
+    name = "small-runner"
+    machine_type = "e2-standard-2"
+    min_node_count = "1"
+    max_node_count = "10"
+    min_replicas = "1"
+    max_replicas = "10"
+    webhook_scaling = "true"
+    requests = {
+        cpu = "1500m"
+        memory = "5Gi"
+    }
+    labels = ["self-hosted", "ubuntu-20.04", "small"]
+    enable_selector = true
+    enable_taint = true
+}]
+#state_bucket_name = "beam-arc-state"
diff --git a/.github/gh-actions-self-hosted-runners/arc/gke.tf b/.github/gh-actions-self-hosted-runners/arc/gke.tf
index 4bf6f6c5a99..bfb04888557 100644
--- a/.github/gh-actions-self-hosted-runners/arc/gke.tf
+++ b/.github/gh-actions-self-hosted-runners/arc/gke.tf
@@ -23,29 +23,72 @@ resource "google_container_cluster" "actions-runner-gke" {
   initial_node_count         = 1
   network                    = google_compute_network.actions-runner-network.id
   subnetwork                 = google_compute_subnetwork.actions-runner-subnetwork.id
-  remove_default_node_pool = true
+  remove_default_node_pool   = true
 
 }
-resource "google_container_node_pool" "actions-runner-pool" {
+resource "google_container_node_pool" "main-actions-runner-pool" {
   name       = "main-pool"
   cluster    = google_container_cluster.actions-runner-gke.name
   location   = google_container_cluster.actions-runner-gke.location
   autoscaling {
-    min_node_count = var.min_main_node_count
-    max_node_count = var.max_main_node_count
+    min_node_count = var.main_runner.min_node_count
+    max_node_count = var.main_runner.max_node_count
    }
+   initial_node_count = var.main_runner.min_node_count
   management {
     auto_repair  = "true"
     auto_upgrade = "true"
    }
   node_config {
-    machine_type    = var.machine_type
+    disk_size_gb = var.main_runner.disk_size_gb
+    machine_type = var.main_runner.machine_type
     oauth_scopes = [
       "https://www.googleapis.com/auth/cloud-platform"
     ]
     tags = ["actions-runner-pool"]
    }
 }
+
+resource "google_container_node_pool" "additional_runner_pools" {
+  for_each = {
+    for index, runner_pool in var.additional_runner_pools : runner_pool.name => runner_pool
+  }
+
+  name       = each.value.name
+  cluster    = google_container_cluster.actions-runner-gke.name
+  location   = google_container_cluster.actions-runner-gke.location
+  autoscaling {
+    min_node_count = each.value.min_node_count
+    max_node_count = each.value.max_node_count
+   }
+   initial_node_count = each.value.min_node_count
+  management {
+    auto_repair  = "true"
+    auto_upgrade = "true"
+   }
+  node_config {
+    disk_size_gb = each.value.disk_size_gb
+    machine_type    = each.value.machine_type
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/cloud-platform"
+    ]
+    tags = ["actions-runner-pool"]
+    labels = {
+      "runner-pool" = each.value.name
+    }
+   
+    dynamic "taint" {
+      for_each = each.value.enable_taint == true ? [1] : []
+      content {
+        key    = "runner-pool"
+        value  = each.value.name
+        effect = "NO_SCHEDULE"
+        }
+      }
+    }
+  }
+
+
 resource "google_compute_global_address" "actions-runner-ip" {
   name      = "${var.environment}-actions-runner-ip"
 }
\ No newline at end of file
diff --git a/.github/gh-actions-self-hosted-runners/arc/helm.tf b/.github/gh-actions-self-hosted-runners/arc/helm.tf
index 8b7d528dcc1..4c2badaf323 100644
--- a/.github/gh-actions-self-hosted-runners/arc/helm.tf
+++ b/.github/gh-actions-self-hosted-runners/arc/helm.tf
@@ -30,7 +30,7 @@ resource "helm_release" "cert-manager" {
     name  = "installCRDs"
     value = "true"
   }
-  depends_on = [ google_container_node_pool.actions-runner-pool ]
+  depends_on = [ google_container_node_pool.main-actions-runner-pool ]
 }
 
 resource "helm_release" "arc" {
diff --git a/.github/gh-actions-self-hosted-runners/arc/kubernetes.tf b/.github/gh-actions-self-hosted-runners/arc/kubernetes.tf
index 9622a0cab11..bafb653896d 100644
--- a/.github/gh-actions-self-hosted-runners/arc/kubernetes.tf
+++ b/.github/gh-actions-self-hosted-runners/arc/kubernetes.tf
@@ -17,17 +17,35 @@
 # under the License.
 #
 resource "kubectl_manifest" "arc_deployment" {
-  yaml_body = templatefile("config/arc_deployment.tpl", { organization = var.organization , group = var.runner_group})
+  yaml_body          = templatefile("config/arc_deployment.tpl", { organization = var.organization, group = var.runner_group, name = var.main_runner.name, image = var.main_runner.runner_image, labels = var.main_runner.labels, selector = var.main_runner.enable_selector, taint = var.main_runner.enable_taint, requests = var.main_runner.requests, limits = var.main_runner.limits})
   override_namespace = "arc"
-  depends_on = [ helm_release.arc ]
+  depends_on         = [helm_release.arc]
 }
 resource "kubectl_manifest" "arc_autoscaler" {
-  yaml_body = templatefile("config/arc_autoscaler.tpl", { min_runners = var.min_main_replicas, max_runners = var.max_main_replicas , webhook_scaling = var.webhook_scaling})
+  yaml_body          = templatefile("config/arc_autoscaler.tpl", { name = var.main_runner.name, min_runners = var.main_runner.min_replicas, max_runners = var.main_runner.max_replicas, webhook_scaling = var.main_runner.webhook_scaling })
   override_namespace = "arc"
-  depends_on = [ helm_release.arc ]
+  depends_on         = [helm_release.arc]
 }
 resource "kubectl_manifest" "arc_webhook_certificate" {
-  yaml_body = templatefile("config/arc_certificate.tpl", { ingress_domain = var.ingress_domain })
+  yaml_body          = templatefile("config/arc_certificate.tpl", { ingress_domain = var.ingress_domain })
   override_namespace = "arc"
-  depends_on = [ helm_release.arc ]
+  depends_on         = [helm_release.arc]
+}
+
+
+resource "kubectl_manifest" "arc_deployment_additional" {
+  for_each = {
+    for index, runner_pool in var.additional_runner_pools : runner_pool.name => runner_pool
+  }
+  yaml_body          = templatefile("config/arc_deployment.tpl", { organization = var.organization, group = var.runner_group, name = each.value.name, image = each.value.runner_image, labels = each.value.labels, selector = each.value.enable_selector, taint = each.value.enable_taint , requests = each.value.requests, limits = each.value.limits})
+  override_namespace = "arc"
+  depends_on         = [helm_release.arc]
+}
+resource "kubectl_manifest" "arc_autoscaler_additional" {
+  for_each = {
+    for index, runner_pool in var.additional_runner_pools : runner_pool.name => runner_pool
+  }
+  yaml_body          = templatefile("config/arc_autoscaler.tpl", { name = each.value.name, min_runners = each.value.min_replicas, max_runners = each.value.max_replicas, webhook_scaling = each.value.webhook_scaling })
+  override_namespace = "arc"
+  depends_on         = [helm_release.arc]
 }
diff --git a/.github/gh-actions-self-hosted-runners/arc/variables.tf b/.github/gh-actions-self-hosted-runners/arc/variables.tf
index 81d6695e133..43f51938b7d 100644
--- a/.github/gh-actions-self-hosted-runners/arc/variables.tf
+++ b/.github/gh-actions-self-hosted-runners/arc/variables.tf
@@ -27,28 +27,6 @@ variable "region" {
 variable "zone" {
     description = "Google Zone to use for deployment"
 }
-variable "min_main_node_count" {
-    description = "Minimal node count for GKE"
-    default = "1"
-}
-variable "max_main_node_count" {
-    description = "Maximal node count for GKE"
-    default = "2"
-}
-variable "max_main_replicas" {
-    description = "Maximal replicas for Action Runners"
-    default = "2"
-  
-}
-variable "min_main_replicas" {
-    description = "Minimal replicas for Action Runners"
-    default = "1"
-  
-}
-variable machine_type {
-    description = "Machine type to use for runner Node Pool"
-    default = "e2-standard-2"
-}
 variable "environment" {
     description = "name of environment"
     default = ""
@@ -84,8 +62,63 @@ variable "runner_group" {
   description = "value for the runner group label"
   default = ""
 }
-variable "webhook_scaling" {
-    description = "Enable scaling of runners based on webhook events"
-    default = "false"
-  
-}
+
+variable "main_runner" {
+    type = object({
+      name = string
+      machine_type = optional(string, "e2-standard-2")
+      min_node_count = optional(number, 1)
+      max_node_count = optional(number, 1)
+      min_replicas = optional(number, 1)
+      max_replicas = optional(number, 1)
+      disk_size_gb = optional(number, 100)
+      webhook_scaling = optional(bool, false)
+      runner_image = optional(string, "summerwind/actions-runner:v2.304.0-ubuntu-20.04-30355f7")
+      labels = optional(list(string), ["self-hosted", "ubuntu-20.04","main"])
+      enable_selector = optional(bool, false)
+      enable_taint = optional(bool, false)
+      requests = optional(object({
+        cpu = string
+        memory = string
+        }), { cpu = "500m",
+              memory = "500Mi" 
+        })
+      limits = optional(object({
+        cpu = optional(string)
+        memory = optional(string)
+        }), {
+            cpu = "",
+            memory = ""
+        })
+    })
+}
+variable "additional_runner_pools" {
+    type = list(object({
+      name = string
+      machine_type = optional(string, "e2-standard-2")
+      min_node_count = optional(number, 1)
+      max_node_count = optional(number, 1)
+      min_replicas = optional(number, 1)
+      max_replicas = optional(number, 1)
+      disk_size_gb = optional(number, 100)
+      webhook_scaling = optional(bool, false)
+      runner_image = optional(string, "summerwind/actions-runner:v2.304.0-ubuntu-20.04-30355f7")
+      labels = optional(list(string), ["self-hosted", "ubuntu-20.04","changeme"])
+      enable_selector = optional(bool, true)
+      enable_taint = optional(bool, true)
+      requests = optional(object({
+        cpu = string
+        memory = string
+        }), { cpu = "500m",
+              memory = "500Mi" 
+        })
+      limits = optional(object({
+        cpu = optional(string)
+        memory = optional(string)
+        }), {
+            cpu = "",
+            memory = ""
+        })
+    }))
+    default = []
+}
\ No newline at end of file