Posted to commits@airflow.apache.org by po...@apache.org on 2021/05/02 16:55:45 UTC

[airflow-ci-infra] 01/01: Cloud init is adapted to run on GCE instances

This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch add-gcp-cloud-init
in repository https://gitbox.apache.org/repos/asf/airflow-ci-infra.git

commit e20c7b07d10d012f12ea974e3c5656fabf85005a
Author: Jarek Potiuk <ja...@potiuk.com>
AuthorDate: Sun May 2 18:54:44 2021 +0200

    Cloud init is adapted to run on GCE instances
---
 cloud-init.yml                                     | 66 +++++++++++++++++-----
 gcp/README.md                                      | 45 +++++++++++++++
 gcp/metrics/gcp_create_metrics_descriptor.py       | 56 ++++++++++++++++++
 .../metrics/gcp_delete_metrics_descriptor.py       | 23 +++++++-
 gcp/metrics/gcp_write_metrics_data.py              | 57 +++++++++++++++++++
 lambdas/scale_out_runner/app.py                    |  7 ++-
 lambdas/scale_out_runner/requirements.txt          |  1 +
 requirements.txt                                   |  2 +
 scripts/runner-supervisor.py                       |  2 +-
 9 files changed, 240 insertions(+), 19 deletions(-)

diff --git a/cloud-init.yml b/cloud-init.yml
index 05d71d5..af14d2c 100644
--- a/cloud-init.yml
+++ b/cloud-init.yml
@@ -39,14 +39,21 @@ runcmd:
     - -c
     - |
       set -eu -o pipefail
-      echo "AWS_DEFAULT_REGION=$(cloud-init query region)" >> /etc/environment
+      if [[ $(cloud-init query cloud_name) == "aws" ]]; then
+          echo "AWS_DEFAULT_REGION=$(cloud-init query region)" >> /etc/environment
+      fi
+      if [[ $(cloud-init query cloud_name) == "gce" ]]; then
+          echo "GCP_DEFAULT_REGION=$(cloud-init query region)" >> /etc/environment
+      fi
       # Set an env var (that is visible in runners) that will let us know we are on a self-hosted runner
       echo 'AIRFLOW_SELF_HOSTED_RUNNER="[\"self-hosted\"]"' >> /etc/environment
       set -a
       . /etc/environment
       set +a
-      echo "ASG_GROUP_NAME=$(aws ec2 describe-tags --filter Name=resource-id,Values=$(cloud-init query instance_id) Name=key,Values=aws:autoscaling:groupName \
-            | jq -r '@sh "\(.Tags[0].Value)"')" >> /etc/environment
+      if [[ $(cloud-init query cloud_name) == "aws" ]]; then
+          echo "ASG_GROUP_NAME=$(aws ec2 describe-tags --filter Name=resource-id,Values=$(cloud-init query instance_id) Name=key,Values=aws:autoscaling:groupName \
+              | jq -r '@sh "\(.Tags[0].Value)"')" >> /etc/environment
+      fi
   - [systemctl, daemon-reload]
   -
     - bash
@@ -75,10 +82,27 @@ runcmd:
       . /etc/environment
       set +a
 
-      aws s3 cp s3://airflow-ci-assets/runner-supervisor.py /opt/runner-supervisor/bin/runner-supervisor
+      if [[ $(cloud-init query cloud_name) == "aws" ]]; then
+          aws s3 cp s3://airflow-ci-assets/runner-supervisor.py /opt/runner-supervisor/bin/runner-supervisor
+      fi
+      if [[ $(cloud-init query cloud_name) == "gce" ]]; then
+          gsutil cp gs://airflow-ci-assets/runner-supervisor.py /opt/runner-supervisor/bin/runner-supervisor
+      fi
       chmod 755 /opt/runner-supervisor/bin/runner-supervisor
-
     - 2.277.1-airflow3
+  -
+    - bash
+    - -c
+    - |
+      set -eu -o pipefail
+      if [[ $(cloud-init query cloud_name) == "gce" ]]; then
+          gsutil cp gs://airflow-ci-assets/requirements.txt /opt/requirements.txt
+          python3 -mvenv /opt/gcp-metrics-writer
+          /opt/gcp-metrics-writer/bin/pip install -r /opt/requirements.txt
+          gsutil cp gs://airflow-ci-assets/gcp_write_metrics_data.py /opt/gcp-metrics-writer/bin/gcp_write_metrics_data
+          chmod a+x /opt/gcp-metrics-writer/bin/gcp_write_metrics_data
+      fi
+
   - [systemctl, enable, --now, iptables.service]
  # Restart docker after applying the user firewall -- else some rules/chains might be lost!
   - [systemctl, restart, docker.service]
@@ -120,9 +144,15 @@ write_files:
       docker ps -qa | xargs --verbose --no-run-if-empty docker rm -fv
 
       echo "Log in to a paid docker user to get unlimited docker pulls"
-      aws ssm get-parameter --with-decryption --name /runners/apache/airflow/dockerPassword | \
-        jq .Parameter.Value -r | \
-        sudo -u runner docker login --username airflowcirunners --password-stdin
+      if [[ $(cloud-init query cloud_name) == "aws" ]]; then
+          aws ssm get-parameter --with-decryption --name /runners/apache/airflow/dockerPassword | \
+            jq .Parameter.Value -r | \
+            sudo -u runner docker login --username airflowcirunners --password-stdin
+      fi
+      if [[ $(cloud-init query cloud_name) == "gce" ]]; then
+          gcloud secrets versions access latest --secret "runners-apache-airflow-dockerPassword" | \
+            sudo -u runner docker login --username airflowcirunners --password-stdin
+      fi
 
       if [[ -d ~runner/actions-runner/_work/airflow/airflow ]]; then
         cd ~runner/actions-runner/_work/airflow/airflow
@@ -134,7 +164,7 @@ write_files:
             git submodule deinit --all -f && \
             git submodule foreach git clean -fxd && \
             git clean -fxd \
           "
         fi
       fi
     owner: root:root
@@ -176,7 +207,11 @@ write_files:
       runner ALL=(ALL) NOPASSWD:/usr/sbin/swapoff -a, /usr/bin/rm -f /swapfile, /usr/bin/apt clean
   - path: /etc/iptables/rules.v4
     content: |
+      #
       # Generated by iptables-save v1.8.4 on Thu Jan 14 13:59:27 2021
+      # The metadata server IP address is the same for AWS and GCP (169.254.169.254),
+      # which is pretty cool.
+      #
       *filter
       :INPUT ACCEPT [833:75929]
       :FORWARD DROP [0:0]
@@ -188,18 +223,23 @@ write_files:
       -A DOCKER-USER -j RETURN
       COMMIT
 
-  - path: /usr/local/sbin/actions-runner-ec2-reporting
+  - path: /usr/local/sbin/actions-runner-reporting
     permissions: '0775'
     content: |
       #!/bin/bash
       set -eu -o pipefail
       if pgrep -c Runner.Worker >/dev/null; then
           # Only report metric when we're doing something -- no point paying to submit zeros
-          aws cloudwatch put-metric-data --metric-name jobs-running --value "$(pgrep -c Runner.Worker)" --namespace github.actions
+          if [[ $(cloud-init query cloud_name) == "aws" ]]; then
+             aws cloudwatch put-metric-data --metric-name jobs-running --value "$(pgrep -c Runner.Worker)" --namespace github.actions
+          fi
+          if [[ $(cloud-init query cloud_name) == "gce" ]]; then
+             /opt/gcp-metrics-writer/bin/python /opt/gcp-metrics-writer/bin/gcp_write_metrics_data --value "$(pgrep -c Runner.Worker)"
+          fi
       fi
-  - path: /etc/cron.d/cloudwatch-metrics-github-runners
+  - path: /etc/cron.d/metrics-github-runners
     content: |
-      */1 * * * * nobody /usr/local/sbin/actions-runner-ec2-reporting
+      */1 * * * * nobody /usr/local/sbin/actions-runner-reporting
 
   - path: /etc/systemd/system/actions.runner-supervisor.service
     content: |
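
The runcmd and write_files changes above all follow the same pattern: branch on
`cloud-init query cloud_name`, which reports "aws" on EC2 and "gce" on GCE, so a
single cloud-init.yml serves both clouds. A minimal sketch of verifying the
detection by hand on a freshly booted instance (as the iptables comment in the
diff notes, both clouds serve metadata from 169.254.169.254; only the paths and
headers differ):

    # Ask cloud-init which datasource it detected ("aws" or "gce").
    cloud-init query cloud_name

    # Same metadata IP on both clouds, different paths and headers:
    curl -s http://169.254.169.254/latest/meta-data/instance-id          # AWS
    curl -s -H 'Metadata-Flavor: Google' \
        http://169.254.169.254/computeMetadata/v1/instance/id            # GCE
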
diff --git a/gcp/README.md b/gcp/README.md
new file mode 100644
index 0000000..ca74299
--- /dev/null
+++ b/gcp/README.md
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Notes from setting up GCP version of Airflow CI runner
+
+These are notes taken while setting up the GCP version of the runner; a sketch of the matching gcloud/gsutil commands follows the numbered list.
+
+1. Created a new service account without any permissions
+
+airflow-ci-runner@apache-airflow-ci-cd.iam.gserviceaccount.com
+
+2. Created a custom role with these permissions:
+
+* Monitoring Metric Writer
+    * monitoring.timeSeries.create
+
+3. Created `runners-apache-airflow-dockerPassword` secret with the same value as in AWS.
+
+4. Assigned roles to the "airflow-ci-runner" service account:
+
+* Monitoring Metric Writer
+* Secret Manager Secret Accessor
+
+
+5. Created the `airflow-ci-assets` GCS bucket with "public read" permissions
+
+6. Copied these files there (they must be re-copied every time they change):
+   * gcp_write_metrics_data.py
+   * get-runner-creds.py
+   * requirements.txt
+   * runner-supervisor.py
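
For reference, the numbered steps map onto gcloud/gsutil roughly as sketched
below. This is a hedged sketch, not a tested script: the custom role id
"monitoringMetricWriter" is a hypothetical name, and get-runner-creds.py is
listed in the notes but not part of this commit; only the resource names come
from the notes above.

    PROJECT=apache-airflow-ci-cd
    SA="airflow-ci-runner@${PROJECT}.iam.gserviceaccount.com"

    # 1. Service account with no permissions of its own.
    gcloud iam service-accounts create airflow-ci-runner --project "$PROJECT"

    # 2. Custom role carrying only monitoring.timeSeries.create.
    gcloud iam roles create monitoringMetricWriter --project "$PROJECT" \
        --title "Monitoring Metric Writer" --permissions monitoring.timeSeries.create

    # 3. Secret with the same value as the AWS SSM parameter.
    echo -n "$DOCKER_PASSWORD" | \
        gcloud secrets create runners-apache-airflow-dockerPassword --data-file=-

    # 4. Role bindings for the runner service account.
    gcloud projects add-iam-policy-binding "$PROJECT" --member "serviceAccount:$SA" \
        --role "projects/$PROJECT/roles/monitoringMetricWriter"
    gcloud projects add-iam-policy-binding "$PROJECT" --member "serviceAccount:$SA" \
        --role roles/secretmanager.secretAccessor

    # 5. Public-read assets bucket.
    gsutil mb -p "$PROJECT" gs://airflow-ci-assets
    gsutil iam ch allUsers:objectViewer gs://airflow-ci-assets

    # 6. Upload the assets (repeat whenever they change).
    gsutil cp gcp/metrics/gcp_write_metrics_data.py requirements.txt \
        scripts/runner-supervisor.py gs://airflow-ci-assets/
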
diff --git a/gcp/metrics/gcp_create_metrics_descriptor.py b/gcp/metrics/gcp_create_metrics_descriptor.py
new file mode 100755
index 0000000..fda60be
--- /dev/null
+++ b/gcp/metrics/gcp_create_metrics_descriptor.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import click
+from google.api import label_pb2 as ga_label, metric_pb2 as ga_metric
+from google.cloud import monitoring_v3
+
+DEFAULT_PROJECT = 'apache-airflow-ci-cd'
+DEFAULT_ZONE = 'us-central1-a'
+CUSTOM_METRICS_TYPE = 'custom.googleapis.com/github-actions/jobs-running'
+
+
+@click.command()
+@click.option('--project', default=DEFAULT_PROJECT)
+def main(project: str):
+    client = monitoring_v3.MetricServiceClient()
+    project_name = f"projects/{project}"
+    descriptor = ga_metric.MetricDescriptor()
+    descriptor.display_name = "GitHub Actions jobs"
+    descriptor.type = CUSTOM_METRICS_TYPE
+    descriptor.metric_kind = ga_metric.MetricDescriptor.MetricKind.GAUGE
+    descriptor.value_type = ga_metric.MetricDescriptor.ValueType.INT64
+    descriptor.description = "Number of Jobs running for GitHub Actions."
+
+    label_instance_id = ga_label.LabelDescriptor()
+    label_instance_id.key = "instance_id"
+    label_instance_id.value_type = ga_label.LabelDescriptor.ValueType.STRING
+    label_instance_id.description = "The instance_id"
+    label_zone = ga_label.LabelDescriptor()
+    label_zone.key = "zone"
+    label_zone.value_type = ga_label.LabelDescriptor.ValueType.STRING
+    label_zone.description = "The zone"
+    descriptor.labels.append(label_instance_id)
+    descriptor.labels.append(label_zone)
+
+    descriptor = client.create_metric_descriptor(name=project_name, metric_descriptor=descriptor)
+    print(f"Created {descriptor.name}.")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/lambdas/scale_out_runner/requirements.txt b/gcp/metrics/gcp_delete_metrics_descriptor.py
old mode 100644
new mode 100755
similarity index 54%
copy from lambdas/scale_out_runner/requirements.txt
copy to gcp/metrics/gcp_delete_metrics_descriptor.py
index c5402f0..c7e6a0f
--- a/lambdas/scale_out_runner/requirements.txt
+++ b/gcp/metrics/gcp_delete_metrics_descriptor.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
@@ -15,5 +16,23 @@
 # specific language governing permissions and limitations
 # under the License.
 
-boto3
-chalice
+import click
+from google.cloud import monitoring_v3
+
+DEFAULT_PROJECT = 'apache-airflow-ci-cd'
+DEFAULT_ZONE = 'us-central1-a'
+CUSTOM_METRICS_TYPE = 'custom.googleapis.com/github-actions/jobs-running'
+
+
+@click.command()
+@click.option('--project', default=DEFAULT_PROJECT)
+@click.option('--zone', default=DEFAULT_ZONE)
+def main(project: str, zone: str):
+    client = monitoring_v3.MetricServiceClient()
+    descriptor_name = f"projects/{project}/metricDescriptors/{CUSTOM_METRICS_TYPE}"
+    client.delete_metric_descriptor(name=descriptor_name)
+    print(f"Deleted metric descriptor {descriptor_name}.")
+
+
+if __name__ == '__main__':
+    main()
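
A hedged usage sketch for the two descriptor scripts above: create registers the
custom metric once per project, delete tears it down (note the delete script
accepts a --zone option its body never uses). Deleting a custom metric
descriptor also deletes the time series recorded against it.

    # One-time setup: register the custom metric descriptor.
    ./gcp/metrics/gcp_create_metrics_descriptor.py --project apache-airflow-ci-cd

    # Tear-down: remove the descriptor and its recorded series.
    ./gcp/metrics/gcp_delete_metrics_descriptor.py --project apache-airflow-ci-cd
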
diff --git a/gcp/metrics/gcp_write_metrics_data.py b/gcp/metrics/gcp_write_metrics_data.py
new file mode 100755
index 0000000..56020f6
--- /dev/null
+++ b/gcp/metrics/gcp_write_metrics_data.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import time
+
+import click
+import requests
+from google.cloud import monitoring_v3
+
+DEFAULT_PROJECT = 'apache-airflow-ci-cd'
+DEFAULT_ZONE = 'us-central1-a'
+CUSTOM_METRICS_TYPE = 'custom.googleapis.com/github-actions/jobs-running'
+
+
+@click.command()
+@click.option('--project', default=DEFAULT_PROJECT)
+@click.option('--instance')
+@click.option('--value', type=int, default=1)
+def main(project: str, instance: str, value):
+    client = monitoring_v3.MetricServiceClient()
+    project_name = f"projects/{project}"
+
+    if not instance:
+        instance = requests.get(
+            "http://metadata/computeMetadata/v1/instance/id", headers={'Metadata-Flavor': 'Google'}
+        ).text
+    series = monitoring_v3.TimeSeries()
+    series.metric.type = CUSTOM_METRICS_TYPE
+    series.resource.type = "gce_instance"
+    series.resource.labels["instance_id"] = instance
+    series.resource.labels["zone"] = DEFAULT_ZONE
+    now = time.time()
+    seconds = int(now)
+    nanos = int((now - seconds) * 10 ** 9)
+    interval = monitoring_v3.TimeInterval({"end_time": {"seconds": seconds, "nanos": nanos}})
+    point = monitoring_v3.Point({"interval": interval, "value": {"int64_value": value}})
+    series.points = [point]
+    client.create_time_series(name=project_name, time_series=[series])
+    print(f"Reported {CUSTOM_METRICS_TYPE} with value {value}")
+
+
+if __name__ == '__main__':
+    main()
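
A smoke-test invocation of the writer on a GCE runner, matching the cron entry
in cloud-init.yml above; the venv is built by the cloud-init step, and
--instance may be omitted on GCE, where the script falls back to the instance
id from the metadata server (the http://metadata hostname only resolves inside
GCE).

    # Report a gauge value of 1 for this instance (instance id auto-detected).
    /opt/gcp-metrics-writer/bin/python \
        /opt/gcp-metrics-writer/bin/gcp_write_metrics_data --value 1
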
diff --git a/lambdas/scale_out_runner/app.py b/lambdas/scale_out_runner/app.py
index 2f40fb1..e011794 100644
--- a/lambdas/scale_out_runner/app.py
+++ b/lambdas/scale_out_runner/app.py
@@ -29,7 +29,8 @@ from chalice.app import Request
 app = Chalice(app_name='scale_out_runner')
 app.log.setLevel(logging.INFO)
 
-ASG_GROUP_NAME = os.getenv('ASG_NAME', 'AshbRunnerASG')
+AWS_ASG_GROUP_NAME = os.getenv('AWS_ASG_NAME', 'AshbRunnerASG')
+GCP_ASG_GROUP_NAME = os.getenv('GCP_ASG_NAME', 'AshbRunnerASG')
 TABLE_NAME = os.getenv('COUNTER_TABLE', 'GithubRunnerQueue')
 _commiters = set()
 GH_WEBHOOK_TOKEN = None
@@ -180,7 +181,7 @@ def scale_asg_if_needed(num_queued_jobs: int) -> dict:
     asg = boto3.client('autoscaling')
 
     resp = asg.describe_auto_scaling_groups(
-        AutoScalingGroupNames=[ASG_GROUP_NAME],
+        AutoScalingGroupNames=[AWS_ASG_GROUP_NAME],
     )
 
     asg_info = resp['AutoScalingGroups'][0]
@@ -199,7 +200,7 @@ def scale_asg_if_needed(num_queued_jobs: int) -> dict:
         if new_size <= max_size or current < max_size:
             try:
                 new_size = min(new_size, max_size)
-                asg.set_desired_capacity(AutoScalingGroupName=ASG_GROUP_NAME, DesiredCapacity=new_size)
+                asg.set_desired_capacity(AutoScalingGroupName=AWS_ASG_GROUP_NAME, DesiredCapacity=new_size)
                 return {'new_capacity': new_size}
             except asg.exceptions.ScalingActivityInProgressFault as e:
                 return {'error': str(e)}
diff --git a/lambdas/scale_out_runner/requirements.txt b/lambdas/scale_out_runner/requirements.txt
index c5402f0..ac067a9 100644
--- a/lambdas/scale_out_runner/requirements.txt
+++ b/lambdas/scale_out_runner/requirements.txt
@@ -17,3 +17,4 @@
 
 boto3
 chalice
+google-cloud-compute
diff --git a/requirements.txt b/requirements.txt
index 69d30f2..b8b2e0b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,6 +18,8 @@
 boto3
 click~=7.1
 chalice
+google-cloud-monitoring
+google-cloud-compute
 pytest~=6.0
 python-dynamodb-lock
 psutil
diff --git a/scripts/runner-supervisor.py b/scripts/runner-supervisor.py
index d3d0e0c..60fb5aa 100755
--- a/scripts/runner-supervisor.py
+++ b/scripts/runner-supervisor.py
@@ -137,7 +137,7 @@ def main(repo, output_folder, user):
 
     # Just keep trying until we get some credentials.
     while True:
-        # Have each runner try to get a credential in a random order.
+        # Have each runner try to get a credential in a random order.
         possibles = get_possible_credentials(repo)
         random.shuffle(possibles)