Posted to commits@airflow.apache.org by po...@apache.org on 2021/05/02 16:55:45 UTC
[airflow-ci-infra] 01/01: Cloud init is adapted to run on a GCE instance
This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch add-gcp-cloud-init
in repository https://gitbox.apache.org/repos/asf/airflow-ci-infra.git
commit e20c7b07d10d012f12ea974e3c5656fabf85005a
Author: Jarek Potiuk <ja...@potiuk.com>
AuthorDate: Sun May 2 18:54:44 2021 +0200
Cloud init is adapted to run on a GCE instance
---
 cloud-init.yml                                | 66 +++++++++++++++++-----
 gcp/README.md                                 | 45 +++++++++++++++
 gcp/metrics/gcp_create_metrics_descriptor.py  | 56 ++++++++++++++++++
 .../metrics/gcp_delete_metrics_descriptor.py  | 23 +++++++-
 gcp/metrics/gcp_write_metrics_data.py         | 57 +++++++++++++++++++
 lambdas/scale_out_runner/app.py               |  7 ++-
 lambdas/scale_out_runner/requirements.txt     |  1 +
 requirements.txt                              |  2 +
 scripts/runner-supervisor.py                  |  2 +-
 9 files changed, 240 insertions(+), 19 deletions(-)
diff --git a/cloud-init.yml b/cloud-init.yml
index 05d71d5..af14d2c 100644
--- a/cloud-init.yml
+++ b/cloud-init.yml
@@ -39,14 +39,21 @@ runcmd:
- -c
- |
set -eu -o pipefail
- echo "AWS_DEFAULT_REGION=$(cloud-init query region)" >> /etc/environment
+ if [[ $(cloud-init query cloud_name) == "aws" ]]; then
+ echo "AWS_DEFAULT_REGION=$(cloud-init query region)" >> /etc/environment
+ fi
+ if [[ $(cloud-init query cloud_name) == "gce" ]]; then
+ echo "GCP_DEFAULT_REGION=$(cloud-init query region)" >> /etc/environment
+ fi
# Set an env var (that is visible in runners) that will let us know we are on a self-hosted runner
echo 'AIRFLOW_SELF_HOSTED_RUNNER="[\"self-hosted\"]"' >> /etc/environment
set -a
. /etc/environment
set +a
- echo "ASG_GROUP_NAME=$(aws ec2 describe-tags --filter Name=resource-id,Values=$(cloud-init query instance_id) Name=key,Values=aws:autoscaling:groupName \
- | jq -r '@sh "\(.Tags[0].Value)"')" >> /etc/environment
+ if [[ $(cloud-init query cloud_name) == "aws" ]]; then
+ echo "ASG_GROUP_NAME=$(aws ec2 describe-tags --filter Name=resource-id,Values=$(cloud-init query instance_id) Name=key,Values=aws:autoscaling:groupName \
+ | jq -r '@sh "\(.Tags[0].Value)"')" >> /etc/environment
+ fi
- [systemctl, daemon-reload]
-
- bash
@@ -75,10 +82,27 @@ runcmd:
. /etc/environment
set +a
- aws s3 cp s3://airflow-ci-assets/runner-supervisor.py /opt/runner-supervisor/bin/runner-supervisor
+ if [[ $(cloud-init query cloud_name) == "aws" ]]; then
+ aws s3 cp s3://airflow-ci-assets/runner-supervisor.py /opt/runner-supervisor/bin/runner-supervisor
+ fi
+ if [[ $(cloud-init query cloud_name) == "gce" ]]; then
+ gsutil cp gs://airflow-ci-assets/runner-supervisor.py /opt/runner-supervisor/bin/runner-supervisor
+ fi
chmod 755 /opt/runner-supervisor/bin/runner-supervisor
-
- 2.277.1-airflow3
+ -
+ - bash
+ - -c
+ - |
+ set -eu -o pipefail
+ if [[ $(cloud-init query cloud_name) == "gce" ]]; then
+ gsutil cp gs://airflow-ci-assets/requirements.txt /opt/requirements.txt
+ python3 -mvenv /opt/gcp-metrics-writer
+ /opt/gcp-metrics-writer/bin/pip install -r /opt/requirements.txt
+ gsutil cp gs://airflow-ci-assets/gcp_write_metrics_data.py /opt/gcp-metrics-writer/bin/gcp_write_metrics_data
+ chmod a+x /opt/gcp-metrics-writer/bin/gcp_write_metrics_data
+ fi
+
- [systemctl, enable, --now, iptables.service]
# Restart docker after applying the user firewall -- else some rules/chains might be lost!
- [systemctl, restart, docker.service]
@@ -120,9 +144,15 @@ write_files:
docker ps -qa | xargs --verbose --no-run-if-empty docker rm -fv
echo "Log in to a paid docker user to get unlimited docker pulls"
- aws ssm get-parameter --with-decryption --name /runners/apache/airflow/dockerPassword | \
- jq .Parameter.Value -r | \
- sudo -u runner docker login --username airflowcirunners --password-stdin
+ if [[ $(cloud-init query cloud_name) == "aws" ]]; then
+ aws ssm get-parameter --with-decryption --name /runners/apache/airflow/dockerPassword | \
+ jq .Parameter.Value -r | \
+ sudo -u runner docker login --username airflowcirunners --password-stdin
+ fi
+ if [[ $(cloud-init query cloud_name) == "gce" ]]; then
+ gcloud secrets versions access latest --secret "runners-apache-airflow-dockerPassword" | \
+ sudo -u runner docker login --username airflowcirunners --password-stdin
+ fi
if [[ -d ~runner/actions-runner/_work/airflow/airflow ]]; then
cd ~runner/actions-runner/_work/airflow/airflow
@@ -134,7 +164,8 @@ write_files:
git submodule deinit --all -f && \
git submodule foreach git clean -fxd && \
git clean -fxd \
- "
+
+
fi
fi
owner: root:root
@@ -176,7 +207,11 @@ write_files:
runner ALL=(ALL) NOPASSWD:/usr/sbin/swapoff -a, /usr/bin/rm -f /swapfile, /usr/bin/apt clean
- path: /etc/iptables/rules.v4
content: |
+ #
# Generated by iptables-save v1.8.4 on Thu Jan 14 13:59:27 2021
+ # The Metadata server IP address is the same for AWS and GCP: 169.254.169.254
+ # Which is pretty cool.
+ #
*filter
:INPUT ACCEPT [833:75929]
:FORWARD DROP [0:0]
@@ -188,18 +223,23 @@ write_files:
-A DOCKER-USER -j RETURN
COMMIT
- - path: /usr/local/sbin/actions-runner-ec2-reporting
+ - path: /usr/local/sbin/actions-runner-reporting
permissions: '0775'
content: |
#!/bin/bash
set -eu -o pipefail
if pgrep -c Runner.Worker >/dev/null; then
# Only report metric when we're doing something -- no point paying to submit zeros
- aws cloudwatch put-metric-data --metric-name jobs-running --value "$(pgrep -c Runner.Worker)" --namespace github.actions
+ if [[ $(cloud-init query cloud_name) == "aws" ]]; then
+ aws cloudwatch put-metric-data --metric-name jobs-running --value "$(pgrep -c Runner.Worker)" --namespace github.actions
+ fi
+ if [[ $(cloud-init query cloud_name) == "gce" ]]; then
+ /opt/gcp-metrics-writer/bin/python /opt/gcp-metrics-writer/bin/gcp_write_metrics_data --value "$(pgrep -c Runner.Worker)"
+ fi
fi
- - path: /etc/cron.d/cloudwatch-metrics-github-runners
+ - path: /etc/cron.d/metrics-github-runners
content: |
- */1 * * * * nobody /usr/local/sbin/actions-runner-ec2-reporting
+ */1 * * * * nobody /usr/local/sbin/actions-runner-reporting
- path: /etc/systemd/system/actions.runner-supervisor.service
content: |
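All of the cloud branching above keys off "cloud-init query cloud_name", which reads the standardized instance data that cloud-init collects at boot. For anyone verifying the branches by hand, a minimal check (assuming a cloud-init release new enough to expose the standardized cloud_name key):

    # Prints "aws" on EC2 and "gce" on Google Compute Engine
    cloud-init query cloud_name

    # Dump all collected instance data when debugging the runcmd branches
    cloud-init query --all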
diff --git a/gcp/README.md b/gcp/README.md
new file mode 100644
index 0000000..ca74299
--- /dev/null
+++ b/gcp/README.md
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Notes from setting up the GCP version of the Airflow CI runner
+
+These are notes taken while setting up the GCP version of the Runner.
+
+1. Created a new service account without any permissions:
+
+airflow-ci-runner@apache-airflow-ci-cd.iam.gserviceaccount.com
+
+2. Created a custom role with these permissions:
+
+* Monitoring Metric Writer
+ * monitoring.timeSeries.create
+
+3. Created `runners-apache-airflow-dockerPassword` secret with the same value as in AWS.
+
+4. Assigned roles to the "airflow-ci-runner" service account:
+
+* Monitoring Metric Writer
+* Secret Manager Secret Accessor
+
+
+5. Created `airflow-ci-assets` GCS bucket with "public read" permissions
+
+6. Copied the following files there (they need to be re-copied every time they change):
+ * gcp_write_metrics_data.py
+ * get-runner-creds.py
+ * requirements.txt
+ * runner-supervisor.py
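For reference, the numbered steps in this README map onto standard gcloud/gsutil invocations. This is a hedged sketch only -- names are taken from the notes above, and exact flags may differ between gcloud versions:

    # 1. Service account with no permissions of its own
    gcloud iam service-accounts create airflow-ci-runner --project=apache-airflow-ci-cd

    # 2. Custom role carrying only monitoring.timeSeries.create
    gcloud iam roles create monitoringMetricWriter --project=apache-airflow-ci-cd \
        --title="Monitoring Metric Writer" --permissions=monitoring.timeSeries.create

    # 3. Secret holding the docker password (value supplied on stdin)
    gcloud secrets create runners-apache-airflow-dockerPassword \
        --project=apache-airflow-ci-cd --replication-policy=automatic
    gcloud secrets versions add runners-apache-airflow-dockerPassword --data-file=-

    # 4. Role bindings for the runner service account
    gcloud projects add-iam-policy-binding apache-airflow-ci-cd \
        --member=serviceAccount:airflow-ci-runner@apache-airflow-ci-cd.iam.gserviceaccount.com \
        --role=projects/apache-airflow-ci-cd/roles/monitoringMetricWriter
    gcloud projects add-iam-policy-binding apache-airflow-ci-cd \
        --member=serviceAccount:airflow-ci-runner@apache-airflow-ci-cd.iam.gserviceaccount.com \
        --role=roles/secretmanager.secretAccessor

    # 5. Publicly readable assets bucket
    gsutil mb -p apache-airflow-ci-cd gs://airflow-ci-assets
    gsutil iam ch allUsers:objectViewer gs://airflow-ci-assets

    # 6. Copy the assets (repeat after every change); get-runner-creds.py is
    # listed in the notes but its location is not part of this commit
    gsutil cp gcp/metrics/gcp_write_metrics_data.py requirements.txt \
        scripts/runner-supervisor.py gs://airflow-ci-assets/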
diff --git a/gcp/metrics/gcp_create_metrics_descriptor.py b/gcp/metrics/gcp_create_metrics_descriptor.py
new file mode 100755
index 0000000..fda60be
--- /dev/null
+++ b/gcp/metrics/gcp_create_metrics_descriptor.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import click
+from google.api import label_pb2 as ga_label, metric_pb2 as ga_metric
+from google.cloud import monitoring_v3
+
+DEFAULT_PROJECT = 'apache-airflow-ci-cd'
+DEFAULT_ZONE = 'us-central1-a'
+CUSTOM_METRICS_TYPE = 'custom.googleapis.com/github-actions/jobs-running'
+
+
+@click.command()
+@click.option('--project', default=DEFAULT_PROJECT)
+def main(project: str):
+ client = monitoring_v3.MetricServiceClient()
+ project_name = f"projects/{project}"
+ descriptor = ga_metric.MetricDescriptor()
+ descriptor.display_name = "GitHub Actions jobs"
+ descriptor.type = CUSTOM_METRICS_TYPE
+ descriptor.metric_kind = ga_metric.MetricDescriptor.MetricKind.GAUGE
+ descriptor.value_type = ga_metric.MetricDescriptor.ValueType.INT64
+ descriptor.description = "Number of Jobs running for GitHub Actions."
+
+ label_instance_id = ga_label.LabelDescriptor()
+ label_instance_id.key = "instance_id"
+ label_instance_id.value_type = ga_label.LabelDescriptor.ValueType.STRING
+ label_instance_id.description = "The instance_id"
+ label_zone = ga_label.LabelDescriptor()
+ label_zone.key = "zone"
+ label_zone.value_type = ga_label.LabelDescriptor.ValueType.STRING
+ label_zone.description = "The zone"
+ descriptor.labels.append(label_instance_id)
+ descriptor.labels.append(label_zone)
+
+ descriptor = client.create_metric_descriptor(name=project_name, metric_descriptor=descriptor)
+ print(f"Created {descriptor.name}.")
+
+
+if __name__ == '__main__':
+ main()
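The descriptor scripts are one-off administrative tools rather than something the runners execute; a likely invocation, from a workstation with application-default credentials (--project already defaults to apache-airflow-ci-cd):

    # Create the custom metric descriptor once per project
    python3 gcp/metrics/gcp_create_metrics_descriptor.py --project apache-airflow-ci-cd

    # Drop it again while iterating on labels or types (script shown below)
    python3 gcp/metrics/gcp_delete_metrics_descriptor.py --project apache-airflow-ci-cd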
diff --git a/lambdas/scale_out_runner/requirements.txt b/gcp/metrics/gcp_delete_metrics_descriptor.py
old mode 100644
new mode 100755
similarity index 54%
copy from lambdas/scale_out_runner/requirements.txt
copy to gcp/metrics/gcp_delete_metrics_descriptor.py
index c5402f0..c7e6a0f
--- a/lambdas/scale_out_runner/requirements.txt
+++ b/gcp/metrics/gcp_delete_metrics_descriptor.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@@ -15,5 +16,23 @@
# specific language governing permissions and limitations
# under the License.
-boto3
-chalice
+import click
+from google.cloud import monitoring_v3
+
+DEFAULT_PROJECT = 'apache-airflow-ci-cd'
+DEFAULT_ZONE = 'us-central1-a'
+CUSTOM_METRICS_TYPE = 'custom.googleapis.com/github-actions/jobs-running'
+
+
+@click.command()
+@click.option('--project', default=DEFAULT_PROJECT)
+@click.option('--zone', default=DEFAULT_ZONE)
+def main(project: str, zone: str):
+ client = monitoring_v3.MetricServiceClient()
+ descriptor_name = f"projects/{project}/metricDescriptors/{CUSTOM_METRICS_TYPE}"
+ client.delete_metric_descriptor(name=descriptor_name)
+ print(f"Deleted metric descriptor {descriptor_name}.")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/gcp/metrics/gcp_write_metrics_data.py b/gcp/metrics/gcp_write_metrics_data.py
new file mode 100755
index 0000000..56020f6
--- /dev/null
+++ b/gcp/metrics/gcp_write_metrics_data.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import time
+
+import click
+import requests
+from google.cloud import monitoring_v3
+
+DEFAULT_PROJECT = 'apache-airflow-ci-cd'
+DEFAULT_ZONE = 'us-central1-a'
+CUSTOM_METRICS_TYPE = 'custom.googleapis.com/github-actions/jobs-running'
+
+
+@click.command()
+@click.option('--project', default=DEFAULT_PROJECT)
+@click.option('--instance')
+@click.option('--value', type=int, default=1)
+def main(project: str, instance: str, value):
+ client = monitoring_v3.MetricServiceClient()
+ project_name = f"projects/{project}"
+
+ if not instance:
+ instance = requests.get(
+ "http://metadata/computeMetadata/v1/instance/id", headers={'Metadata-Flavor': 'Google'}
+ ).text
+ series = monitoring_v3.TimeSeries()
+ series.metric.type = CUSTOM_METRICS_TYPE
+ series.resource.type = "gce_instance"
+ series.resource.labels["instance_id"] = instance
+ series.resource.labels["zone"] = DEFAULT_ZONE
+ now = time.time()
+ seconds = int(now)
+ nanos = int((now - seconds) * 10 ** 9)
+ interval = monitoring_v3.TimeInterval({"end_time": {"seconds": seconds, "nanos": nanos}})
+ point = monitoring_v3.Point({"interval": interval, "value": {"int64_value": value}})
+ series.points = [point]
+ client.create_time_series(name=project_name, time_series=[series])
+ print(f"Reported {CUSTOM_METRICS_TYPE} with value {value}")
+
+
+if __name__ == '__main__':
+ main()
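On the runners this script is invoked by the actions-runner-reporting cron wrapper from cloud-init.yml; run by hand it looks like the following (when --instance is omitted, the script asks the GCE metadata server for the instance id):

    # Report three running jobs for the current instance
    /opt/gcp-metrics-writer/bin/python /opt/gcp-metrics-writer/bin/gcp_write_metrics_data --value 3

    # The same metadata endpoint the script queries; the bare "metadata" host
    # resolves to metadata.google.internal on GCE
    curl -s -H 'Metadata-Flavor: Google' http://metadata.google.internal/computeMetadata/v1/instance/id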
diff --git a/lambdas/scale_out_runner/app.py b/lambdas/scale_out_runner/app.py
index 2f40fb1..e011794 100644
--- a/lambdas/scale_out_runner/app.py
+++ b/lambdas/scale_out_runner/app.py
@@ -29,7 +29,8 @@ from chalice.app import Request
app = Chalice(app_name='scale_out_runner')
app.log.setLevel(logging.INFO)
-ASG_GROUP_NAME = os.getenv('ASG_NAME', 'AshbRunnerASG')
+AWS_ASG_GROUP_NAME = os.getenv('AWS_ASG_NAME', 'AshbRunnerASG')
+GCP_ASG_GROUP_NAME = os.getenv('GCP_ASG_NAME', 'AshbRunnerASG')
TABLE_NAME = os.getenv('COUNTER_TABLE', 'GithubRunnerQueue')
_commiters = set()
GH_WEBHOOK_TOKEN = None
@@ -180,7 +181,7 @@ def scale_asg_if_needed(num_queued_jobs: int) -> dict:
asg = boto3.client('autoscaling')
resp = asg.describe_auto_scaling_groups(
- AutoScalingGroupNames=[ASG_GROUP_NAME],
+ AutoScalingGroupNames=[AWS_ASG_GROUP_NAME],
)
asg_info = resp['AutoScalingGroups'][0]
@@ -199,7 +200,7 @@ def scale_asg_if_needed(num_queued_jobs: int) -> dict:
if new_size <= max_size or current < max_size:
try:
new_size = min(new_size, max_size)
- asg.set_desired_capacity(AutoScalingGroupName=ASG_GROUP_NAME, DesiredCapacity=new_size)
+ asg.set_desired_capacity(AutoScalingGroupName=AWS_ASG_GROUP_NAME, DesiredCapacity=new_size)
return {'new_capcity': new_size}
except asg.exceptions.ScalingActivityInProgressFault as e:
return {'error': str(e)}
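Note that scale_asg_if_needed still only drives the AWS autoscaling group: GCP_ASG_GROUP_NAME is defined but unused so far, with google-cloud-compute added to the requirements below. The GCE counterpart of set_desired_capacity would be a managed-instance-group resize; a hedged sketch of that operation, with the group name, zone, and size as assumptions rather than values from this commit:

    # Resize a GCE managed instance group to the desired capacity
    gcloud compute instance-groups managed resize airflow-ci-runners \
        --project=apache-airflow-ci-cd --zone=us-central1-a --size=5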
diff --git a/lambdas/scale_out_runner/requirements.txt b/lambdas/scale_out_runner/requirements.txt
index c5402f0..ac067a9 100644
--- a/lambdas/scale_out_runner/requirements.txt
+++ b/lambdas/scale_out_runner/requirements.txt
@@ -17,3 +17,4 @@
boto3
chalice
+google-cloud-compute
diff --git a/requirements.txt b/requirements.txt
index 69d30f2..b8b2e0b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,6 +18,8 @@
boto3
click~=7.1
chalice
+google-cloud-monitoring
+google-cloud-compute
pytest~=6.0
python-dynamodb-lock
psutil
diff --git a/scripts/runner-supervisor.py b/scripts/runner-supervisor.py
index d3d0e0c..60fb5aa 100755
--- a/scripts/runner-supervisor.py
+++ b/scripts/runner-supervisor.py
@@ -137,7 +137,7 @@ def main(repo, output_folder, user):
# Just keep trying until we get some credentials.
while True:
- # Have each runner try to get a credential in a random order.
+ # Have each runner try to get a credential in a random order.
possibles = get_possible_credentials(repo)
random.shuffle(possibles)