You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by po...@apache.org on 2022/06/29 08:48:45 UTC
[airflow] 38/39: Script to filter candidates for PR of the month based on heuristics (#24654)
This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch v2-3-test
in repository https://gitbox.apache.org/repos/asf/airflow.git
commit 2905ede084229c4b8675cfcd5d11a82cad1d2739
Author: Jarek Potiuk <ja...@polidea.com>
AuthorDate: Tue Jun 28 23:30:59 2022 +0200
Script to filter candidates for PR of the month based on heuristics (#24654)
This script proposes top candidates for PR of the month
based on simple heuristics as discussed in the document
https://docs.google.com/document/d/1qO5FztgzJLccfvbagX8DLh1EwhFVD2nUqbw96fRJmQQ/edit?disco=AAAAZ-Ct0Bs&usp_dm=true
(cherry picked from commit 0e1a6b98079814747205e3320e43e11f8e2ef3d4)
---
dev/requirements.txt | 2 +
dev/stats/get_important_pr_candidates.py | 180 +++++++++++++++++++++++++++++++
2 files changed, 182 insertions(+)
diff --git a/dev/requirements.txt b/dev/requirements.txt
index e83bdbe216..31a4ad8405 100644
--- a/dev/requirements.txt
+++ b/dev/requirements.txt
@@ -1,9 +1,11 @@
click>=8.0
+cached_property;python_version<"3.8"
jinja2>=2.10
keyring==10.1
PyGithub
jsonpath_ng
jsonschema
+pendulum
pyyaml
packaging
rich
diff --git a/dev/stats/get_important_pr_candidates.py b/dev/stats/get_important_pr_candidates.py
new file mode 100755
index 0000000000..059742fc19
--- /dev/null
+++ b/dev/stats/get_important_pr_candidates.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import logging
+import math
+import sys
+import textwrap
+from datetime import datetime
+from typing import List, Set
+
+import pendulum
+import rich_click as click
+from github import Github
+from github.PullRequest import PullRequest
+from rich.console import Console
+
+if sys.version_info >= (3, 8):
+ from functools import cached_property
+else:
+ from cached_property import cached_property
+
+# Name of the GitHub label for provider-related PRs.
+# NOTE(review): not referenced in the visible part of this script;
+# PrStat.label_score matches the substring "provider" in label names instead.
+PROVIDER_LABEL = "area:providers"
+
+# Module-level logger.
+logger = logging.getLogger(__name__)
+
+# Shared Rich console that all output of this script is printed through.
+# Presumably the large width keeps long PR titles/URLs on one line - TODO confirm.
+console = Console(width=400, color_system="standard")
+
+option_github_token = click.option(
+ "--github-token",
+ type=str,
+ required=True,
+ help=textwrap.dedent(
+ """
+ GitHub token used to authenticate.
+ You can set omit it if you have GITHUB_TOKEN env variable set
+ Can be generated with:
+ https://github.com/settings/tokens/new?description=Read%20issues&scopes=repo:status"""
+ ),
+ envvar='GITHUB_TOKEN',
+)
+
+# Multipliers returned by PrStat.label_score: PROVIDER_SCORE when any label
+# name of the PR contains "provider", REGULAR_SCORE otherwise.
+PROVIDER_SCORE = 0.5
+REGULAR_SCORE = 1.0
+
+# Weights added by PrStat.num_interactions for every review, every comment
+# and every reaction on a comment, respectively.
+REVIEW_INTERACTION_VALUE = 1.0
+COMMENT_INTERACTION_VALUE = 1.0
+REACTION_INTERACTION_VALUE = 0.1
+
+
+class PrStat:
+ def __init__(self, pull_request: PullRequest):
+ self.pull_request = pull_request
+ self._users: Set[str] = set()
+
+ @cached_property
+ def label_score(self) -> float:
+ for label in self.pull_request.labels:
+ if "provider" in label.name:
+ return PROVIDER_SCORE
+ return REGULAR_SCORE
+
+ @cached_property
+ def num_interactions(self) -> float:
+ interactions = 0.0
+ for comment in self.pull_request.get_comments():
+ interactions += COMMENT_INTERACTION_VALUE
+ self._users.add(comment.user.login)
+ for _ in comment.get_reactions():
+ interactions += REACTION_INTERACTION_VALUE
+ for review in self.pull_request.get_reviews():
+ interactions += REVIEW_INTERACTION_VALUE
+ self._users.add(review.user.login)
+ return interactions
+
+ @cached_property
+ def num_interacting_users(self) -> int:
+ _ = self.num_interactions # make sure the _users set is populated
+ return len(self._users)
+
+ @cached_property
+ def num_changed_files(self) -> float:
+ return self.pull_request.changed_files
+
+ @cached_property
+ def score(self):
+ return (
+ 1.0
+ * self.num_interactions
+ * self.label_score
+ * self.num_interacting_users
+ / (math.log10(self.num_changed_files) if self.num_changed_files > 10 else 1.0)
+ )
+
+ def __str__(self) -> str:
+ return (
+ f"Score: {self.score:.2f}: PR{self.pull_request.number} by @{self.pull_request.user.login}: "
+ f"`{self.pull_request.title}. "
+ f"Merged at {self.pull_request.merged_at}: {self.pull_request.html_url}"
+ )
+
+
+# Number of days subtracted from "now" before picking the default month.
+DAYS_BACK = 5
+# Current (or previous during first few days of the next month)
+DEFAULT_BEGINNING_OF_MONTH = pendulum.now().subtract(days=DAYS_BACK).start_of('month')
+# One day past the end of that month; only its ``.date()`` is used, as the
+# default value of ``--date-end``.
+DEFAULT_END_OF_MONTH = DEFAULT_BEGINNING_OF_MONTH.end_of('month').add(days=1)
+
+# Value the running PR counter in ``main`` is compared against to stop the scan.
+MAX_PR_CANDIDATES = 500
+# Default for ``--top-number``: how many of the best-scored PRs are printed.
+DEFAULT_TOP_PRS = 10
+
+
+@click.command()
+@click.option(
+ '--date-start', type=click.DateTime(formats=["%Y-%m-%d"]), default=str(DEFAULT_BEGINNING_OF_MONTH.date())
+)
+@click.option(
+ '--date-end', type=click.DateTime(formats=["%Y-%m-%d"]), default=str(DEFAULT_END_OF_MONTH.date())
+)
+@click.option('--top-number', type=int, default=DEFAULT_TOP_PRS)
+@click.option('--verbose', is_flag="True", help="Print scoring details")
+@option_github_token
+def main(github_token: str, date_start: datetime, date_end: datetime, top_number: int, verbose: bool):
+ console.print(f"Finding best candidate PRs between {date_start} and {date_end}")
+ g = Github(github_token)
+ repo = g.get_repo("apache/airflow")
+ pulls = repo.get_pulls(state="closed", sort="created", direction='desc')
+ issue_num = 0
+ selected_prs: List[PrStat] = []
+ for pr in pulls:
+ issue_num += 1
+ if not pr.merged:
+ continue
+ if not (date_start < pr.merged_at < date_end):
+ console.print(
+ f"[bright_blue]Skipping {pr.number} {pr.title} as it was not "
+ f"merged between {date_start} and {date_end}]"
+ )
+ continue
+ if pr.created_at < date_start:
+ console.print("[bright_blue]Completed selecting candidates")
+ break
+ pr_stat = PrStat(pull_request=pr) # type: ignore
+ console.print(
+ f"[green]Selecting PR: #{pr.number} `{pr.title}` as candidate."
+ f"Score: {pr_stat.score}[/]."
+ f" Url: {pr.html_url}"
+ )
+ if verbose:
+ console.print(
+ f'[bright_blue]Created at: {pr.created_at}, Merged at: {pr.merged_at}, '
+ f'Overall score: {pr_stat.score:.2f}, '
+ f'Label score: {pr_stat.label_score}, '
+ f'Interactions: {pr_stat.num_interactions}, '
+ f'Users interacting: {pr_stat.num_interacting_users}, '
+ f'Changed files: {pr_stat.num_changed_files}\n'
+ )
+ selected_prs.append(pr_stat)
+ if issue_num == MAX_PR_CANDIDATES:
+ console.print(f'[red]Reached {MAX_PR_CANDIDATES}. Stopping')
+ break
+ console.print(f"Top {top_number} PRs:")
+ for pr_stat in sorted(selected_prs, key=lambda s: -s.score)[:top_number]:
+ console.print(f" * {pr_stat}")
+
+
+# Run the click command only when the file is executed as a script.
+if __name__ == "__main__":
+ main()