You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2022/05/31 21:01:08 UTC

[arrow] branch master updated: ARROW-16602: [Dev] Use GitHub API to merge pull request (#13184)

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 01e4ad095a ARROW-16602: [Dev] Use GitHub API to merge pull request (#13184)
01e4ad095a is described below

commit 01e4ad095a7649afc7a7316447bc5cc1fd9679a2
Author: Sutou Kouhei <ko...@clear-code.com>
AuthorDate: Wed Jun 1 06:00:59 2022 +0900

    ARROW-16602: [Dev] Use GitHub API to merge pull request (#13184)
    
    We use local "git merge" to merge a pull request in
    dev/merge_arrow_pr.py.
    
    If we use "git merge" to merge a pull request, GitHub's Web UI shows
    "Closed" mark not "Merged" mark in a pull request page. This sometimes
    confuses new contributors. "Why was my pull request closed without
    merging?" See
    https://github.com/apache/arrow/pull/12004#issuecomment-1031619771 for
    example.
    
    If we use GitHub API
    https://docs.github.com/en/rest/pulls/pulls#merge-a-pull-request to
    merge a pull request, GitHub's Web UI shows "Merged" mark not "Closed"
    mark. See https://github.com/apache/arrow/pull/13180 for example. I
    used GitHub API to merge the pull request.
    
    And we don't need to create a local branch on local repository to
    merge a pull request. But we must specify ARROW_GITHUB_API_TOKEN to
    run dev/merge_arrow_pr.py.
    
    Authored-by: Sutou Kouhei <ko...@clear-code.com>
    Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
 dev/README.md                  |   6 +-
 dev/merge.conf.sample          |   4 +
 dev/merge_arrow_pr.py          | 202 ++++++++++++++++++-----------------------
 dev/release/post-04-website.sh |   2 +-
 dev/test_merge_arrow_pr.py     |  18 ++--
 5 files changed, 104 insertions(+), 128 deletions(-)

diff --git a/dev/README.md b/dev/README.md
index d861944e1e..267bf008ac 100644
--- a/dev/README.md
+++ b/dev/README.md
@@ -53,9 +53,9 @@ After installed, it runs the merge script.
 have to install Python dependencies yourself and then run `dev/merge_arrow_pr.py`
 directly)
 
-The merge script uses the GitHub REST API; if you encounter rate limit issues,
-you may set a `ARROW_GITHUB_API_TOKEN` environment variable to use a Personal
-Access Token.
+The merge script uses the GitHub REST API. You must set a
+`ARROW_GITHUB_API_TOKEN` environment variable to use a Personal Access
+Token. You need to add `workflow` scope to the Personal Access Token.
 
 You can specify the username and the password of your JIRA account in
 `APACHE_JIRA_USERNAME` and `APACHE_JIRA_PASSWORD` environment variables.
diff --git a/dev/merge.conf.sample b/dev/merge.conf.sample
index c71b211614..9c0aa41485 100644
--- a/dev/merge.conf.sample
+++ b/dev/merge.conf.sample
@@ -23,3 +23,7 @@
 # token credentials. Ensure that the file is properly protected.
 username=johnsmith
 password=123456
+
+[github]
+# GitHub's personal access token. "workflow" scope is needed.
+api_token=ghp_ABC
diff --git a/dev/merge_arrow_pr.py b/dev/merge_arrow_pr.py
index 012a2ac6e7..117bdda562 100755
--- a/dev/merge_arrow_pr.py
+++ b/dev/merge_arrow_pr.py
@@ -33,8 +33,7 @@
 # Configuration environment variables:
 #   - APACHE_JIRA_USERNAME: your Apache JIRA ID
 #   - APACHE_JIRA_PASSWORD: your Apache JIRA password
-#   - ARROW_GITHUB_API_TOKEN: a GitHub API token to use for API requests (to
-#                             avoid rate limiting)
+#   - ARROW_GITHUB_API_TOKEN: a GitHub API token to use for API requests
 #   - PR_REMOTE_NAME: the name of the remote to the Apache git repo (set to
 #                     'apache' by default)
 #   - DEBUG: use for testing to avoid pushing to apache (0 by default)
@@ -71,14 +70,12 @@ if DEBUG:
     print("**************** DEBUGGING ****************")
 
 
-# Prefix added to temporary branches
-BRANCH_PREFIX = "PR_TOOL"
 JIRA_API_BASE = "https://issues.apache.org/jira"
 
 
 def get_json(url, headers=None):
-    req = requests.get(url, headers=headers)
-    return req.json()
+    response = requests.get(url, headers=headers)
+    return response.json()
 
 
 def run_cmd(cmd):
@@ -101,21 +98,6 @@ def run_cmd(cmd):
     return output
 
 
-original_head = run_cmd("git rev-parse HEAD")[:8]
-
-
-def clean_up():
-    print("Restoring head pointer to %s" % original_head)
-    run_cmd("git checkout %s" % original_head)
-
-    branches = run_cmd("git branch").replace(" ", "").split("\n")
-
-    for branch in [x for x in branches
-                   if x.startswith(BRANCH_PREFIX)]:
-        print("Deleting local branch %s" % branch)
-        run_cmd("git branch -D %s" % branch)
-
-
 _REGEX_CI_DIRECTIVE = re.compile(r'\[[^\]]*\]')
 
 
@@ -255,20 +237,48 @@ URL\t\t{}/{}""".format(jira_id, summary, assignee, components, status,
 
 class GitHubAPI(object):
 
-    def __init__(self, project_name):
+    def __init__(self, project_name, cmd):
         self.github_api = ("https://api.github.com/repos/apache/{0}"
                            .format(project_name))
 
-        token = os.environ.get('ARROW_GITHUB_API_TOKEN', None)
-        if token:
-            self.headers = {'Authorization': 'token {0}'.format(token)}
-        else:
-            self.headers = None
+        token = None
+        config = load_configuration()
+        if "github" in config.sections():
+            token = config["github"]["api_token"]
+        if not token:
+            token = os.environ.get('ARROW_GITHUB_API_TOKEN')
+        if not token:
+            token = cmd.prompt('Env ARROW_GITHUB_API_TOKEN not set, '
+                               'please enter your GitHub API token '
+                               '(GitHub personal access token):')
+        headers = {
+            'Accept': 'application/vnd.github.v3+json',
+            'Authorization': 'token {0}'.format(token),
+        }
+        self.headers = headers
 
     def get_pr_data(self, number):
         return get_json("%s/pulls/%s" % (self.github_api, number),
                         headers=self.headers)
 
+    def get_pr_commits(self, number):
+        return get_json("%s/pulls/%s/commits" % (self.github_api, number),
+                        headers=self.headers)
+
+    def merge_pr(self, number, commit_title, commit_message):
+        url = f'{self.github_api}/pulls/{number}/merge'
+        payload = {
+            'commit_title': commit_title,
+            'commit_message': commit_message,
+            'merge_method': 'squash',
+        }
+        response = requests.put(url, headers=self.headers, json=payload)
+        result = response.json()
+        if response.status_code != 200 and 'merged' not in result:
+            result['merged'] = False
+            result['message'] += f': {url}'
+        return result
+
 
 class CommandInput(object):
     """
@@ -276,7 +286,6 @@ class CommandInput(object):
     """
 
     def fail(self, msg):
-        clean_up()
         raise Exception(msg)
 
     def prompt(self, prompt):
@@ -300,6 +309,7 @@ class PullRequest(object):
 
     def __init__(self, cmd, github_api, git_remote, jira_con, number):
         self.cmd = cmd
+        self._github_api = github_api
         self.git_remote = git_remote
         self.con = jira_con
         self.number = number
@@ -358,35 +368,23 @@ class PullRequest(object):
         """
         merge the requested PR and return the merge hash
         """
-        pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, self.number)
-        target_branch_name = "%s_MERGE_PR_%s_%s" % (BRANCH_PREFIX,
-                                                    self.number,
-                                                    self.target_ref.upper())
-        run_cmd("git fetch %s pull/%s/head:%s" % (self.git_remote,
-                                                  self.number,
-                                                  pr_branch_name))
-        run_cmd("git fetch %s %s:%s" % (self.git_remote, self.target_ref,
-                                        target_branch_name))
-        run_cmd("git checkout %s" % target_branch_name)
-
-        had_conflicts = False
-        try:
-            run_cmd(['git', 'merge', pr_branch_name, '--ff', '--squash'])
-        except Exception as e:
-            msg = ("Error merging: %s\nWould you like to "
-                   "manually fix-up this merge?" % e)
-            self.cmd.continue_maybe(msg)
-            msg = ("Okay, please fix any conflicts and 'git add' "
-                   "conflicting files... Finished?")
-            self.cmd.continue_maybe(msg)
-            had_conflicts = True
-
-        commit_authors = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name,
-                                 '--pretty=format:%an <%ae>']).split("\n")
-        commit_co_authors = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name,
-                                    '--pretty=%(trailers:key=Co-authored-by,'
-                                     'valueonly)']).split("\n")
-        commit_co_authors = list(filter(None, commit_co_authors))
+        commits = self._github_api.get_pr_commits(self.number)
+
+        def format_commit_author(commit):
+            author = commit['commit']['author']
+            name = author['name']
+            email = author['email']
+            return f'{name} <{email}>'
+        commit_authors = [format_commit_author(commit) for commit in commits]
+        co_authored_by_re = re.compile(r'^Co-authored-by:\s*(.*)')
+
+        def extract_co_authors(commit):
+            message = commit['commit']['message']
+            return co_authored_by_re.findall(message)
+        commit_co_authors = []
+        for commit in commits:
+            commit_co_authors.extend(extract_co_authors(commit))
+
         all_commit_authors = commit_authors + commit_co_authors
         distinct_authors = sorted(set(all_commit_authors),
                                   key=lambda x: commit_authors.count(x),
@@ -396,74 +394,51 @@ class PullRequest(object):
             print("Author {}: {}".format(i + 1, author))
 
         if len(distinct_authors) > 1:
-            primary_author, distinct_authors = get_primary_author(
+            primary_author, distinct_other_authors = get_primary_author(
                 self.cmd, distinct_authors)
         else:
             # If there is only one author, do not prompt for a lead author
-            primary_author = distinct_authors[0]
-
-        merge_message_flags = []
+            primary_author = distinct_authors.pop()
+            distinct_other_authors = []
 
-        merge_message_flags += ["-m", self.title]
+        commit_title = f'{self.title} (#{self.number})'
+        commit_message_chunks = []
         if self.body is not None:
-            merge_message_flags += ["-m", self.body]
+            commit_message_chunks.append(self.body)
 
         committer_name = run_cmd("git config --get user.name").strip()
         committer_email = run_cmd("git config --get user.email").strip()
 
-        authors = ("Authored-by:" if len(distinct_authors) == 1
+        authors = ("Authored-by:" if len(distinct_other_authors) == 0
                    else "Lead-authored-by:")
-        authors += " %s" % (distinct_authors.pop(0))
+        authors += " %s" % primary_author
         if len(distinct_authors) > 0:
             authors += "\n" + "\n".join(["Co-authored-by: %s" % a
-                                         for a in distinct_authors])
+                                         for a in distinct_other_authors])
         authors += "\n" + "Signed-off-by: %s <%s>" % (committer_name,
                                                       committer_email)
+        commit_message_chunks.append(authors)
 
-        if had_conflicts:
-            committer_name = run_cmd("git config --get user.name").strip()
-            committer_email = run_cmd("git config --get user.email").strip()
-            message = ("This patch had conflicts when merged, "
-                       "resolved by\nCommitter: %s <%s>" %
-                       (committer_name, committer_email))
-            merge_message_flags += ["-m", message]
-
-        # The string "Closes #%s" string is required for GitHub to correctly
-        # close the PR
-        merge_message_flags += [
-            "-m",
-            "Closes #%s from %s"
-            % (self.number, self.description)]
-        merge_message_flags += ["-m", authors]
+        commit_message = "\n\n".join(commit_message_chunks)
 
         if DEBUG:
-            print("\n".join(merge_message_flags))
-
-        run_cmd(['git', 'commit',
-                 '--no-verify',  # do not run commit hooks
-                 '--author="%s"' % primary_author] +
-                merge_message_flags)
-
-        self.cmd.continue_maybe("Merge complete (local ref %s). Push to %s?"
-                                % (target_branch_name, self.git_remote))
+            print(commit_title)
+            print()
+            print(commit_message)
 
-        try:
-            push_cmd = ('git push %s %s:%s' % (self.git_remote,
-                                               target_branch_name,
-                                               self.target_ref))
-            if DEBUG:
-                print(push_cmd)
-            else:
-                run_cmd(push_cmd)
-        except Exception as e:
-            clean_up()
-            self.cmd.fail("Exception while pushing: %s" % e)
+        if DEBUG:
+            merge_hash = None
+        else:
+            result = self._github_api.merge_pr(self.number,
+                                               commit_title,
+                                               commit_message)
+            if not result['merged']:
+                message = result['message']
+                self.cmd.fail(f'Failed to merge pull request: {message}')
+            merge_hash = result['sha']
 
-        merge_hash = run_cmd("git rev-parse %s" % target_branch_name)[:8]
-        clean_up()
         print("Pull request #%s merged!" % self.number)
         print("Merge hash: %s" % merge_hash)
-        return merge_hash
 
 
 def get_primary_author(cmd, distinct_authors):
@@ -475,7 +450,7 @@ def get_primary_author(cmd, distinct_authors):
             "\"name <email>\" [%s]: " % distinct_authors[0])
 
         if primary_author == "":
-            return distinct_authors[0], distinct_authors
+            return distinct_authors[0], distinct_authors[1:]
 
         if author_pat.match(primary_author):
             break
@@ -483,10 +458,9 @@ def get_primary_author(cmd, distinct_authors):
 
     # When primary author is specified manually, de-dup it from
     # author list and put it at the head of author list.
-    distinct_authors = [x for x in distinct_authors
-                        if x != primary_author]
-    distinct_authors = [primary_author] + distinct_authors
-    return primary_author, distinct_authors
+    distinct_other_authors = [x for x in distinct_authors
+                              if x != primary_author]
+    return primary_author, distinct_other_authors
 
 
 def prompt_for_fix_version(cmd, jira_issue):
@@ -581,25 +555,23 @@ def cli():
 
     os.chdir(ARROW_HOME)
 
-    github_api = GitHubAPI(PROJECT_NAME)
+    github_api = GitHubAPI(PROJECT_NAME, cmd)
 
     jira_con = connect_jira(cmd)
     pr = PullRequest(cmd, github_api, PR_REMOTE_NAME, jira_con, pr_num)
 
     if pr.is_merged:
-        print("Pull request %s has already been merged")
+        print("Pull request %s has already been merged" % pr_num)
         sys.exit(0)
 
     if not pr.is_mergeable:
-        msg = ("Pull request %s is not mergeable in its current form.\n"
-               % pr_num + "Continue? (experts only!)")
-        cmd.continue_maybe(msg)
+        print("Pull request %s is not mergeable in its current form" % pr_num)
+        sys.exit(1)
 
     pr.show()
 
     cmd.continue_maybe("Proceed with merging pull request #%s?" % pr_num)
 
-    # merged hash not used
     pr.merge()
 
     if pr.jira_issue is None:
diff --git a/dev/release/post-04-website.sh b/dev/release/post-04-website.sh
index d8a9df4249..0f41a97e66 100755
--- a/dev/release/post-04-website.sh
+++ b/dev/release/post-04-website.sh
@@ -67,7 +67,7 @@ rough_n_development_months=$((
 git_tag=apache-arrow-${version}
 git_range=apache-arrow-${previous_version}..${git_tag}
 
-committers_command_line="git shortlog -csn ${git_range}"
+committers_command_line="git shortlog -sn --group=trailer:signed-off-by ${git_range}"
 contributors_command_line="git shortlog -sn ${git_range}"
 
 committers=$(${committers_command_line})
diff --git a/dev/test_merge_arrow_pr.py b/dev/test_merge_arrow_pr.py
old mode 100644
new mode 100755
index 8fe1883508..5452152828
--- a/dev/test_merge_arrow_pr.py
+++ b/dev/test_merge_arrow_pr.py
@@ -211,22 +211,22 @@ def test_multiple_authors_bad_input():
     distinct_authors = [a0, a1]
 
     cmd = FakeCLI(responses=[''])
-    primary_author, new_distinct_authors = merge_arrow_pr.get_primary_author(
-        cmd, distinct_authors)
+    primary_author, distinct_other_authors = \
+        merge_arrow_pr.get_primary_author(cmd, distinct_authors)
     assert primary_author == a0
-    assert new_distinct_authors == [a0, a1]
+    assert distinct_other_authors == [a1]
 
     cmd = FakeCLI(responses=['oops', a1])
-    primary_author, new_distinct_authors = merge_arrow_pr.get_primary_author(
-        cmd, distinct_authors)
+    primary_author, distinct_other_authors = \
+        merge_arrow_pr.get_primary_author(cmd, distinct_authors)
     assert primary_author == a1
-    assert new_distinct_authors == [a1, a0]
+    assert distinct_other_authors == [a0]
 
     cmd = FakeCLI(responses=[a2])
-    primary_author, new_distinct_authors = merge_arrow_pr.get_primary_author(
-        cmd, distinct_authors)
+    primary_author, distinct_other_authors = \
+        merge_arrow_pr.get_primary_author(cmd, distinct_authors)
     assert primary_author == a2
-    assert new_distinct_authors == [a2, a0, a1]
+    assert distinct_other_authors == [a0, a1]
 
 
 def test_jira_already_resolved():