Posted to github@beam.apache.org by GitBox <gi...@apache.org> on 2022/06/08 18:51:42 UTC

[GitHub] [beam] damccorm commented on a diff in pull request #21736: Gather metrics on GH Issues

damccorm commented on code in PR #21736:
URL: https://github.com/apache/beam/pull/21736#discussion_r892747723


##########
.test-infra/metrics/sync/github/sync.py:
##########
@@ -322,63 +379,107 @@ def upsertIntoPRsTable(cursor, values):
   cursor.execute(upsertPRRowQuery, values)
 
 
+def upsertIntoIssuesTable(cursor, values):
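+  '''Insert a new issue row, or update the existing row if issue_id already exists.'''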
+  upsertIssueRowQuery = f'''INSERT INTO {GH_ISSUES_TABLE_NAME}
+                            (issue_id,
+                            author,
+                            created_ts,
+                            updated_ts,
+                            closed_ts,
+                            title,
+                            assignees,
+                            labels)
+                          VALUES
+                            (%s, %s, %s, %s, %s, %s, %s, %s)
+                          ON CONFLICT (issue_id) DO UPDATE
+                            SET
+                            issue_id=excluded.issue_id,
+                            author=excluded.author,
+                            created_ts=excluded.created_ts,
+                            updated_ts=excluded.updated_ts,
+                            closed_ts=excluded.closed_ts,
+                            title=excluded.title,
+                            assignees=excluded.assignees,
+                            labels=excluded.labels
+                          '''
+  cursor.execute(upsertIssueRowQuery, values)
+
+
 def fetchNewData():
   '''
   Main workhorse method. Fetches data from GitHub and puts it in the metrics tables.
   '''
-  connection = initDBConnection()
-  cursor = connection.cursor()
-  lastSyncTimestamp = fetchLastSyncTimestamp(cursor)
-  cursor.close()
-  connection.close()
-
-  currTS = lastSyncTimestamp
-
-  resultsPresent = True
-  while resultsPresent:
-    print("Syncing data for: ", currTS)
-    jsonData = fetchGHData(currTS)
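+  # Two passes: sync PRs first, then issues; each kind tracks its own last-sync timestamp.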
+  for kind in ('pr', 'issue'):
 
     connection = initDBConnection()
     cursor = connection.cursor()
+    lastSyncTimestamp = fetchLastSyncTimestamp(cursor, f'gh_{kind}_sync')
+    cursor.close()
+    connection.close()
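+    # No per-kind sync record yet: PRs fall back to the legacy last-sync timestamp,
+    # while issues start syncing from scratch (1980).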
+    if lastSyncTimestamp is None:
+      if kind == 'pr':
+        connection = initDBConnection()
+        cursor = connection.cursor()
+        lastSyncTimestamp = fetchLastSyncTimestampFallback(cursor)
+        cursor.close()
+        connection.close()
+      else:
+        lastSyncTimestamp = datetime(year=1980, month=1, day=1)
+
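+    # Page through GitHub search results, advancing currTS to the newest updatedAt seen,
+    # until a query returns no more edges.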
+    currTS = lastSyncTimestamp
+
+    resultsPresent = True
+    while resultsPresent:
+      print(f'Syncing data for {kind}s: ', currTS)
+      query = queries.MAIN_PR_QUERY if kind == 'pr' else queries.MAIN_ISSUES_QUERY
+      jsonData = fetchGHData(currTS, query)
+
+      connection = initDBConnection()
+      cursor = connection.cursor()
+
+      if "errors" in jsonData:
+        print("Failed to fetch data, error:", jsonData)
+        return
 
-    if "errors" in jsonData:
-      print("Failed to fetch data, error:", jsonData)
-      return
-
-    prs = None
-    try:
-      prs = jsonData["data"]["search"]["edges"]
-    except:
-      # TODO This means that API returned error.
-      # We might want to bring this to stderr or utilize other means of logging.
-      # Examples: we hit throttling, etc
-      print("Got bad json format: ", jsonData)
-      return
-
-    if not prs:
-      resultsPresent = False
-
-    for edge in prs:
-      pr = edge["node"]
+      data = None
       try:
-        rowValues = extractRowValuesFromPr(pr)
-      except Exception as e:
-        print("Failed to extract data. Exception: ", e, " PR: ", edge)
-        traceback.print_tb(e.__traceback__)
+        data = jsonData["data"]["search"]["edges"]
+      except:
+        # TODO: This means the API returned an error.
+        # We might want to write this to stderr or use another logging mechanism.
+        # Examples: we hit throttling, etc.
+        print("Got bad json format: ", jsonData)
         return
 
-      upsertIntoPRsTable(cursor, rowValues)
+      if not data:
+        resultsPresent = False
 
-      prUpdateTime = ghutilities.datetimeFromGHTimeStr(pr["updatedAt"])
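+      # Upsert each returned node into the PR or issue table and keep the newest update time.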
+      for edge in data:
+        node = edge["node"]
+        try:
+          rowValues = extractRowValuesFromPr(node) if kind == 'pr' else extractRowValuesFromIssue(node)
+        except Exception as e:
+          print("Failed to extract data. Exception: ", e, f" {kind}: ", edge)
+          traceback.print_tb(e.__traceback__)
+          return
 
-      currTS = currTS if currTS > prUpdateTime else prUpdateTime
+        if kind == 'pr':
+          upsertIntoPRsTable(cursor, rowValues)
+        else:
+          upsertIntoIssuesTable(cursor, rowValues)
 
-    cursor.close()
-    connection.commit()
-    connection.close()
+        updateTime = ghutilities.datetimeFromGHTimeStr(node["updatedAt"])
+
+        currTS = currTS if currTS > updateTime else updateTime
+
+      cursor.close()
+      connection.commit()
+      connection.close()
 
-    updateLastSyncTimestamp(currTS)
+      updateLastSyncTimestamp(currTS, f'gh_{kind}_sync')

Review Comment:
   Yes, that's a good catch. Updated!


