Posted to commits@kibble.apache.org by hu...@apache.org on 2018/02/26 19:21:40 UTC

[kibble-scanners] branch master updated (b60230e -> 69ed9aa)

This is an automated email from the ASF dual-hosted git repository.

humbedooh pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/kibble-scanners.git.


    from b60230e  Try to grab the different states of 'started' jobs
     new 91b4108  Add initial Discourse scanner plugin
     new f8595af  Updates to JSON API
     new 69ed9aa  remove spurious comment

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.


Summary of changes:
 src/plugins/scanners/__init__.py  |   3 +-
 src/plugins/scanners/discourse.py | 294 ++++++++++++++++++++++++++++++++++++++
 src/plugins/utils/jsonapi.py      |   6 +-
 3 files changed, 300 insertions(+), 3 deletions(-)
 create mode 100644 src/plugins/scanners/discourse.py

-- 
To stop receiving notification emails like this one, please contact
humbedooh@apache.org.

[kibble-scanners] 03/03: remove spurious comment

Posted by hu...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kibble-scanners.git

commit 69ed9aacb90af5fb74befdee8e9f208a28b91c4c
Author: Daniel Gruno <hu...@apache.org>
AuthorDate: Mon Feb 26 20:21:28 2018 +0100

    remove spurious comment
---
 src/plugins/scanners/discourse.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/plugins/scanners/discourse.py b/src/plugins/scanners/discourse.py
index 42baf1e..52b37d7 100644
--- a/src/plugins/scanners/discourse.py
+++ b/src/plugins/scanners/discourse.py
@@ -42,7 +42,6 @@ def accepts(source):
 def scanJob(KibbleBit, source, cat, creds):
     """ Scans a single discourse category for activity """
     NOW = int(datetime.datetime.utcnow().timestamp())
-    #dhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], job['name']) ).encode('ascii', errors='replace')).hexdigest()
     
     # Get $discourseURL/c/$catID
     


[kibble-scanners] 02/03: Updates to JSON API

Posted by hu...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kibble-scanners.git

commit f8595af0cf00747bd6017720dba344cdfee44d28
Author: Daniel Gruno <hu...@apache.org>
AuthorDate: Mon Feb 26 20:20:16 2018 +0100

    Updates to JSON API
    
    - We want JSON, so specify that
    - Sometimes we need a token (like for Travis) instead of basic auth
---
 src/plugins/utils/jsonapi.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/plugins/utils/jsonapi.py b/src/plugins/utils/jsonapi.py
index b896c50..5687156 100644
--- a/src/plugins/utils/jsonapi.py
+++ b/src/plugins/utils/jsonapi.py
@@ -24,15 +24,17 @@ import time
 import re
 import base64
 
-def get(url, cookie = None, auth = None):
+def get(url, cookie = None, auth = None, token = None):
     headers = {
         "Content-type": "application/json",
-        "Accept": "*/*"
+        "Accept": "application/json"
     }
     if auth:
         xcreds = auth.encode(encoding='ascii', errors='replace')
         bauth = base64.encodebytes(xcreds).decode('ascii', errors='replace').replace("\n", '')
         headers["Authorization"] = "Basic %s" % bauth
+    if token:
+        headers["Authorization"] = "token %s" % token
     if cookie:
         headers["Cookie"] = cookie
     rv = requests.get(url, headers = headers)

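For reference, a quick sketch of how a caller can use the two
authentication modes after this change (the URLs and token value below
are placeholders, not part of the commit):

    import plugins.utils.jsonapi

    # Basic auth: a "user:pass" string is base64-encoded by the helper
    # into an "Authorization: Basic ..." header.
    cats = plugins.utils.jsonapi.get(
        "https://discourse.example.org/categories_and_latest",
        auth="someuser:somepass")

    # Token auth (e.g. for Travis): sends "Authorization: token <value>".
    # If both auth and token are given, the token header is set last and
    # therefore wins.
    builds = plugins.utils.jsonapi.get(
        "https://travis.example.org/builds", token="abc123")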

[kibble-scanners] 01/03: Add initial Discourse scanner plugin

Posted by hu...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kibble-scanners.git

commit 91b41080265f81871915ef5b79a2e2e7d08b2624
Author: Daniel Gruno <hu...@apache.org>
AuthorDate: Mon Feb 26 20:19:32 2018 +0100

    Add initial Discourse scanner plugin
---
 src/plugins/scanners/__init__.py  |   3 +-
 src/plugins/scanners/discourse.py | 295 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 297 insertions(+), 1 deletion(-)

diff --git a/src/plugins/scanners/__init__.py b/src/plugins/scanners/__init__.py
index 3a70c40..5bc02a6 100644
--- a/src/plugins/scanners/__init__.py
+++ b/src/plugins/scanners/__init__.py
@@ -38,7 +38,8 @@ __all__ = [
     'gerrit',
     'jenkins',
     'buildbot',
-    'travis'
+    'travis',
+    'discourse'
     ]
 
 # Import each plugin into a hash called 'scanners'
diff --git a/src/plugins/scanners/discourse.py b/src/plugins/scanners/discourse.py
new file mode 100644
index 0000000..42baf1e
--- /dev/null
+++ b/src/plugins/scanners/discourse.py
@@ -0,0 +1,295 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+import datetime
+import re
+import json
+import hashlib
+import plugins.utils.jsonapi
+import threading
+import requests.exceptions
+import os
+
+"""
+This is the Kibble Discourse scanner plugin.
+"""
+
+title = "Scanner for Discourse Forums"
+version = "0.1.0"
+
+def accepts(source):
+    """ Determines whether we want to handle this source """
+    if source['type'] == 'discourse':
+        return True
+    return False
+
+
+def scanJob(KibbleBit, source, cat, creds):
+    """ Scans a single discourse category for activity """
+    NOW = int(datetime.datetime.utcnow().timestamp())
+    #dhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], job['name']) ).encode('ascii', errors='replace')).hexdigest()
+    
+    # Get $discourseURL/c/$catID
+    
+    catURL = os.path.join(source['sourceURL'], "c/%s" % cat['id'])
+    KibbleBit.pprint("Scanning Discourse category '%s' at %s" % (cat['slug'], catURL))
+    
+    page = 1
+    allUsers = {}
+    
+    # For each paginated result (up to page 100), check for changes
+    while page < 100:
+        pcatURL = "%s?page=%u" % (catURL, page)
+        catjson = plugins.utils.jsonapi.get(pcatURL, auth = creds)
+        page += 1
+    
+        
+        if catjson:
+            
+            # If we hit an empty list (no more topics), just break the loop.
+            if not catjson['topic_list']['topics']:
+                break
+            
+            # First (if we have data), we should store the known users       
+            # Since discourse hides the email (obviously!), we'll have to
+            # fake one to generate an account.
+            fakeDomain = "foo.discourse"
+            m = re.match(r"https?://([-a-zA-Z0-9.]+)", source['sourceURL'])
+            if m:
+                fakeDomain = m.group(1)
+            for user in catjson['users']:
+                # Fake email address, compute deterministic ID
+                email = "%s@%s" % (user['username'], fakeDomain)
+                dhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], email) ).encode('ascii', errors='replace')).hexdigest()
+                
+                # Construct a very sparse user document
+                userDoc = {
+                    'id': dhash,
+                    'organisation': source['organisation'],
+                    'name': user['username'],
+                    'email': email,
+                }
+                
+                # Store user-ID-to-username mapping for later
+                allUsers[user['id']] = userDoc
+                
+                # Store it (or, queue storage)
+                KibbleBit.append('person', userDoc)
+            
+            # Now, for each topic, we'll store a topic document
+            for topic in catjson['topic_list']['topics']:
+                
+                # Calculate topic ID
+                dhash = hashlib.sha224( ("%s-%s-topic-%s" % (source['organisation'], source['sourceURL'], topic['id']) ).encode('ascii', errors='replace')).hexdigest()
+                
+                # Figure out when topic was created and updated
+                CreatedDate = datetime.datetime.strptime(topic['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ").timestamp()
+                if topic.get('last_posted_at'):
+                    UpdatedDate = datetime.datetime.strptime(topic['last_posted_at'], "%Y-%m-%dT%H:%M:%S.%fZ").timestamp()
+                else:
+                    UpdatedDate = 0
+                
+                # Determine whether we should scan this topic or continue to the next one.
+                # We'll do this by seeing if the topic already exists and has no changes or not.
+                if KibbleBit.exists('forum_topic', dhash):
+                    fdoc = KibbleBit.get('forum_topic', dhash)
+                    # If update in the old doc was >= current update timestamp, skip the topic
+                    if fdoc['updated'] >= UpdatedDate:
+                        continue
+                
+                
+                # Assuming we need to scan this, start by making the base topic document
+                topicdoc = {
+                    'id': dhash,
+                    'sourceID': source['sourceID'],
+                    'organisation': source['organisation'],
+                    
+                    'category': cat['slug'],
+                    'title': topic['title'],
+                    'creator': allUsers[topic['posters'][0]['user_id']]['id'],
+                    'creatorName': allUsers[topic['posters'][0]['user_id']]['name'],
+                    'created': CreatedDate,
+                    'updated': UpdatedDate,
+                    'solved': False,    # Discourse doesn't have this notion, but other forums might.
+                    'posts': topic['posts_count'],
+                    'views': topic['views'],
+                    'url': source['sourceURL'] + "/t/%s/%s" % (topic['slug'], topic['id'])
+                }
+                
+                KibbleBit.append('forum_topic', topicdoc)
+                KibbleBit.pprint("%s is new or changed, scanning" % topicdoc['url'])
+                
+                # Now grab all the individual replies/posts
+                # Remember to not have it count as a visit!
+                pURL = "%s?track_visit=false&forceLoad=true" % topicdoc['url']
+                pjson = plugins.utils.jsonapi.get(pURL, auth = creds)
+                
+                posts = pjson['post_stream']['posts']
+                
+                # For each post/reply, construct a forum_entry document
+                KibbleBit.pprint("%s has %u posts" % (pURL, len(posts)))
+                for post in posts:
+                    phash = hashlib.sha224( ("%s-%s-post-%s" % (source['organisation'], source['sourceURL'], post['id']) ).encode('ascii', errors='replace')).hexdigest()
+                    
+                    # Find the hash of the person who posted it
+                    # We may know them, or we may have to store them
+                    if post['user_id'] in allUsers:
+                        uhash = allUsers[post['user_id']]['id']
+                    else:
+                        # Same as before, fake email, store...
+                        email = "%s@%s" % (post['username'], fakeDomain)
+                        uhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], email) ).encode('ascii', errors='replace')).hexdigest()
+                        
+                        # Construct a very sparse user document
+                        userDoc = {
+                            'id': uhash,
+                            'organisation': source['organisation'],
+                            'name': post['username'],
+                            'email': email,
+                        }
+                        
+                        # Store user-ID-to-username mapping for later
+                        allUsers[post['user_id']] = userDoc
+                        
+                        # Store it (or, queue storage)
+                        KibbleBit.append('person', userDoc)
+                    
+                    # Get post date
+                    CreatedDate = datetime.datetime.strptime(post['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ").timestamp()
+                    
+                    # Store the post/reply document
+                    pdoc = {
+                        'id': phash,
+                        'sourceID': source['sourceID'],
+                        'organisation': source['organisation'],
+                        
+                        'created': CreatedDate,
+                        'topic': dhash,
+                        'post_id': post['id'],
+                        'text': post['cooked'],
+                        'url': topicdoc['url']
+                    }
+                    KibbleBit.append('forum_entry', pdoc)
+        else:
+            # Boo, it failed!
+            KibbleBit.pprint("Fetching job data failed!")
+            return False
+    return True
+
+
+class discourseThread(threading.Thread):
+    """ Generic thread class for scheduling multiple scans at once """
+    def __init__(self, block, KibbleBit, source, creds, jobs):
+        super(discourseThread, self).__init__()
+        self.block = block
+        self.KibbleBit = KibbleBit
+        self.creds = creds
+        self.source = source
+        self.jobs = jobs
+        
+    def run(self):
+        badOnes = 0
+        while len(self.jobs) > 0 and badOnes <= 50:
+            self.block.acquire()
+            try:
+                job = self.jobs.pop(0)
+            except Exception as err:
+                self.block.release()
+                return
+            if not job:
+                self.block.release()
+                return
+            self.block.release()
+            if not scanJob(self.KibbleBit, self.source, job, self.creds):
+                self.KibbleBit.pprint("[%s] This borked, trying another one" % job['name'])
+                badOnes += 1
+                if badOnes > 10:
+                    self.KibbleBit.pprint("Too many errors, bailing!")
+                    self.source['steps']['forum'] = {
+                        'time': time.time(),
+                        'status': 'Too many errors while parsing at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())),
+                        'running': False,
+                        'good': False
+                    }
+                    self.KibbleBit.updateSource(self.source)
+                    return
+            else:
+                badOnes = 0
+
+def scan(KibbleBit, source):
+    # Simple URL check
+    discourse = re.match(r"(https?://.+)", source['sourceURL'])
+    if discourse:
+        
+        source['steps']['forum'] = {
+            'time': time.time(),
+            'status': 'Parsing Discourse topics...',
+            'running': True,
+            'good': True
+        }
+        KibbleBit.updateSource(source)
+        
+        badOnes = 0
+        pendingJobs = []
+        KibbleBit.pprint("Parsing Discourse activity at %s" % source['sourceURL'])
+        source['steps']['forum'] = {
+            'time': time.time(),
+            'status': 'Downloading changeset',
+            'running': True,
+            'good': True
+        }
+        KibbleBit.updateSource(source)
+        
+        # Discourse may need credentials (if basic auth)
+        creds = None
+        if source['creds'] and 'username' in source['creds'] and source['creds']['username'] and len(source['creds']['username']) > 0:
+            creds = "%s:%s" % (source['creds']['username'], source['creds']['password'])
+            
+        # Get the list of categories
+        sURL = source['sourceURL']
+        KibbleBit.pprint("Getting categories...")
+        catjs = plugins.utils.jsonapi.get("%s/categories_and_latest" % sURL , auth = creds)
+        
+        # Directly assign the category list as pending jobs queue, ezpz.
+        pendingJobs = catjs['category_list']['categories']
+        
+        KibbleBit.pprint("Found %u categories" % len(pendingJobs))
+        
+        # Now fire off 4 threads to parse the categories
+        threads = []
+        block = threading.Lock()
+        KibbleBit.pprint("Scanning jobs using 4 sub-threads")
+        for i in range(0,4):
+            t = discourseThread(block, KibbleBit, source, creds, pendingJobs)
+            threads.append(t)
+            t.start()
+        
+        for t in threads:
+            t.join()
+
+        # We're all done, yaay        
+        KibbleBit.pprint("Done scanning %s" % source['sourceURL'])
+
+        source['steps']['forum'] = {
+            'time': time.time(),
+            'status': 'Discourse successfully scanned at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())),
+            'running': False,
+            'good': True
+        }
+        KibbleBit.updateSource(source)
+    
\ No newline at end of file

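Two implementation details of the scanner above are worth spelling out.
First, since Discourse does not expose member email addresses, the plugin
fabricates one from the username and the forum's host name, then hashes it
so the same member always maps to the same person document. Reduced to a
standalone sketch (the function name and default domain here are
illustrative, not part of the plugin):

    import hashlib

    def person_id(organisation, source_url, username, domain="foo.discourse"):
        # Discourse hides real emails, so username@forum-host stands in.
        email = "%s@%s" % (username, domain)
        # sha224 over organisation, source URL and the fake email yields a
        # stable, deterministic document ID for the person.
        key = "%s-%s-%s" % (organisation, source_url, email)
        return hashlib.sha224(key.encode('ascii', errors='replace')).hexdigest()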
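Second, the scan is incremental: before re-fetching a topic's posts, the
plugin compares the stored topic document's 'updated' timestamp against
Discourse's last_posted_at and skips topics that have not changed. The
core of that check as a sketch (KibbleBit here mirrors the interface used
in the diff):

    import datetime

    def topic_needs_scan(KibbleBit, dhash, topic):
        # Parse Discourse's ISO-8601 timestamp for the latest post, if any.
        updated = 0
        if topic.get('last_posted_at'):
            updated = datetime.datetime.strptime(
                topic['last_posted_at'], "%Y-%m-%dT%H:%M:%S.%fZ").timestamp()
        # Skip topics whose stored document is already up to date.
        if KibbleBit.exists('forum_topic', dhash):
            fdoc = KibbleBit.get('forum_topic', dhash)
            if fdoc['updated'] >= updated:
                return False
        return True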