Posted to commits@kibble.apache.org by hu...@apache.org on 2018/02/26 19:21:41 UTC

[kibble-scanners] 01/03: Add initial Discourse scanner plugin

This is an automated email from the ASF dual-hosted git repository.

humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kibble-scanners.git

commit 91b41080265f81871915ef5b79a2e2e7d08b2624
Author: Daniel Gruno <hu...@apache.org>
AuthorDate: Mon Feb 26 20:19:32 2018 +0100

    Add initial Discourse scanner plugin
---
 src/plugins/scanners/__init__.py  |   3 +-
 src/plugins/scanners/discourse.py | 295 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 297 insertions(+), 1 deletion(-)

diff --git a/src/plugins/scanners/__init__.py b/src/plugins/scanners/__init__.py
index 3a70c40..5bc02a6 100644
--- a/src/plugins/scanners/__init__.py
+++ b/src/plugins/scanners/__init__.py
@@ -38,7 +38,8 @@ __all__ = [
     'gerrit',
     'jenkins',
     'buildbot',
-    'travis'
+    'travis',
+    'discourse'
     ]
 
 # Import each plugin into a hash called 'scanners'
diff --git a/src/plugins/scanners/discourse.py b/src/plugins/scanners/discourse.py
new file mode 100644
index 0000000..42baf1e
--- /dev/null
+++ b/src/plugins/scanners/discourse.py
@@ -0,0 +1,295 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+import datetime
+import re
+import json
+import hashlib
+import plugins.utils.jsonapi
+import threading
+import requests.exceptions
+import os
+
+"""
+This is the Kibble Discourse scanner plugin.
+"""
+
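+# A source entry handled by this scanner looks roughly like the following;
+# the field names are the ones read below, the values purely illustrative:
+#
+#   {
+#       'type': 'discourse',
+#       'sourceID': 'abcd1234',
+#       'sourceURL': 'https://discourse.example.org',
+#       'organisation': 'example',
+#       'creds': {'username': 'scanner', 'password': 'secret'},
+#       'steps': {}
+#   }
+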
+title = "Scanner for Discourse Forums"
+version = "0.1.0"
+
+def accepts(source):
+    """ Determines whether we want to handle this source """
+    if source['type'] == 'discourse':
+        return True
+    return False
+
+
+def scanJob(KibbleBit, source, cat, creds):
+    """ Scans a single discourse category for activity """
+    NOW = int(datetime.datetime.utcnow().timestamp())
+    #dhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], job['name']) ).encode('ascii', errors='replace')).hexdigest()
+    
+    # Get $discourseURL/c/$catID
+    
+    catURL = os.path.join(source['sourceURL'], "c/%s" % cat['id'])
+    KibbleBit.pprint("Scanning Discourse category '%s' at %s" % (cat['slug'], catURL))
+    
+    page = 1
+    allUsers = {}
+    
+    # For each paginated result (up to page 100), check for changes
+    while page < 100:
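+        # e.g. https://discourse.example.org/c/5?page=1 (host and category ID are illustrative)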
+        pcatURL = "%s?page=%u" % (catURL, page)
+        catjson = plugins.utils.jsonapi.get(pcatURL, auth = creds)
+        page += 1
+
+        if catjson:
+            
+            # If we hit an empty list (no more topics), just break the loop.
+            if not catjson['topic_list']['topics']:
+                break
+            
+            # First (if we have data), we should store the known users       
+            # Since discourse hides the email (obviously!), we'll have to
+            # fake one to generate an account.
+            fakeDomain = "foo.discourse"
+            m = re.match(r"https?://([-a-zA-Z0-9.]+)", source['sourceURL'])
+            if m:
+                fakeDomain = m.group(1)
+            for user in catjson['users']:
+                # Fake email address, compute deterministic ID
+                email = "%s@%s" % (user['username'], fakeDomain)
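+                # sha224 over "organisation-sourceURL-email" gives each user a stable, deterministic ID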
+                dhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], email) ).encode('ascii', errors='replace')).hexdigest()
+                
+                # Construct a very sparse user document
+                userDoc = {
+                    'id': dhash,
+                    'organisation': source['organisation'],
+                    'name': user['username'],
+                    'email': email,
+                }
+                
+                # Store user-ID-to-username mapping for later
+                allUsers[user['id']] = userDoc
+                
+                # Store it (or, queue storage)
+                KibbleBit.append('person', userDoc)
+            
+            # Now, for each topic, we'll store a topic document
+            for topic in catjson['topic_list']['topics']:
+                
+                # Calculate topic ID
+                dhash = hashlib.sha224( ("%s-%s-topic-%s" % (source['organisation'], source['sourceURL'], topic['id']) ).encode('ascii', errors='replace')).hexdigest()
+                
+                # Figure out when topic was created and updated
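+                # Discourse uses ISO 8601 UTC timestamps, e.g. "2018-02-26T20:19:32.000Z"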
+                CreatedDate = datetime.datetime.strptime(topic['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ").timestamp()
+                if topic.get('last_posted_at'):
+                    UpdatedDate = datetime.datetime.strptime(topic['last_posted_at'], "%Y-%m-%dT%H:%M:%S.%fZ").timestamp()
+                else:
+                    UpdatedDate = 0
+                
+                # Determine whether we should scan this topic or continue to the next one.
+                # We'll do this by seeing if the topic already exists and has no changes or not.
+                if KibbleBit.exists('forum_topic', dhash):
+                    fdoc = KibbleBit.get('forum_topic', dhash)
+                    # If update in the old doc was >= current update timestamp, skip the topic
+                    if fdoc['updated'] >= UpdatedDate:
+                        continue
+                
+                
+                # Assuming we need to scan this, start by making the base topic document
+                topicdoc = {
+                    'id': dhash,
+                    'sourceID': source['sourceID'],
+                    'organisation': source['organisation'],
+                    
+                    'category': cat['slug'],
+                    'title': topic['title'],
+                    'creator': allUsers[topic['posters'][0]['user_id']]['id'],
+                    'creatorName': allUsers[topic['posters'][0]['user_id']]['name'],
+                    'created': CreatedDate,
+                    'updated': UpdatedDate,
+                    'solved': False,    # Discourse doesn't have this notion, but other forums might.
+                    'posts': topic['posts_count'],
+                    'views': topic['views'],
+                    'url': source['sourceURL'] + "/t/%s/%s" % (topic['slug'], topic['id'])
+                }
+                
+                KibbleBit.append('forum_topic', topicdoc)
+                KibbleBit.pprint("%s is new or changed, scanning" % topicdoc['url'])
+                
+                # Now grab all the individual replies/posts
+                # Remember to not have it count as a visit!
+                pURL = "%s?track_visit=false&forceLoad=true" % topicdoc['url']
+                pjson = plugins.utils.jsonapi.get(pURL, auth = creds)
+                
+                posts = pjson['post_stream']['posts']
+                
+                # For each post/reply, construct a forum_entry document
+                KibbleBit.pprint("%s has %u posts" % (pURL, len(posts)))
+                for post in posts:
+                    phash = hashlib.sha224( ("%s-%s-post-%s" % (source['organisation'], source['sourceURL'], post['id']) ).encode('ascii', errors='replace')).hexdigest()
+                    
+                    # Find the hash of the person who posted it
+                    # We may know them, or we may have to store them
+                    if post['user_id'] in allUsers:
+                        uhash = allUsers[post['user_id']]['id']
+                    else:
+                        # Same as before, fake email, store...
+                        email = "%s@%s" % (post['username'], fakeDomain)
+                        uhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], email) ).encode('ascii', errors='replace')).hexdigest()
+                        
+                        # Construct a very sparse user document
+                        userDoc = {
+                            'id': uhash,
+                            'organisation': source['organisation'],
+                            'name': post['username'],
+                            'email': email,
+                        }
+                        
+                        # Store user-ID-to-username mapping for later
+                        allUsers[post['user_id']] = userDoc
+                        
+                        # Store it (or, queue storage)
+                        KibbleBit.append('person', userDoc)
+                    
+                    # Get post date
+                    CreatedDate = datetime.datetime.strptime(post['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ").timestamp()
+                    
+                    # Store the post/reply document
+                    pdoc = {
+                        'id': phash,
+                        'sourceID': source['sourceID'],
+                        'organisation': source['organisation'],
+                        
+                        'created': CreatedDate,
+                        'topic': dhash,
+                        'post_id': post['id'],
+                        'text': post['cooked'],
+                        'url': topicdoc['url']
+                    }
+                    KibbleBit.append('forum_post', pdoc)  # doc type assumed to mirror 'forum_topic'
+
+        else:
+            # Boo, it failed!
+            KibbleBit.pprint("Fetching job data failed!")
+            return False
+
+    # All pages scanned (or we ran out of topics)
+    return True
+
+
+class discourseThread(threading.Thread):
+    """ Generic thread class for scheduling multiple scans at once """
+    def __init__(self, block, KibbleBit, source, creds, jobs):
+        super(discourseThread, self).__init__()
+        self.block = block
+        self.KibbleBit = KibbleBit
+        self.creds = creds
+        self.source = source
+        self.jobs = jobs
+        
+    def run(self):
+        badOnes = 0
+        while len(self.jobs) > 0 and badOnes <= 50:
+            self.block.acquire()
+            try:
+                job = self.jobs.pop(0)
+            except Exception as err:
+                self.block.release()
+                return
+            if not job:
+                self.block.release()
+                return
+            self.block.release()
+            if not scanJob(self.KibbleBit, self.source, job, self.creds):
+                self.KibbleBit.pprint("[%s] This borked, trying another one" % job['name'])
+                badOnes += 1
+                if badOnes > 10:
+                    self.KibbleBit.pprint("Too many errors, bailing!")
+                    self.source['steps']['forum'] = {
+                        'time': time.time(),
+                        'status': 'Too many errors while parsing at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())),
+                        'running': False,
+                        'good': False
+                    }
+                    self.KibbleBit.updateSource(self.source)
+                    return
+            else:
+                badOnes = 0
+
+def scan(KibbleBit, source):
+    # Simple URL check
+    discourse = re.match(r"(https?://.+)", source['sourceURL'])
+    if discourse:
+        
+        source['steps']['forum'] = {
+            'time': time.time(),
+            'status': 'Parsing Discourse topics...',
+            'running': True,
+            'good': True
+        }
+        KibbleBit.updateSource(source)
+        
+        badOnes = 0
+        pendingJobs = []
+        KibbleBit.pprint("Parsing Discourse activity at %s" % source['sourceURL'])
+        source['steps']['forum'] = {
+            'time': time.time(),
+            'status': 'Downloading changeset',
+            'running': True,
+            'good': True
+        }
+        KibbleBit.updateSource(source)
+        
+        # Discourse may need credentials (if it is behind basic auth)
+        creds = None
+        if source['creds'] and source['creds'].get('username'):
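+            # creds is "username:password"; the jsonapi helper is assumed to send it as HTTP basic auth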
+            creds = "%s:%s" % (source['creds']['username'], source['creds']['password'])
+            
+        # Get the list of categories
+        sURL = source['sourceURL']
+        KibbleBit.pprint("Getting categories...")
+        catjs = plugins.utils.jsonapi.get("%s/categories_and_latest" % sURL, auth = creds)
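+        # Expected response shape (abridged): {'category_list': {'categories': [{'id': ..., 'slug': ..., 'name': ...}, ...]}}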
+        
+        # Directly assign the category list as pending jobs queue, ezpz.
+        pendingJobs = catjs['category_list']['categories']
+        
+        KibbleBit.pprint("Found %u categories" % len(pendingJobs))
+        
+        # Now fire off 4 threads to parse the categories
+        threads = []
+        block = threading.Lock()
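+        # The lock serialises pops from the shared pendingJobs list across the threads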
+        KibbleBit.pprint("Scanning jobs using 4 sub-threads")
+        for i in range(0,4):
+            t = discourseThread(block, KibbleBit, source, creds, pendingJobs)
+            threads.append(t)
+            t.start()
+        
+        for t in threads:
+            t.join()
+
+        # We're all done, yaay        
+        KibbleBit.pprint("Done scanning %s" % source['sourceURL'])
+
+        source['steps']['forum'] = {
+            'time': time.time(),
+            'status': 'Discourse successfully scanned at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())),
+            'running': False,
+            'good': True
+        }
+        KibbleBit.updateSource(source)
+    
\ No newline at end of file
