You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kibble.apache.org by tu...@apache.org on 2020/12/13 15:56:19 UTC
[kibble] branch main updated: Make scanners use kibble.ini instead
of config.yaml (#122)
This is an automated email from the ASF dual-hosted git repository.
turbaszek pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/kibble.git
The following commit(s) were added to refs/heads/main by this push:
new e19cf6c Make scanners use kibble.ini instead of config.yaml (#122)
e19cf6c is described below
commit e19cf6cc4de4c20e70da5de5ef37cb3c1668dbe3
Author: Tomek Urbaszek <tu...@gmail.com>
AuthorDate: Sun Dec 13 15:56:12 2020 +0000
Make scanners use kibble.ini instead of config.yaml (#122)
This PR moves watson, azure, picoapi and git configuration to the
kibble.ini config file. It also fixes reading of the ES configuration.
In this way all Kibble configuration is in a single place.
---
kibble.ini | 20 +++
kibble/scanners/README.md | 2 +-
kibble/scanners/brokers/kibbleES.py | 18 +-
kibble/scanners/config.yaml | 39 ----
kibble/scanners/scanners/git-census.py | 4 +-
kibble/scanners/scanners/git-evolution.py | 5 +-
kibble/scanners/scanners/git-sloc.py | 5 +-
kibble/scanners/scanners/git-sync.py | 5 +-
kibble/scanners/scanners/ponymail-kpe.py | 13 +-
kibble/scanners/scanners/ponymail-tone.py | 20 +--
kibble/scanners/utils/git.py | 17 +-
kibble/scanners/utils/kpe.py | 218 +++++++++++-----------
kibble/scanners/utils/tone.py | 289 +++++++++++++++---------------
kibble/settings.py | 6 +
14 files changed, 324 insertions(+), 337 deletions(-)
diff --git a/kibble.ini b/kibble.ini
index 72f0e0c..8e4f042 100644
--- a/kibble.ini
+++ b/kibble.ini
@@ -25,6 +25,26 @@ scratchdir = /tmp
# each node will get 1/4th of all jobs to work on.
balance =
+[git]
+# Comma-separated branch names
+wanted_branches =
+
+# Watson/BlueMix configuration for sentiment analysis, if applicable
+[watson]
+username =
+password =
+api = https://gateway-location.watsonplatform.net/tone-analyzer/api
+
+# Azure Text Analysis API configuration, if applicable
+[azure]
+apikey =
+location = west-us
+
+# picoAPI Text Analysis configuration
+[picoapi]
+key =
+
+
[elasticsearch]
# Elasticsearch database name
dbname = kibble
diff --git a/kibble/scanners/README.md b/kibble/scanners/README.md
index 87da385..5746a11 100644
--- a/kibble/scanners/README.md
+++ b/kibble/scanners/README.md
@@ -3,7 +3,7 @@ The Kibble Scanners collect information for the Kibble Suite.
## Setup instructions:
- - Edit conf/config.yaml to match your Kibble service
+ - Edit kibble.ini to match your Kibble service
## How to run:
diff --git a/kibble/scanners/brokers/kibbleES.py b/kibble/scanners/brokers/kibbleES.py
index 5e6eb0d..5ee92d2 100644
--- a/kibble/scanners/brokers/kibbleES.py
+++ b/kibble/scanners/brokers/kibbleES.py
@@ -119,14 +119,13 @@ class KibbleBit:
""" KibbleBit class with direct ElasticSearch access """
def __init__(self, broker, organisation, tid):
- self.config = broker.config
self.organisation = organisation
self.broker = broker
self.json_queue = []
self.queueMax = 1000 # Entries to keep before bulk pushing
self.pluginname = ""
self.tid = tid
- self.dbname = self.broker.config["elasticsearch"]["database"]
+ self.dbname = conf.get("elasticsearch", "database")
def __del__(self):
""" On unload/delete, push the last chunks of data to ES """
@@ -144,7 +143,7 @@ class KibbleBit:
def update_source(self, source):
""" Updates a source document, usually with a status update """
self.broker.DB.index(
- index=self.broker.config["elasticsearch"]["database"],
+ index=self.dbname,
doc_type="source",
id=source["sourceID"],
body=source,
@@ -153,7 +152,7 @@ class KibbleBit:
def get(self, doctype, docid):
""" Fetches a document from the DB """
doc = self.broker.DB.get(
- index=self.broker.config["elasticsearch"]["database"],
+ index=self.dbname,
doc_type=doctype,
id=docid,
)
@@ -164,14 +163,14 @@ class KibbleBit:
def exists(self, doctype, docid):
""" Checks whether a document already exists or not """
return self.broker.DB.exists(
- index=self.broker.config["elasticsearch"]["database"],
+ index=self.dbname,
doc_type=doctype,
id=docid,
)
def index(self, doctype, docid, document):
""" Adds a new document to the index """
- dbname = self.broker.config["elasticsearch"]["database"]
+ dbname = self.dbname
self.broker.DB.index(index=dbname, doc_type=doctype, id=docid, body=document)
def append(self, t, doc):
@@ -195,7 +194,7 @@ class KibbleBit:
js = entry
doc = js
js["@version"] = 1
- dbname = self.broker.config["elasticsearch"]["database"]
+ dbname = self.dbname
if self.broker.noTypes:
dbname += "_%s" % js["doctype"]
js_arr.append(
@@ -233,6 +232,7 @@ class KibbleOrganisation:
self.broker = broker
self.id = org
+ self.dbname = conf.get("elasticsearch", "database")
def sources(self, sourceType=None, view=None):
""" Get all sources or sources of a specific type for an org """
@@ -241,7 +241,7 @@ class KibbleOrganisation:
mustArray = [{"term": {"organisation": self.id}}]
if view:
res = self.broker.DB.get(
- index=self.broker.config["elasticsearch"]["database"],
+ index=self.dbname,
doc_type="view",
id=view,
)
@@ -252,7 +252,7 @@ class KibbleOrganisation:
mustArray.append({"term": {"type": sourceType}})
# Run the search, fetch all results, 9999 max. TODO: Scroll???
res = self.broker.DB.search(
- index=self.broker.config["elasticsearch"]["database"],
+ index=self.dbname,
doc_type="source",
size=9999,
body={"query": {"bool": {"must": mustArray}}, "sort": {"sourceURL": "asc"}},
diff --git a/kibble/scanners/config.yaml b/kibble/scanners/config.yaml
deleted file mode 100644
index d835539..0000000
--- a/kibble/scanners/config.yaml
+++ /dev/null
@@ -1,39 +0,0 @@
-# If enabled, kibble scanners will use direct ES connection.
-elasticsearch:
- enabled: true
- hostname: localhost
- port: 9200
- ssl: false
- uri: ""
- database: kibble
-
-# If enabled, kibble scanners will use the HTTP JSON API
-broker:
- enabled: false
- url: https://localhost/api/
- auth:
- username: kibble
- password: kibble4life
-
-# Scanner client options
-scanner:
- # scratchdir: Location for storing file objects like git repos etc
- # This should be permanent to speed up scans of large repositories
- # on consecutive scans, but may be ephemeral like /tmp
- scratchdir: /tmp
- # If you are load balancing the scans, you should specify
- # how many nodes are working, and which one you are,
- # using the format: $nodeNo/$totalNodes. If there are 4 nodes,
- # each node will gat 1/4th of all jobs to work on.
- #balance: 1/4
-
-# Watson/BlueMix configuration for sentiment analysis, if applicable
-#watson:
-# username: uuid-here
-# password: pass-here
-# api: https://gateway-location.watsonplatform.net/tone-analyzer/api
-
-# Azure Text Analysis API configuration, if applicable
-#azure:
-# apikey: key-here
-# location: west-us
diff --git a/kibble/scanners/scanners/git-census.py b/kibble/scanners/scanners/git-census.py
index 790ae49..b92f869 100644
--- a/kibble/scanners/scanners/git-census.py
+++ b/kibble/scanners/scanners/git-census.py
@@ -24,6 +24,8 @@ import subprocess
import tempfile
import time
+from kibble.configuration import conf
+
title = "Census Scanner for Git"
version = "0.1.0"
@@ -50,7 +52,7 @@ def scan(kibble_bit, source):
rid = source["sourceID"]
url = source["sourceURL"]
rootpath = "%s/%s/git" % (
- kibble_bit.config["scanner"]["scratchdir"],
+ conf.get("scanner", "scratchdir"),
source["organisation"],
)
gpath = os.path.join(rootpath, rid)
diff --git a/kibble/scanners/scanners/git-evolution.py b/kibble/scanners/scanners/git-evolution.py
index b533b4b..8f4a836 100644
--- a/kibble/scanners/scanners/git-evolution.py
+++ b/kibble/scanners/scanners/git-evolution.py
@@ -23,6 +23,7 @@ import os
import subprocess
import time
+from kibble.configuration import conf
from kibble.scanners.utils import sloc
title = "Git Evolution Scanner"
@@ -138,7 +139,7 @@ def scan(kibble_bit, source):
rid = source["sourceID"]
rootpath = "%s/%s/git" % (
- kibble_bit.config["scanner"]["scratchdir"],
+ conf.get("scanner", "scratchdir"),
source["organisation"],
)
gpath = os.path.join(rootpath, rid)
@@ -158,7 +159,7 @@ def scan(kibble_bit, source):
rid = source["sourceID"]
url = source["sourceURL"]
rootpath = "%s/%s/git" % (
- kibble_bit.config["scanner"]["scratchdir"],
+ conf.get("scanner", "scratchdir"),
source["organisation"],
)
gpath = os.path.join(rootpath, rid)
diff --git a/kibble/scanners/scanners/git-sloc.py b/kibble/scanners/scanners/git-sloc.py
index 8b44c54..4ae13ac 100644
--- a/kibble/scanners/scanners/git-sloc.py
+++ b/kibble/scanners/scanners/git-sloc.py
@@ -19,6 +19,7 @@ import os
import subprocess
import time
+from kibble.configuration import conf
from kibble.scanners.utils import git, sloc
""" Source Lines of Code counter for Git """
@@ -42,7 +43,7 @@ def scan(kibble_bit, source):
rid = source["sourceID"]
url = source["sourceURL"]
rootpath = "%s/%s/git" % (
- kibble_bit.config["scanner"]["scratchdir"],
+ conf.get("scanner", "scratchdir"),
source["organisation"],
)
gpath = os.path.join(rootpath, rid)
@@ -58,7 +59,7 @@ def scan(kibble_bit, source):
kibble_bit.update_source(source)
try:
- branch = git.defaultBranch(source, gpath)
+ branch = git.default_branch(source, gpath)
subprocess.call("cd %s && git checkout %s" % (gpath, branch), shell=True)
except: # pylint: disable=bare-except
kibble_bit.pprint("SLoC counter failed to find main branch for %s!!" % url)
diff --git a/kibble/scanners/scanners/git-sync.py b/kibble/scanners/scanners/git-sync.py
index c1c40f6..aee39af 100644
--- a/kibble/scanners/scanners/git-sync.py
+++ b/kibble/scanners/scanners/git-sync.py
@@ -19,6 +19,7 @@ import os
import subprocess
import time
+from kibble.configuration import conf
from kibble.scanners.utils import git
title = "Sync plugin for Git repositories"
@@ -41,7 +42,7 @@ def scan(kibble_bit, source):
path = source["sourceID"]
url = source["sourceURL"]
rootpath = "%s/%s/git" % (
- kibble_bit.config["scanner"]["scratchdir"],
+ conf.get("scanner", "scratchdir"),
source["organisation"],
)
@@ -79,7 +80,7 @@ def scan(kibble_bit, source):
kibble_bit.pprint("Repo %s exists, fetching changes..." % datapath)
# Do we have a default branch here?
- branch = git.defaultBranch(source, datapath, kibble_bit)
+ branch = git.default_branch(source, datapath)
if len(branch) == 0:
source["default_branch"] = branch
source["steps"]["sync"] = {
diff --git a/kibble/scanners/scanners/ponymail-kpe.py b/kibble/scanners/scanners/ponymail-kpe.py
index 8db47c2..f7988ac 100644
--- a/kibble/scanners/scanners/ponymail-kpe.py
+++ b/kibble/scanners/scanners/ponymail-kpe.py
@@ -19,6 +19,7 @@ import re
import time
from kibble.scanners.utils import jsonapi, kpe
+from kibble.settings import AZURE_ENABLED, PICOAPI_ENABLED, WATSON_ENABLED
"""
This is a Kibble scanner plugin for Apache Pony Mail sources.
@@ -64,7 +65,7 @@ def scan(kibble_bit, source):
kibble_bit.update_source(source)
return
- if not "azure" in kibble_bit.config and not "picoapi" in kibble_bit.config:
+ if not AZURE_ENABLED and not PICOAPI_ENABLED:
kibble_bit.pprint(
"No Azure/picoAPI creds configured, skipping key phrase extraction"
)
@@ -110,12 +111,12 @@ def scan(kibble_bit, source):
bodies.append(body)
if bodies:
KPEs = None
- if "watson" in kibble_bit.config:
+ if WATSON_ENABLED:
pass # Haven't written this yet
- elif "azure" in kibble_bit.config:
- KPEs = kpe.azureKPE(kibble_bit, bodies)
- elif "picoapi" in kibble_bit.config:
- KPEs = kpe.picoKPE(kibble_bit, bodies)
+ elif AZURE_ENABLED:
+ KPEs = kpe.azure_kpe(kibble_bit, bodies)
+ elif PICOAPI_ENABLED:
+ KPEs = kpe.pico_kpe(kibble_bit, bodies)
if not KPEs:
kibble_bit.pprint("Hit rate limit, not trying further emails for now.")
diff --git a/kibble/scanners/scanners/ponymail-tone.py b/kibble/scanners/scanners/ponymail-tone.py
index 4ae9330..fe31217 100644
--- a/kibble/scanners/scanners/ponymail-tone.py
+++ b/kibble/scanners/scanners/ponymail-tone.py
@@ -21,7 +21,9 @@ This is a Kibble scanner plugin for Apache Pony Mail sources.
import re
import time
+from kibble.configuration import conf
from kibble.scanners.utils import jsonapi, tone
+from kibble.settings import AZURE_ENABLED, PICOAPI_ENABLED, WATSON_ENABLED
title = "Tone/Mood Scanner plugin for Apache Pony Mail"
version = "0.1.0"
@@ -61,11 +63,7 @@ def scan(kibble_bit, source):
kibble_bit.update_source(source)
return
- if (
- not "watson" in kibble_bit.config
- and not "azure" in kibble_bit.config
- and not "picoapi" in kibble_bit.config
- ):
+ if not WATSON_ENABLED and not AZURE_ENABLED and not PICOAPI_ENABLED:
kibble_bit.pprint(
"No Watson/Azure/picoAPI creds configured, skipping tone analyzer"
)
@@ -110,12 +108,12 @@ def scan(kibble_bit, source):
bodies.append(body)
if bodies:
moods = None
- if "watson" in kibble_bit.config:
- moods = tone.watsonTone(kibble_bit, bodies)
- elif "azure" in kibble_bit.config:
- moods = tone.azureTone(kibble_bit, bodies)
- elif "picoapi" in kibble_bit.config:
- moods = tone.picoTone(kibble_bit, bodies)
+ if WATSON_ENABLED:
+ moods = tone.watson_tone(kibble_bit, bodies)
+ elif AZURE_ENABLED:
+ moods = tone.azure_tone(kibble_bit, bodies)
+ elif PICOAPI_ENABLED:
+ moods = tone.pico_tone(kibble_bit, bodies)
if not moods:
kibble_bit.pprint("Hit rate limit, not trying further emails for now.")
diff --git a/kibble/scanners/utils/git.py b/kibble/scanners/utils/git.py
index c1f3cd7..2dd9099 100644
--- a/kibble/scanners/utils/git.py
+++ b/kibble/scanners/utils/git.py
@@ -17,21 +17,20 @@
""" This is the Kibble git utility plugin """
-import os
import re
import subprocess
-import sys
+from kibble.configuration import conf
-def defaultBranch(source, datapath, KibbleBit=None):
+
+def default_branch(source, datapath):
""" Tries to figure out what the main branch of a repo is """
- wanted_branches = ["master", "main", "trunk"]
- branch = ""
# If we have an override of branches we like, use 'em
- if KibbleBit and KibbleBit.config.get("git"):
- wanted_branches = KibbleBit.config["git"].get(
- "wanted_branches", wanted_branches
- )
+ wanted_branches = conf.get("git", "wanted_branches", fallback=None)
+ if wanted_branches:
+ wanted_branches = wanted_branches.split(",")
+ else:
+ wanted_branches = ["master", "main", "trunk"]
# For each wanted branch, in order, look for it in our clone,
# and return the name if found.
diff --git a/kibble/scanners/utils/kpe.py b/kibble/scanners/utils/kpe.py
index 7eae14b..799bbb8 100644
--- a/kibble/scanners/utils/kpe.py
+++ b/kibble/scanners/utils/kpe.py
@@ -19,16 +19,16 @@
This is an experimental key phrase extraction plugin for using
Azure/picoAPI for analyzing the key elements of an email on a list. This
requires an account with a text analysis service provider, and a
-corresponding API section in config.yaml, as such:
+corresponding API section in kibble.ini, as such:
# picoAPI example:
-picoapi:
- key: abcdef1234567890
+[picoapi]
+key = abcdef1234567890
# Azure example:
-azure:
- apikey: abcdef1234567890
- location: westeurope
+[azure]
+apikey = abcdef1234567890
+location = westeurope
Currently only pony mail is supported. more to come.
"""
@@ -38,8 +38,10 @@ import re
import requests
+from kibble.configuration import conf
-def trimBody(body):
+
+def trim_body(body):
""" Quick function for trimming away the fat from emails """
# Cut away "On $date, jane doe wrote: " kind of texts
body = re.sub(
@@ -62,108 +64,104 @@ def trimBody(body):
return body
-def azureKPE(KibbleBit, bodies):
+def azure_kpe(kibble_bit, bodies):
""" KPE using Azure Text Analysis API """
- if "azure" in KibbleBit.config:
- headers = {
- "Content-Type": "application/json",
- "Ocp-Apim-Subscription-Key": KibbleBit.config["azure"]["apikey"],
- }
-
- js = {"documents": []}
-
- # For each body...
- a = 0
- KPEs = []
- for body in bodies:
- # Crop out quotes
- body = trimBody(body)
- doc = {"language": "en", "id": str(a), "text": body}
- js["documents"].append(doc)
- KPEs.append({}) # placeholder for each doc, to be replaced
- a += 1
- try:
- rv = requests.post(
- "https://%s.api.cognitive.microsoft.com/text/analytics/v2.0/keyPhrases"
- % KibbleBit.config["azure"]["location"],
- headers=headers,
- data=json.dumps(js),
- )
- jsout = rv.json()
- except:
- jsout = {} # borked sentiment analysis?
-
- if "documents" in jsout and len(jsout["documents"]) > 0:
- for doc in jsout["documents"]:
- KPEs[int(doc["id"])] = doc["keyPhrases"][
- :5
- ] # Replace KPEs[X] with the actual phrases, 5 first ones.
-
- else:
- KibbleBit.pprint("Failed to analyze email body.")
- print(jsout)
- # Depending on price tier, Azure will return a 429 if you go too fast.
- # If we see a statusCode return, let's just stop for now.
- # Later scans can pick up the slack.
- if "statusCode" in jsout:
- KibbleBit.pprint("Possible rate limiting in place, stopping for now.")
- return False
- return KPEs
-
-
-def picoKPE(KibbleBit, bodies):
+ headers = {
+ "Content-Type": "application/json",
+ "Ocp-Apim-Subscription-Key": conf.get("azure", "apikey"),
+ }
+
+ js = {"documents": []}
+
+ # For each body...
+ a = 0
+ KPEs = []
+ for body in bodies:
+ # Crop out quotes
+ body = trim_body(body)
+ doc = {"language": "en", "id": str(a), "text": body}
+ js["documents"].append(doc)
+ KPEs.append({}) # placeholder for each doc, to be replaced
+ a += 1
+ try:
+ rv = requests.post(
+ "https://%s.api.cognitive.microsoft.com/text/analytics/v2.0/keyPhrases"
+ % conf.get("azure", "location"),
+ headers=headers,
+ data=json.dumps(js),
+ )
+ jsout = rv.json()
+ except:
+ jsout = {} # borked sentiment analysis?
+
+ if "documents" in jsout and len(jsout["documents"]) > 0:
+ for doc in jsout["documents"]:
+ KPEs[int(doc["id"])] = doc["keyPhrases"][
+ :5
+ ] # Replace KPEs[X] with the actual phrases, 5 first ones.
+
+ else:
+ kibble_bit.pprint("Failed to analyze email body.")
+ print(jsout)
+ # Depending on price tier, Azure will return a 429 if you go too fast.
+ # If we see a statusCode return, let's just stop for now.
+ # Later scans can pick up the slack.
+ if "statusCode" in jsout:
+ kibble_bit.pprint("Possible rate limiting in place, stopping for now.")
+ return False
+ return KPEs
+
+
+def pico_kpe(kibble_bit, bodies):
""" KPE using picoAPI Text Analysis """
- if "picoapi" in KibbleBit.config:
- headers = {
- "Content-Type": "application/json",
- "PicoAPI-Key": KibbleBit.config["picoapi"]["key"],
- }
-
- js = {"texts": []}
-
- # For each body...
- a = 0
- KPEs = []
- for body in bodies:
- body = trimBody(body)
-
- doc = {"id": str(a), "body": body}
- js["texts"].append(doc)
- KPEs.append({}) # placeholder for each doc, to be replaced
- a += 1
- try:
- rv = requests.post(
- "https://v1.picoapi.com/api/text/keyphrase",
- headers=headers,
- data=json.dumps(js),
- )
- jsout = rv.json()
- except:
- jsout = {} # borked sentiment analysis?
-
- if "results" in jsout and len(jsout["results"]) > 0:
- for doc in jsout["results"]:
- phrases = []
- # This is a bit different than Azure, in that it has a weighting score
- # So we need to just extract key phrases above a certain level.
- # Grab up o 5 key phrases per text
- MINIMUM_WEIGHT = 0.02
- for element in doc["keyphrases"]:
- if element["score"] > MINIMUM_WEIGHT:
- phrases.append(element["phrase"])
- if len(phrases) == 5:
- break
- KPEs[
- int(doc["id"])
- ] = phrases # Replace KPEs[X] with the actual phrases
-
- else:
- KibbleBit.pprint("Failed to analyze email body.")
- print(jsout)
- # 403 returned on invalid key, 429 on rate exceeded.
- # If we see a code return, let's just stop for now.
- # Later scans can pick up the slack.
- if "code" in jsout:
- KibbleBit.pprint("Possible rate limiting in place, stopping for now.")
- return False
- return KPEs
+ headers = {
+ "Content-Type": "application/json",
+ "PicoAPI-Key": conf.get("picoapi", "key"),
+ }
+
+ js = {"texts": []}
+
+ # For each body...
+ a = 0
+ KPEs = []
+ for body in bodies:
+ body = trim_body(body)
+
+ doc = {"id": str(a), "body": body}
+ js["texts"].append(doc)
+ KPEs.append({}) # placeholder for each doc, to be replaced
+ a += 1
+ try:
+ rv = requests.post(
+ "https://v1.picoapi.com/api/text/keyphrase",
+ headers=headers,
+ data=json.dumps(js),
+ )
+ jsout = rv.json()
+ except:
+ jsout = {} # borked sentiment analysis?
+
+ if "results" in jsout and len(jsout["results"]) > 0:
+ for doc in jsout["results"]:
+ phrases = []
+ # This is a bit different than Azure, in that it has a weighting score
+ # So we need to just extract key phrases above a certain level.
+ # Grab up to 5 key phrases per text
+ MINIMUM_WEIGHT = 0.02
+ for element in doc["keyphrases"]:
+ if element["score"] > MINIMUM_WEIGHT:
+ phrases.append(element["phrase"])
+ if len(phrases) == 5:
+ break
+ KPEs[int(doc["id"])] = phrases # Replace KPEs[X] with the actual phrases
+
+ else:
+ kibble_bit.pprint("Failed to analyze email body.")
+ print(jsout)
+ # 403 returned on invalid key, 429 on rate exceeded.
+ # If we see a code return, let's just stop for now.
+ # Later scans can pick up the slack.
+ if "code" in jsout:
+ kibble_bit.pprint("Possible rate limiting in place, stopping for now.")
+ return False
+ return KPEs
diff --git a/kibble/scanners/utils/tone.py b/kibble/scanners/utils/tone.py
index c920b5e..df480a4 100644
--- a/kibble/scanners/utils/tone.py
+++ b/kibble/scanners/utils/tone.py
@@ -18,12 +18,12 @@
"""
This is an experimental tone analyzer plugin for using Watson/BlueMix for
analyzing the mood of email on a list. This requires a Watson account
-and a watson section in config.yaml, as such:
+and a watson section in kibble.ini, as such:
-watson:
- username: $user
- password: $pass
- api: https://$something.watsonplatform.net/tone-analyzer/api
+[watson]
+username = $user
+password = $pass
+api = https://$something.watsonplatform.net/tone-analyzer/api
Currently only pony mail is supported. more to come.
"""
@@ -32,160 +32,159 @@ import json
import requests
+from kibble.configuration import conf
-def watsonTone(KibbleBit, bodies):
- """ Sentiment analysis using IBM Watson """
- if "watson" in KibbleBit.config:
- headers = {"Content-Type": "application/json"}
- # Crop out quotes
- for body in bodies:
- lines = body.split("\n")
- body = "\n".join([x for x in lines if not x.startswith(">")])
-
- js = {"text": body}
- try:
- rv = requests.post(
- "%s/v3/tone?version=2017-09-21&sentences=false"
- % KibbleBit.config["watson"]["api"],
- headers=headers,
- data=json.dumps(js),
- auth=(
- KibbleBit.config["watson"]["username"],
- KibbleBit.config["watson"]["password"],
- ),
- )
- jsout = rv.json()
- except:
- jsout = {} # borked Watson?
- mood = {}
- if "document_tone" in jsout:
- for tone in jsout["document_tone"]["tones"]:
- mood[tone["tone_id"]] = tone["score"]
- else:
- KibbleBit.pprint("Failed to analyze email body.")
- yield mood
+def watson_tone(kibble_bit, bodies):
+ """ Sentiment analysis using IBM Watson """
+ headers = {"Content-Type": "application/json"}
+ # Crop out quotes
+ for body in bodies:
+ lines = body.split("\n")
+ body = "\n".join([x for x in lines if not x.startswith(">")])
-def azureTone(KibbleBit, bodies):
- """ Sentiment analysis using Azure Text Analysis API """
- if "azure" in KibbleBit.config:
- headers = {
- "Content-Type": "application/json",
- "Ocp-Apim-Subscription-Key": KibbleBit.config["azure"]["apikey"],
- }
-
- js = {"documents": []}
-
- # For each body...
- a = 0
- moods = []
- for body in bodies:
- # Crop out quotes
- lines = body.split("\n")
- body = "\n".join([x for x in lines if not x.startswith(">")])
- doc = {"language": "en", "id": str(a), "text": body}
- js["documents"].append(doc)
- moods.append({}) # placeholder for each doc, to be replaced
- a += 1
+ js = {"text": body}
try:
rv = requests.post(
- "https://%s.api.cognitive.microsoft.com/text/analytics/v2.0/sentiment"
- % KibbleBit.config["azure"]["location"],
+ "%s/v3/tone?version=2017-09-21&sentences=false"
+ % conf.get("watson", "api"),
headers=headers,
data=json.dumps(js),
+ auth=(
+ conf.get("watson", "username"),
+ conf.get("watson", "password"),
+ ),
)
jsout = rv.json()
except:
- jsout = {} # borked sentiment analysis?
-
- if "documents" in jsout and len(jsout["documents"]) > 0:
- for doc in jsout["documents"]:
- mood = {}
- # This is more parred than Watson, so we'll split it into three groups: positive, neutral and negative.
- # Divide into four segments, 0->40%, 25->75% and 60->100%.
- # 0-40 promotes negative, 60-100 promotes positive, and 25-75% promotes neutral.
- # As we don't want to over-represent negative/positive where the results are
- # muddy, the neutral zone is larger than the positive/negative zones by 10%.
- val = doc["score"]
- mood["negative"] = max(
- 0, ((0.4 - val) * 2.5)
- ) # For 40% and below, use 2½ distance
- mood["positive"] = max(
- 0, ((val - 0.6) * 2.5)
- ) # For 60% and above, use 2½ distance
- mood["neutral"] = max(
- 0, 1 - (abs(val - 0.5) * 2)
- ) # Between 25% and 75% use double the distance to middle.
- moods[int(doc["id"])] = mood # Replace moods[X] with the actual mood
-
+ jsout = {} # borked Watson?
+ mood = {}
+ if "document_tone" in jsout:
+ for tone in jsout["document_tone"]["tones"]:
+ mood[tone["tone_id"]] = tone["score"]
else:
- KibbleBit.pprint("Failed to analyze email body.")
- print(jsout)
- # Depending on price tier, Azure will return a 429 if you go too fast.
- # If we see a statusCode return, let's just stop for now.
- # Later scans can pick up the slack.
- if "statusCode" in jsout:
- KibbleBit.pprint("Possible rate limiting in place, stopping for now.")
- return False
- return moods
-
-
-def picoTone(KibbleBit, bodies):
- """ Sentiment analysis using picoAPI Text Analysis """
- if "picoapi" in KibbleBit.config:
- headers = {
- "Content-Type": "application/json",
- "PicoAPI-Key": KibbleBit.config["picoapi"]["key"],
- }
-
- js = {"texts": []}
-
- # For each body...
- a = 0
- moods = []
- for body in bodies:
- # Crop out quotes
- lines = body.split("\n")
- body = "\n".join([x for x in lines if not x.startswith(">")])
- doc = {"id": str(a), "body": body}
- js["texts"].append(doc)
- moods.append({}) # placeholder for each doc, to be replaced
- a += 1
- try:
- rv = requests.post(
- "https://v1.picoapi.com/api/text/sentiment",
- headers=headers,
- data=json.dumps(js),
- )
- jsout = rv.json()
- except:
- jsout = {} # borked sentiment analysis?
+ kibble_bit.pprint("Failed to analyze email body.")
+ yield mood
- if "results" in jsout and len(jsout["results"]) > 0:
- for doc in jsout["results"]:
- mood = {
- "negative": doc["negativity"],
- "positive": doc["positivity"],
- "neutral": doc["neutrality"],
- }
- # Sentiment is the overall score, and we use that for the neutrality of a text
+def azure_tone(kibble_bit, bodies):
+ """ Sentiment analysis using Azure Text Analysis API """
+ headers = {
+ "Content-Type": "application/json",
+ "Ocp-Apim-Subscription-Key": conf.get("azure", "apikey"),
+ }
+
+ js = {"documents": []}
- # Additional (optional) emotion weighting
- if "emotions" in doc:
- for k, v in doc["emotions"].items():
- mood[k] = v / 100 # Value is between 0 and 100.
+ # For each body...
+ a = 0
+ moods = []
+ for body in bodies:
+ # Crop out quotes
+ lines = body.split("\n")
+ body = "\n".join([x for x in lines if not x.startswith(">")])
+ doc = {"language": "en", "id": str(a), "text": body}
+ js["documents"].append(doc)
+ moods.append({}) # placeholder for each doc, to be replaced
+ a += 1
+ try:
+ rv = requests.post(
+ "https://%s.api.cognitive.microsoft.com/text/analytics/v2.0/sentiment"
+ % conf.get("azure", "location"),
+ headers=headers,
+ data=json.dumps(js),
+ )
+ jsout = rv.json()
+ except:
+ jsout = {} # borked sentiment analysis?
+
+ if "documents" in jsout and len(jsout["documents"]) > 0:
+ for doc in jsout["documents"]:
+ mood = {}
+ # This is more parred than Watson, so we'll split it into three groups: positive, neutral and negative.
+ # Divide into four segments, 0->40%, 25->75% and 60->100%.
+ # 0-40 promotes negative, 60-100 promotes positive, and 25-75% promotes neutral.
+ # As we don't want to over-represent negative/positive where the results are
+ # muddy, the neutral zone is larger than the positive/negative zones by 10%.
+ val = doc["score"]
+ mood["negative"] = max(
+ 0, ((0.4 - val) * 2.5)
+ ) # For 40% and below, use 2½ distance
+ mood["positive"] = max(
+ 0, ((val - 0.6) * 2.5)
+ ) # For 60% and above, use 2½ distance
+ mood["neutral"] = max(
+ 0, 1 - (abs(val - 0.5) * 2)
+ ) # Between 25% and 75% use double the distance to middle.
+ moods[int(doc["id"])] = mood # Replace moods[X] with the actual mood
+
+ else:
+ kibble_bit.pprint("Failed to analyze email body.")
+ print(jsout)
+ # Depending on price tier, Azure will return a 429 if you go too fast.
+ # If we see a statusCode return, let's just stop for now.
+ # Later scans can pick up the slack.
+ if "statusCode" in jsout:
+ kibble_bit.pprint("Possible rate limiting in place, stopping for now.")
+ return False
+ return moods
+
+
+def pico_tone(kibble_bit, bodies):
+ """ Sentiment analysis using picoAPI Text Analysis """
+ headers = {
+ "Content-Type": "application/json",
+ "PicoAPI-Key": conf.get("picoapi", "key"),
+ }
- moods[int(doc["id"])] = mood # Replace moods[X] with the actual mood
+ js = {"texts": []}
- else:
- KibbleBit.pprint("Failed to analyze email body.")
- print(jsout)
- # 403 returned on invalid key, 429 on rate exceeded.
- # If we see a code return, let's just stop for now.
- # Later scans can pick up the slack.
- if "code" in jsout:
- KibbleBit.pprint("Possible rate limiting in place, stopping for now.")
- return False
- return moods
+ # For each body...
+ a = 0
+ moods = []
+ for body in bodies:
+ # Crop out quotes
+ lines = body.split("\n")
+ body = "\n".join([x for x in lines if not x.startswith(">")])
+ doc = {"id": str(a), "body": body}
+ js["texts"].append(doc)
+ moods.append({}) # placeholder for each doc, to be replaced
+ a += 1
+ try:
+ rv = requests.post(
+ "https://v1.picoapi.com/api/text/sentiment",
+ headers=headers,
+ data=json.dumps(js),
+ )
+ jsout = rv.json()
+ except:
+ jsout = {} # borked sentiment analysis?
+
+ if "results" in jsout and len(jsout["results"]) > 0:
+ for doc in jsout["results"]:
+ mood = {
+ "negative": doc["negativity"],
+ "positive": doc["positivity"],
+ "neutral": doc["neutrality"],
+ }
+
+ # Sentiment is the overall score, and we use that for the neutrality of a text
+
+ # Additional (optional) emotion weighting
+ if "emotions" in doc:
+ for k, v in doc["emotions"].items():
+ mood[k] = v / 100 # Value is between 0 and 100.
+
+ moods[int(doc["id"])] = mood # Replace moods[X] with the actual mood
+
+ else:
+ kibble_bit.pprint("Failed to analyze email body.")
+ print(jsout)
+ # 403 returned on invalid key, 429 on rate exceeded.
+ # If we see a code return, let's just stop for now.
+ # Later scans can pick up the slack.
+ if "code" in jsout:
+ kibble_bit.pprint("Possible rate limiting in place, stopping for now.")
+ return False
+ return moods
diff --git a/kibble/settings.py b/kibble/settings.py
index d165bb2..b9c4003 100644
--- a/kibble/settings.py
+++ b/kibble/settings.py
@@ -17,6 +17,8 @@
import os
+from kibble.configuration import conf
+
YAML_DIRECTORY = os.path.join(
os.path.dirname(os.path.realpath(__file__)), "api", "yaml"
)
@@ -24,3 +26,7 @@ KIBBLE_YAML = os.path.join(YAML_DIRECTORY, "kibble.yaml")
MAPPING_DIRECTORY = os.path.join(
os.path.dirname(os.path.realpath(__file__)), "mappings"
)
+
+WATSON_ENABLED = bool(conf.get("watson", "username", fallback=None))
+AZURE_ENABLED = bool(conf.get("azure", "apikey", fallback=None))
+PICOAPI_ENABLED = bool(conf.get("picoapi", "key", fallback=None))