You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kibble.apache.org by hu...@apache.org on 2018/01/09 00:48:51 UTC
[kibble-scanners] branch master updated: Better trimming of unnecessary text elements
This is an automated email from the ASF dual-hosted git repository.
humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kibble-scanners.git
The following commit(s) were added to refs/heads/master by this push:
new 9e3ecdf Better trimming of unnecessary text elements
9e3ecdf is described below
commit 9e3ecdf73602892ec220b32f109b9ee0af1f368f
Author: Daniel Gruno <hu...@apache.org>
AuthorDate: Tue Jan 9 01:48:41 2018 +0100
Better trimming of unnecessary text elements
We don't want to be analysing:
- quoted text (lines starting with ">")
- attribution lines such as "On $date, Jane Doe wrote:" and "Sent from my ..." signatures
- URLs and email addresses
---
src/plugins/utils/kpe.py | 23 +++++++++++++++++++----
1 file changed, 19 insertions(+), 4 deletions(-)
diff --git a/src/plugins/utils/kpe.py b/src/plugins/utils/kpe.py
index 7985d3e..0390100 100644
--- a/src/plugins/utils/kpe.py
+++ b/src/plugins/utils/kpe.py
@@ -42,6 +42,22 @@ import requests
import json
import uuid
+def trimBody(body):
+ """ Quick function for trimming away the fat from emails """
+ # Cut away "On $date, jane doe wrote: " kind of texts
+ body = re.sub(r"((?:\r?\n)((on .+ wrote:[\r\n]+)|(sent from my .+)|(>+[ \t]+[^\r\n]*\r?\n[^\n]*\n*)+)+)+", "", body, flags = re.I | re.M)
+
+ # Crop out quotes
+ lines = body.split("\n")
+ body = "\n".join([x for x in lines if not x.startswith(">")])
+
+ # Remove hyperlinks
+ body = re.sub(r"[a-z]+://\S+", "", body)
+
+ # Remove email addresses
+ body = re.sub(r"(<[^>]+>\s*\S+@\S+)", "", body)
+ body = re.sub(r"(\S+@\S+)", "", body)
+ return body
def azureKPE(KibbleBit, bodies):
""" KPE using Azure Text Analysis API """
@@ -62,7 +78,7 @@ def azureKPE(KibbleBit, bodies):
for body in bodies:
# Crop out quotes
lines = body.split("\n")
- body = "\n".join([x for x in lines if not x.startswith(">")])
+ body = trimBody(body)
doc = {
"language": "en",
"id": str(a),
@@ -113,9 +129,8 @@ def picoKPE(KibbleBit, bodies):
a = 0
KPEs = []
for body in bodies:
- # Crop out quotes
- lines = body.split("\n")
- body = "\n".join([x for x in lines if not x.startswith(">")])
+ body = trimBody(body)
+
doc = {
"id": str(a),
"body": body
--
To stop receiving notification emails like this one, please contact
"commits@kibble.apache.org" <co...@kibble.apache.org>.