You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kibble.apache.org by hu...@apache.org on 2018/01/09 00:48:51 UTC

[kibble-scanners] branch master updated: Better trimming of unnecessary text elements

This is an automated email from the ASF dual-hosted git repository.

humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kibble-scanners.git


The following commit(s) were added to refs/heads/master by this push:
     new 9e3ecdf  Better trimming of unnecessary text elements
9e3ecdf is described below

commit 9e3ecdf73602892ec220b32f109b9ee0af1f368f
Author: Daniel Gruno <hu...@apache.org>
AuthorDate: Tue Jan 9 01:48:41 2018 +0100

    Better trimming of unnecessary text elements
    
    We don't want to be analysing:
    - quotes
    - "on $date, bla bla wrote" sort of sentences
    - URLs, email addresses
---
 src/plugins/utils/kpe.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/plugins/utils/kpe.py b/src/plugins/utils/kpe.py
index 7985d3e..0390100 100644
--- a/src/plugins/utils/kpe.py
+++ b/src/plugins/utils/kpe.py
@@ -42,6 +42,22 @@ import requests
 import json
 import uuid
 
+def trimBody(body):
+    """ Quick function for trimming away the fat from emails """
+    # Cut away "On $date, jane doe wrote: " kind of texts
+    body = re.sub(r"((?:\r?\n)((on .+ wrote:[\r\n]+)|(sent from my .+)|(>+[ \t]+[^\r\n]*\r?\n[^\n]*\n*)+)+)+", "", body, flags = re.I | re.M)
+    
+    # Crop out quotes
+    lines = body.split("\n")
+    body = "\n".join([x for x in lines if not x.startswith(">")])
+    
+    # Remove hyperlinks
+    body = re.sub(r"[a-z]+://\S+", "", body)
+    
+    # Remove email addresses
+    body = re.sub(r"(<[^>]+>\s*\S+@\S+)", "", body)
+    body = re.sub(r"(\S+@\S+)", "", body)
+    return body
 
 def azureKPE(KibbleBit, bodies):
     """ KPE using Azure Text Analysis API """
@@ -62,7 +78,7 @@ def azureKPE(KibbleBit, bodies):
         for body in bodies:
             # Crop out quotes
             lines = body.split("\n")
-            body = "\n".join([x for x in lines if not x.startswith(">")])
+            body = trimBody(body)
             doc = {
                 "language": "en",
                 "id": str(a),
@@ -113,9 +129,8 @@ def picoKPE(KibbleBit, bodies):
         a = 0
         KPEs = []
         for body in bodies:
-            # Crop out quotes
-            lines = body.split("\n")
-            body = "\n".join([x for x in lines if not x.startswith(">")])
+            body = trimBody(body)
+            
             doc = {
                 "id": str(a),
                 "body": body

-- 
To stop receiving notification emails like this one, please contact
"commits@kibble.apache.org" <co...@kibble.apache.org>.