You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ponymail.apache.org by hu...@apache.org on 2020/08/17 23:46:45 UTC

[incubator-ponymail-foal] branch master updated: Make a Body class for message bodies, add unflowing into it

This is an automated email from the ASF dual-hosted git repository.

humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-ponymail-foal.git


The following commit(s) were added to refs/heads/master by this push:
     new 68af505  Make a Body class for message bodies, add unflowing into it
68af505 is described below

commit 68af50513c5c81fe066638539cd3ab12f4fc0572
Author: Daniel Gruno <hu...@apache.org>
AuthorDate: Tue Aug 18 01:43:55 2020 +0200

    Make a Body class for message bodies, add unflowing into it
    
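    This should make it easier to work with message bodies.
    The Body class supports the standard string operations (str(), len() and
    .encode()), and also adds an .unflow() method plus metadata (content
    type, character set, and whether the part is format=flowed). This allows
    us to unflow later, when archiving, so as not to disturb older
    generators.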
---
 tools/archiver.py           | 135 ++++++++++++++++++++++----------------------
 tools/plugins/generators.py |   8 +--
 2 files changed, 70 insertions(+), 73 deletions(-)

diff --git a/tools/archiver.py b/tools/archiver.py
index 93fd668..ecea544 100755
--- a/tools/archiver.py
+++ b/tools/archiver.py
@@ -126,19 +126,6 @@ def parse_attachment(
     return None, None
 
 
-def pm_charsets(msg: email.message.Message) -> typing.Set[str]:
-    """
-    Figures out and returns all character sets for a message or message part
-    :param msg: The email or message part to analyze
-    :return: all found charsets
-    """
-    charsets = set({})
-    for c in msg.get_charsets():
-        if c is not None:
-            charsets.update([c])
-    return charsets
-
-
 def normalize_lid(lid: str) -> str:  # N.B. Also used by import-mbox.py
     """ Ensures that a List ID is in standard form, i.e. <a.b.c.d> """
     # If of format "list name" <foo.bar.baz>
@@ -174,6 +161,46 @@ def message_attachments(msg: email.message.Message) -> typing.Tuple[list, dict]:
     return attachments, contents
 
 
+class Body:
+    def __init__(self, part: email.message.MIMEPart):
+        self.content_type = part.get_content_type()
+        self.charsets = set([part.get_charset()])  # Part's charset
+        self.charsets.update(part.get_charsets())  # Parent charsets as fallback
+        self.character_set = "utf-8"
+        self.string = None
+        self.flowed = True if "format=flowed" in part.get("content-type", "") else False
+        contents = part.get_payload(decode=True)
+        if contents is not None:
+            for cs in self.charsets:
+                if cs:
+                    try:
+                        self.string = contents.decode(cs)
+                        self.character_set = cs
+                    except UnicodeDecodeError:
+                        pass
+            if not self.string:
+                self.string = contents.decode("utf-8", errors="replace")
+
+    def __str__(self):
+        return self.string or "None"
+
+    def __len__(self):
+        return len(self.string or "")
+
+    def encode(self, charset="utf-8", errors="strict"):
+        return self.string.encode(charset, errors=errors)
+
+    def unflow(self):
+        if self.string:
+            if self.flowed:
+                return formatflowed.convertToWrapped(
+                    self.string.encode(self.character_set, errors="ignore"),
+                    wrap_fixed=False,
+                    character_set=self.character_set,
+                )
+        return self.string
+
+
 class Archiver(object):  # N.B. Also used by import-mbox.py
     """The general archiver class. Compatible with MailMan3 archiver classes."""
 
@@ -216,9 +243,19 @@ class Archiver(object):  # N.B. Also used by import-mbox.py
         self.cropout = config.get("debug", {}).get("cropout")
         if parse_html:
             import html2text
+
             self.html2text = html2text.html2text
 
-    def message_body(self, msg: email.message.Message, verbose=False, ignore_body=None):
+    def message_body(
+        self, msg: email.message.Message, verbose=False, ignore_body=None
+    ) -> Body:
+        """
+            Fetches the proper text body from an email as an archiver.Body object
+        :param msg: The email or part of it to examine for proper body
+        :param verbose: Verbose output while parsing
+        :param ignore_body: Optional bodies to ignore while parsing
+        :return: archiver.Body object
+        """
         body = None
         first_html = None
         for part in msg.walk():
@@ -230,73 +267,36 @@ class Archiver(object):  # N.B. Also used by import-mbox.py
                 Note: cannot use break here because firstHTML is needed if len(body) <= 1
             """
             try:
-                if not body and part.get_content_type() == "text/plain":
-                    body = part.get_payload(decode=True)
-                if not body and part.get_content_type() == "text/enriched":
-                    body = part.get_payload(decode=True)
+                if not body and part.get_content_type() in [
+                    "text/plain",
+                    "text/enriched",
+                ]:
+                    body = Body(part)
                 elif (
                     self.html
                     and not first_html
                     and part.get_content_type() == "text/html"
                 ):
-                    first_html = part.get_payload(decode=True)
+                    first_html = Body(part)
             except Exception as err:
                 print(err)
 
         # this requires a GPL lib, user will have to install it themselves
         if first_html and (
-            not body
+            body is None
             or len(body) <= 1
             or (ignore_body and str(body).find(str(ignore_body)) != -1)
         ):
-            body = self.html2text(
-                first_html.decode("utf-8", "ignore")
-                if type(first_html) is bytes
-                else first_html
-            )
-
-        # See issue#463
-        # This code will try at most one charset
-        # If the decode fails, it will use utf-8
-        if body is not None:
-            for charset in pm_charsets(msg):
-                try:
-                    body = body.decode(charset) if type(body) is bytes else body
-                    # at this point body can no longer be bytes
-                except UnicodeDecodeError:
-                    body = (
-                        body.decode("utf-8", errors="replace")
-                        if type(body) is bytes
-                        else body
-                    )
-                    # at this point body can no longer be bytes
-
+            content_type = "text/html"
+            body = first_html
+            body.string = self.html2text(body.string)
         return body
 
     def format_flowed(self, body, msg_metadata):
-        try:
-            if (
-                msg_metadata.get("content-type")
-                and msg_metadata.get("content-type", "").find("format=flowed") != -1
-            ):
-                body = formatflowed.convertToWrapped(
-                    bytes(body, "utf-8"), character_set="utf-8"
-                )
-            if isinstance(body, str):
-                body = body.encode("utf-8")
-        except UnicodeEncodeError:
-            try:
-                body = body.decode(chardet.detect(body)["encoding"])
-            except UnicodeDecodeError:
-                try:
-                    body = body.decode("latin-1")
-                except UnicodeDecodeError:
-                    try:
-                        if isinstance(body, str):
-                            body = body.encode("utf-8")
-                    except UnicodeEncodeError:
-                        body = None
-        return body
+        if body and body.flowed:
+            return formatflowed.decode(body.encode("utf-8"))
+        else:
+            return body
 
     # N.B. this is also called by import-mbox.py
     def compute_updates(
@@ -376,7 +376,6 @@ class Archiver(object):  # N.B. Also used by import-mbox.py
         # message_date calculations are all done, prepare the index entry
         date_as_string = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))
         body = self.message_body(msg, verbose=args.verbose, ignore_body=args.ibody)
-        body = self.format_flowed(body, msg_metadata)
 
         attachments, contents = message_attachments(msg)
         irt = ""
@@ -425,9 +424,7 @@ class Archiver(object):  # N.B. Also used by import-mbox.py
                 "private": private,
                 "references": msg_metadata["references"],
                 "in-reply-to": irt,
-                "body": body.decode("utf-8", "replace")
-                if type(body) is bytes
-                else body,
+                "body": body.unflow(),
                 "attachments": attachments,
             }
 
diff --git a/tools/plugins/generators.py b/tools/plugins/generators.py
index 72e768c..949a3b6 100644
--- a/tools/plugins/generators.py
+++ b/tools/plugins/generators.py
@@ -202,7 +202,7 @@ def medium(msg, body, lid, _attachments, _raw_msg):
     """
 
     # Use text body
-    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
+    xbody = body.encode('utf-8', 'ignore')
     # Use List ID
     xbody += bytes(lid, encoding='ascii')
     # Use Date header
@@ -250,7 +250,7 @@ def medium_original(msg, body, lid, _attachments, _raw_msg):
     """
 
     # Use text body
-    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
+    xbody = body.encode('utf-8', 'ignore')
     # Use List ID
     xbody += lid  # WRONG: Should be: bytes(lid, 'ascii')
 
@@ -305,7 +305,7 @@ def cluster(msg, body, lid, attachments, _raw_msg):
     # Use text body
     if not body:  # Make sure body is not None, which will fail.
         body = ""
-    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
+    xbody = body.encode('utf-8', 'ignore')
 
     # Crop out any trailing whitespace in body
     xbody = re.sub(b"\s+$", b"", xbody)
@@ -370,7 +370,7 @@ def legacy(msg, body, lid, _attachments, _raw_msg):
     except:
         pass
     mid = "%s@%s@%s" % (
-    hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
+    hashlib.sha224(body if type(body) is bytes else body.encode('utf-8', 'ignore')).hexdigest(), uid_mdate, lid)
     return mid