You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ponymail.apache.org by hu...@apache.org on 2020/09/06 07:09:34 UTC

[incubator-ponymail-foal] 02/02: Allow storing of html-only emails even if html2text is not enabled

This is an automated email from the ASF dual-hosted git repository.

humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-ponymail-foal.git

commit d653fc5bd65d81e752507ca93b07429ec7185791
Author: Daniel Gruno <hu...@apache.org>
AuthorDate: Sun Sep 6 09:06:11 2020 +0200

    Allow storing of html-only emails even if html2text is not enabled
    
    This means storing the raw HTML source and setting a flag for the unit
    tests so they won't break in the cases where foal and pony differ.
---
 tools/archiver.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/tools/archiver.py b/tools/archiver.py
index c52207b..9222a66 100755
--- a/tools/archiver.py
+++ b/tools/archiver.py
@@ -181,6 +181,7 @@ class Body:
         self.string: typing.Optional[str] = None
         self.flowed = "format=flowed" in part.get("content-type", "")
         self.bytes = part.get_payload(decode=True)
+        self.html_as_source = False
         if self.bytes is not None:
             valid_encodings = [x for x in self.charsets if x]
             if valid_encodings:
@@ -312,8 +313,7 @@ class Archiver(object):  # N.B. Also used by import-mbox.py
                 ]:
                     body = Body(part)
                 elif (
-                    self.html
-                    and not first_html
+                    not first_html
                     and part.get_content_type() == "text/html"
                 ):
                     first_html = Body(part)
@@ -327,7 +327,12 @@ class Archiver(object):  # N.B. Also used by import-mbox.py
             or (self.ignore_body and str(body).find(str(self.ignore_body)) != -1)
         ):
             body = first_html
-            body.assign(self.html2text(str(body)))
+            body.html_as_source = True
+
+            # Convert HTML to text if mod is installed and enabled, otherwise keep the source as-is
+            if self.html:
+                body.assign(self.html2text(str(body)))
+                body.html_as_source = False
         return body
 
     # N.B. this is also called by import-mbox.py
@@ -415,13 +420,20 @@ class Archiver(object):  # N.B. Also used by import-mbox.py
         if body is not None or attachments:
             pmid = mid
             id_set = set()  # Use a set to avoid duplicates
+            # The body used for generators differs from the body put into the meta doc,
+            # for historical reasons. In the older generators where it is actively used,
+            # it would be UTF-8 bytes in cases of charset-less message bodies. It would
+            # also be nothing in case of html-only emails where html2text is not enabled.
+            generator_body = body if body and body.character_set else body and body.bytes or ""
+            if body.html_as_source:
+                generator_body = ""
             for generator in self.generator.split(" "):
                 if generator:
                     try:
                         mid = plugins.generators.generate(
                             generator,
                             msg,
-                            body if body and body.character_set else body and body.bytes or "",
+                            generator_body,
                             lid,
                             attachments,
                             raw_msg,
@@ -469,6 +481,7 @@ class Archiver(object):  # N.B. Also used by import-mbox.py
                 "references": msg_metadata["references"],
                 "in-reply-to": irt,
                 "body": body.unflow() if body else "",
+                "html_source_only": body.html_as_source,
                 "attachments": attachments,
             }