You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@ponymail.apache.org by hu...@apache.org on 2020/09/04 23:44:20 UTC

[incubator-ponymail-foal] 01/02: re-align with old pony for cluster generator and unit tests

This is an automated email from the ASF dual-hosted git repository.

humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-ponymail-foal.git

commit f314d5250999e2afb2ab5063d35afe7d1c1114fa
Author: Daniel Gruno <hu...@apache.org>
AuthorDate: Sat Sep 5 01:41:57 2020 +0200

    re-align with old pony for cluster generator and unit tests
    
    The general idea here is: if we find an email without a charset at all,
    and we detect non-ASCII characters in it, we assume it must be UTF-8 and
    grab the raw bytes. We also convert it internally to a string for the
    Body class, but we don't set the Body class' character set to anything.
    This way, we keep the cluster generator happy by passing it bytes, while
    keeping the rest happy by having a string representation that can be
    unflowed. As DKIM does not use the msgbody itself, it won't be affected
    by this change.
---
 tools/archiver.py           | 53 ++++++++++++++++++++++++++++++---------------
 tools/plugins/generators.py |  7 +++---
 2 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/tools/archiver.py b/tools/archiver.py
index cfa3c3a..82ad32c 100755
--- a/tools/archiver.py
+++ b/tools/archiver.py
@@ -143,9 +143,7 @@ def normalize_lid(lid: str) -> str:  # N.B. Also used by import-mbox.py
     # Belt-and-braces: remove possible extraneous chars
     lid = "<%s>" % lid.strip(" <>").replace("@", ".")
     # Replace invalid characters with underscores so as to not invalidate doc IDs.
-    lid = re.sub(
-        r"[^-+~_<>.a-zA-Z0-9@]", "_", lid
-    )
+    lid = re.sub(r"[^-+~_<>.a-zA-Z0-9@]", "_", lid)
     # Finally, ensure we have a loosely valid list ID value
     if not re.match(r"^<.+\..+>$", lid):
         print("Invalid list-id %s" % lid)
@@ -172,24 +170,39 @@ def message_attachments(msg: email.message.Message) -> typing.Tuple[list, dict]:
 class Body:
     def __init__(self, part: email.message.Message):
         self.content_type = part.get_content_type()
-        self.charsets = set([part.get_content_charset()])  # Part's charset
-        self.charsets.update(
-            [part.get_charsets()[0]]
-        )  # Parent charset as fallback if any/different
-        self.character_set = "us-ascii"
+        self.charsets = [part.get_content_charset()]  # Part's charset
+        parent_charset = part.get_charsets()[0]
+        if parent_charset and parent_charset != self.charsets[0]:
+            self.charsets.append(
+                parent_charset
+            )  # Parent charset as fallback if any/different
+        self.character_set = None
+        self.has_charset = False
         self.string: typing.Optional[str] = None
         self.flowed = "format=flowed" in part.get("content-type", "")
-        contents = part.get_payload(decode=True)
-        if contents is not None:
-            for cs in self.charsets:
-                if cs:
+        self.bytes = part.get_payload(decode=True)
+        if self.bytes is not None:
+            valid_encodings = [x for x in self.charsets if x]
+            if valid_encodings:
+                for cs in valid_encodings:
                     try:
-                        self.string = contents.decode(cs)
+                        self.string = self.bytes.decode(cs)
                         self.character_set = str(cs)
+                        self.has_charset = True
+                        break
                     except UnicodeDecodeError:
                         pass
             if not self.string:
-                self.string = contents.decode("us-ascii", errors="replace")
+                self.string = self.bytes.decode("us-ascii", errors="replace")
+                if valid_encodings:
+                    self.character_set = "us-ascii"
+                # If no character encoding, but we find non-ASCII chars, assume bytes were UTF-8
+                elif len(self.bytes) != len(self.bytes.decode("us-ascii", "ignore")):
+                    part.set_charset("utf-8")
+                    self.bytes = part.get_payload(decode=True)
+                    # Set the .string, but not a character set, as we don't know it for sure.
+                    # This is mainly so the older generators won't barf.
+                    self.string = self.bytes.decode("utf-8", "replace")
 
     def __repr__(self):
         return self.string
@@ -200,8 +213,8 @@ class Body:
     def assign(self, new_string):
         self.string = new_string
 
-    def encode(self, charset="utf-8", errors="strict"):
-        return self.string.encode(charset, errors=errors)
+    def encode(self, encoding="utf-8", errors="strict"):
+        return self.string.encode(encoding=encoding, errors=errors)
 
     def unflow(self, convert_lf=False):
         """Unflows text of type format=flowed.
@@ -405,7 +418,12 @@ class Archiver(object):  # N.B. Also used by import-mbox.py
                 if generator:
                     try:
                         mid = plugins.generators.generate(
-                            generator, msg, body, lid, attachments, raw_msg
+                            generator,
+                            msg,
+                            body if body.character_set else body.bytes,
+                            lid,
+                            attachments,
+                            raw_msg,
                         )
                     except Exception as err:
                         if logger:
@@ -431,6 +449,7 @@ class Archiver(object):  # N.B. Also used by import-mbox.py
                     irt = ""
             all_mids = list(id_set)  # Convert to list
             document_id = all_mids[0]
+
             output_json = {
                 "from_raw": msg_metadata["from"],
                 "from": msg_metadata["from"],
diff --git a/tools/plugins/generators.py b/tools/plugins/generators.py
index 122633d..79ae9c9 100644
--- a/tools/plugins/generators.py
+++ b/tools/plugins/generators.py
@@ -234,6 +234,8 @@ def medium(msg, body, lid, _attachments, _raw_msg):
 # as the archived-at may change from node to node (and will change if not in the raw mbox file)
 # Also the lid is not included in the hash, so the hash does not change if the lid is overridden
 #
+
+
 def cluster(msg, body, lid, attachments, _raw_msg):
     """
     Use data that is guaranteed to be the same across cluster setups
@@ -268,16 +270,15 @@ def cluster(msg, body, lid, attachments, _raw_msg):
     # Use text body
     if not body:  # Make sure body is not None, which will fail.
         body = ""
-    xbody = body.encode('utf-8', 'ignore')
+    xbody = body if type(body) is bytes else body.encode('utf-8', errors='ignore')
 
     # Crop out any trailing whitespace in body
     xbody = re.sub(b"\s+$", b"", xbody)
 
     # Use Message-Id (or '' if missing)
-    xbody += bytes(msg.get('Message-Id', ''), encoding='ascii')
+    xbody += bytes(msg.get('message-id', ''), encoding='ascii')
 
     # Use Date header. Don't use archived-at, as the archiver sets this if not present.
-    mdate = None
     mdatestring = "(null)"  # Default to null, ONLY changed if replicable across imports
     try:
         mdate = email.utils.parsedate_tz(msg.get('date'))

Re: [incubator-ponymail-foal] 01/02: re-align with old pony for cluster generator and unit tests

Posted by sebb <se...@gmail.com>.

On Sat, 5 Sep 2020 at 00:44, <hu...@apache.org> wrote:
>
> This is an automated email from the ASF dual-hosted git repository.
>
> humbedooh pushed a commit to branch master
> in repository https://gitbox.apache.org/repos/asf/incubator-ponymail-foal.git
>
> commit f314d5250999e2afb2ab5063d35afe7d1c1114fa
> Author: Daniel Gruno <hu...@apache.org>
> AuthorDate: Sat Sep 5 01:41:57 2020 +0200
>
>     re-align with old pony for cluster generator and unit tests
>
>     The general idea here is, if we find an email without a charset at all,
>     and we detect non-ascii characters in it, we assume it must be UTF-8 and
>     grab the raw bytes. We also convert it internally to a string for the
>     Body class, but we don't set the Body class' character set to anything.
>     This way, we keep the cluster generator happy by passing it bytes, while
>     keeping the rest happy by having a string representation that can be
>     unflowed. As DKIM does not use the msgbody itself, it won't be affected
>     by this change.

This information belongs in the code.

> ---
>  tools/archiver.py           | 53 ++++++++++++++++++++++++++++++---------------
>  tools/plugins/generators.py |  7 +++---
>  2 files changed, 40 insertions(+), 20 deletions(-)
>
> diff --git a/tools/archiver.py b/tools/archiver.py
> index cfa3c3a..82ad32c 100755
> --- a/tools/archiver.py
> +++ b/tools/archiver.py
> @@ -143,9 +143,7 @@ def normalize_lid(lid: str) -> str:  # N.B. Also used by import-mbox.py
>      # Belt-and-braces: remove possible extraneous chars
>      lid = "<%s>" % lid.strip(" <>").replace("@", ".")
>      # Replace invalid characters with underscores so as to not invalidate doc IDs.
> -    lid = re.sub(
> -        r"[^-+~_<>.a-zA-Z0-9@]", "_", lid
> -    )
> +    lid = re.sub(r"[^-+~_<>.a-zA-Z0-9@]", "_", lid)
>      # Finally, ensure we have a loosely valid list ID value
>      if not re.match(r"^<.+\..+>$", lid):
>          print("Invalid list-id %s" % lid)
> @@ -172,24 +170,39 @@ def message_attachments(msg: email.message.Message) -> typing.Tuple[list, dict]:
>  class Body:
>      def __init__(self, part: email.message.Message):
>          self.content_type = part.get_content_type()
> -        self.charsets = set([part.get_content_charset()])  # Part's charset
> -        self.charsets.update(
> -            [part.get_charsets()[0]]
> -        )  # Parent charset as fallback if any/different
> -        self.character_set = "us-ascii"
> +        self.charsets = [part.get_content_charset()]  # Part's charset
> +        parent_charset = part.get_charsets()[0]
> +        if parent_charset and parent_charset != self.charsets[0]:
> +            self.charsets.append(
> +                parent_charset
> +            )  # Parent charset as fallback if any/different
> +        self.character_set = None
> +        self.has_charset = False
>          self.string: typing.Optional[str] = None
>          self.flowed = "format=flowed" in part.get("content-type", "")
> -        contents = part.get_payload(decode=True)
> -        if contents is not None:
> -            for cs in self.charsets:
> -                if cs:
> +        self.bytes = part.get_payload(decode=True)
> +        if self.bytes is not None:
> +            valid_encodings = [x for x in self.charsets if x]
> +            if valid_encodings:
> +                for cs in valid_encodings:
>                      try:
> -                        self.string = contents.decode(cs)
> +                        self.string = self.bytes.decode(cs)
>                          self.character_set = str(cs)
> +                        self.has_charset = True
> +                        break
>                      except UnicodeDecodeError:
>                          pass
>              if not self.string:
> -                self.string = contents.decode("us-ascii", errors="replace")
> +                self.string = self.bytes.decode("us-ascii", errors="replace")
> +                if valid_encodings:
> +                    self.character_set = "us-ascii"
> +                # If no character encoding, but we find non-ASCII chars, assume bytes were UTF-8
> +                elif len(self.bytes) != len(self.bytes.decode("us-ascii", "ignore")):
> +                    part.set_charset("utf-8")
> +                    self.bytes = part.get_payload(decode=True)
> +                    # Set the .string, but not a character set, as we don't know it for sure.
> +                    # This is mainly so the older generators won't barf.
> +                    self.string = self.bytes.decode("utf-8", "replace")
>
>      def __repr__(self):
>          return self.string
> @@ -200,8 +213,8 @@ class Body:
>      def assign(self, new_string):
>          self.string = new_string
>
> -    def encode(self, charset="utf-8", errors="strict"):
> -        return self.string.encode(charset, errors=errors)
> +    def encode(self, encoding="utf-8", errors="strict"):
> +        return self.string.encode(encoding=encoding, errors=errors)
>
>      def unflow(self, convert_lf=False):
>          """Unflows text of type format=flowed.
> @@ -405,7 +418,12 @@ class Archiver(object):  # N.B. Also used by import-mbox.py
>                  if generator:
>                      try:
>                          mid = plugins.generators.generate(
> -                            generator, msg, body, lid, attachments, raw_msg
> +                            generator,
> +                            msg,
> +                            body if body.character_set else body.bytes,
> +                            lid,
> +                            attachments,
> +                            raw_msg,
>                          )
>                      except Exception as err:
>                          if logger:
> @@ -431,6 +449,7 @@ class Archiver(object):  # N.B. Also used by import-mbox.py
>                      irt = ""
>              all_mids = list(id_set)  # Convert to list
>              document_id = all_mids[0]
> +
>              output_json = {
>                  "from_raw": msg_metadata["from"],
>                  "from": msg_metadata["from"],
> diff --git a/tools/plugins/generators.py b/tools/plugins/generators.py
> index 122633d..79ae9c9 100644
> --- a/tools/plugins/generators.py
> +++ b/tools/plugins/generators.py
> @@ -234,6 +234,8 @@ def medium(msg, body, lid, _attachments, _raw_msg):
>  # as the archived-at may change from node to node (and will change if not in the raw mbox file)
>  # Also the lid is not included in the hash, so the hash does not change if the lid is overridden
>  #
> +
> +
>  def cluster(msg, body, lid, attachments, _raw_msg):
>      """
>      Use data that is guaranteed to be the same across cluster setups
> @@ -268,16 +270,15 @@ def cluster(msg, body, lid, attachments, _raw_msg):
>      # Use text body
>      if not body:  # Make sure body is not None, which will fail.
>          body = ""
> -    xbody = body.encode('utf-8', 'ignore')
> +    xbody = body if type(body) is bytes else body.encode('utf-8', errors='ignore')
>
>      # Crop out any trailing whitespace in body
>      xbody = re.sub(b"\s+$", b"", xbody)
>
>      # Use Message-Id (or '' if missing)
> -    xbody += bytes(msg.get('Message-Id', ''), encoding='ascii')
> +    xbody += bytes(msg.get('message-id', ''), encoding='ascii')
>
>      # Use Date header. Don't use archived-at, as the archiver sets this if not present.
> -    mdate = None
>      mdatestring = "(null)"  # Default to null, ONLY changed if replicable across imports
>      try:
>          mdate = email.utils.parsedate_tz(msg.get('date'))
>