You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ponymail.apache.org by hu...@apache.org on 2020/09/04 23:44:20 UTC
[incubator-ponymail-foal] 01/02: re-align with old pony for cluster
generator and unit tests
This is an automated email from the ASF dual-hosted git repository.
humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-ponymail-foal.git
commit f314d5250999e2afb2ab5063d35afe7d1c1114fa
Author: Daniel Gruno <hu...@apache.org>
AuthorDate: Sat Sep 5 01:41:57 2020 +0200
re-align with old pony for cluster generator and unit tests
The general idea here is: if we find an email that declares no charset at
all, and we detect non-ASCII characters in it, we assume it must be UTF-8
and grab the raw bytes. We also decode it internally to a string for the
Body class, but we leave the Body class's character set unset.
This way, we keep the cluster generator happy by passing it bytes, while
keeping the rest happy by having a string representation that can be
unflowed. As DKIM does not use the msgbody itself, it won't be affected
by this change.
---
tools/archiver.py | 53 ++++++++++++++++++++++++++++++---------------
tools/plugins/generators.py | 7 +++---
2 files changed, 40 insertions(+), 20 deletions(-)
diff --git a/tools/archiver.py b/tools/archiver.py
index cfa3c3a..82ad32c 100755
--- a/tools/archiver.py
+++ b/tools/archiver.py
@@ -143,9 +143,7 @@ def normalize_lid(lid: str) -> str: # N.B. Also used by import-mbox.py
# Belt-and-braces: remove possible extraneous chars
lid = "<%s>" % lid.strip(" <>").replace("@", ".")
# Replace invalid characters with underscores so as to not invalidate doc IDs.
- lid = re.sub(
- r"[^-+~_<>.a-zA-Z0-9@]", "_", lid
- )
+ lid = re.sub(r"[^-+~_<>.a-zA-Z0-9@]", "_", lid)
# Finally, ensure we have a loosely valid list ID value
if not re.match(r"^<.+\..+>$", lid):
print("Invalid list-id %s" % lid)
@@ -172,24 +170,39 @@ def message_attachments(msg: email.message.Message) -> typing.Tuple[list, dict]:
class Body:
def __init__(self, part: email.message.Message):
self.content_type = part.get_content_type()
- self.charsets = set([part.get_content_charset()]) # Part's charset
- self.charsets.update(
- [part.get_charsets()[0]]
- ) # Parent charset as fallback if any/different
- self.character_set = "us-ascii"
+ self.charsets = [part.get_content_charset()] # Part's charset
+ parent_charset = part.get_charsets()[0]
+ if parent_charset and parent_charset != self.charsets[0]:
+ self.charsets.append(
+ parent_charset
+ ) # Parent charset as fallback if any/different
+ self.character_set = None
+ self.has_charset = False
self.string: typing.Optional[str] = None
self.flowed = "format=flowed" in part.get("content-type", "")
- contents = part.get_payload(decode=True)
- if contents is not None:
- for cs in self.charsets:
- if cs:
+ self.bytes = part.get_payload(decode=True)
+ if self.bytes is not None:
+ valid_encodings = [x for x in self.charsets if x]
+ if valid_encodings:
+ for cs in valid_encodings:
try:
- self.string = contents.decode(cs)
+ self.string = self.bytes.decode(cs)
self.character_set = str(cs)
+ self.has_charset = True
+ break
except UnicodeDecodeError:
pass
if not self.string:
- self.string = contents.decode("us-ascii", errors="replace")
+ self.string = self.bytes.decode("us-ascii", errors="replace")
+ if valid_encodings:
+ self.character_set = "us-ascii"
+ # If no character encoding, but we find non-ASCII chars, assume bytes were UTF-8
+ elif len(self.bytes) != len(self.bytes.decode("us-ascii", "ignore")):
+ part.set_charset("utf-8")
+ self.bytes = part.get_payload(decode=True)
+ # Set the .string, but not a character set, as we don't know it for sure.
+ # This is mainly so the older generators won't barf.
+ self.string = self.bytes.decode("utf-8", "replace")
def __repr__(self):
return self.string
@@ -200,8 +213,8 @@ class Body:
def assign(self, new_string):
self.string = new_string
- def encode(self, charset="utf-8", errors="strict"):
- return self.string.encode(charset, errors=errors)
+ def encode(self, encoding="utf-8", errors="strict"):
+ return self.string.encode(encoding=encoding, errors=errors)
def unflow(self, convert_lf=False):
"""Unflows text of type format=flowed.
@@ -405,7 +418,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
if generator:
try:
mid = plugins.generators.generate(
- generator, msg, body, lid, attachments, raw_msg
+ generator,
+ msg,
+ body if body.character_set else body.bytes,
+ lid,
+ attachments,
+ raw_msg,
)
except Exception as err:
if logger:
@@ -431,6 +449,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
irt = ""
all_mids = list(id_set) # Convert to list
document_id = all_mids[0]
+
output_json = {
"from_raw": msg_metadata["from"],
"from": msg_metadata["from"],
diff --git a/tools/plugins/generators.py b/tools/plugins/generators.py
index 122633d..79ae9c9 100644
--- a/tools/plugins/generators.py
+++ b/tools/plugins/generators.py
@@ -234,6 +234,8 @@ def medium(msg, body, lid, _attachments, _raw_msg):
# as the archived-at may change from node to node (and will change if not in the raw mbox file)
# Also the lid is not included in the hash, so the hash does not change if the lid is overridden
#
+
+
def cluster(msg, body, lid, attachments, _raw_msg):
"""
Use data that is guaranteed to be the same across cluster setups
@@ -268,16 +270,15 @@ def cluster(msg, body, lid, attachments, _raw_msg):
# Use text body
if not body: # Make sure body is not None, which will fail.
body = ""
- xbody = body.encode('utf-8', 'ignore')
+ xbody = body if type(body) is bytes else body.encode('utf-8', errors='ignore')
# Crop out any trailing whitespace in body
xbody = re.sub(b"\s+$", b"", xbody)
# Use Message-Id (or '' if missing)
- xbody += bytes(msg.get('Message-Id', ''), encoding='ascii')
+ xbody += bytes(msg.get('message-id', ''), encoding='ascii')
# Use Date header. Don't use archived-at, as the archiver sets this if not present.
- mdate = None
mdatestring = "(null)" # Default to null, ONLY changed if replicable across imports
try:
mdate = email.utils.parsedate_tz(msg.get('date'))
Re: [incubator-ponymail-foal] 01/02: re-align with old pony for
cluster generator and unit tests
Posted by sebb <se...@gmail.com>.
On Sat, 5 Sep 2020 at 00:44, <hu...@apache.org> wrote:
>
> This is an automated email from the ASF dual-hosted git repository.
>
> humbedooh pushed a commit to branch master
> in repository https://gitbox.apache.org/repos/asf/incubator-ponymail-foal.git
>
> commit f314d5250999e2afb2ab5063d35afe7d1c1114fa
> Author: Daniel Gruno <hu...@apache.org>
> AuthorDate: Sat Sep 5 01:41:57 2020 +0200
>
> re-align with old pony for cluster generator and unit tests
>
> The general idea here is: if we find an email that declares no charset at
> all, and we detect non-ASCII characters in it, we assume it must be UTF-8
> and grab the raw bytes. We also decode it internally to a string for the
> Body class, but we leave the Body class's character set unset.
> This way, we keep the cluster generator happy by passing it bytes, while
> keeping the rest happy by having a string representation that can be
> unflowed. As DKIM does not use the msgbody itself, it won't be affected
> by this change.
This information belongs in the code.
> ---
> tools/archiver.py | 53 ++++++++++++++++++++++++++++++---------------
> tools/plugins/generators.py | 7 +++---
> 2 files changed, 40 insertions(+), 20 deletions(-)
>
> diff --git a/tools/archiver.py b/tools/archiver.py
> index cfa3c3a..82ad32c 100755
> --- a/tools/archiver.py
> +++ b/tools/archiver.py
> @@ -143,9 +143,7 @@ def normalize_lid(lid: str) -> str: # N.B. Also used by import-mbox.py
> # Belt-and-braces: remove possible extraneous chars
> lid = "<%s>" % lid.strip(" <>").replace("@", ".")
> # Replace invalid characters with underscores so as to not invalidate doc IDs.
> - lid = re.sub(
> - r"[^-+~_<>.a-zA-Z0-9@]", "_", lid
> - )
> + lid = re.sub(r"[^-+~_<>.a-zA-Z0-9@]", "_", lid)
> # Finally, ensure we have a loosely valid list ID value
> if not re.match(r"^<.+\..+>$", lid):
> print("Invalid list-id %s" % lid)
> @@ -172,24 +170,39 @@ def message_attachments(msg: email.message.Message) -> typing.Tuple[list, dict]:
> class Body:
> def __init__(self, part: email.message.Message):
> self.content_type = part.get_content_type()
> - self.charsets = set([part.get_content_charset()]) # Part's charset
> - self.charsets.update(
> - [part.get_charsets()[0]]
> - ) # Parent charset as fallback if any/different
> - self.character_set = "us-ascii"
> + self.charsets = [part.get_content_charset()] # Part's charset
> + parent_charset = part.get_charsets()[0]
> + if parent_charset and parent_charset != self.charsets[0]:
> + self.charsets.append(
> + parent_charset
> + ) # Parent charset as fallback if any/different
> + self.character_set = None
> + self.has_charset = False
> self.string: typing.Optional[str] = None
> self.flowed = "format=flowed" in part.get("content-type", "")
> - contents = part.get_payload(decode=True)
> - if contents is not None:
> - for cs in self.charsets:
> - if cs:
> + self.bytes = part.get_payload(decode=True)
> + if self.bytes is not None:
> + valid_encodings = [x for x in self.charsets if x]
> + if valid_encodings:
> + for cs in valid_encodings:
> try:
> - self.string = contents.decode(cs)
> + self.string = self.bytes.decode(cs)
> self.character_set = str(cs)
> + self.has_charset = True
> + break
> except UnicodeDecodeError:
> pass
> if not self.string:
> - self.string = contents.decode("us-ascii", errors="replace")
> + self.string = self.bytes.decode("us-ascii", errors="replace")
> + if valid_encodings:
> + self.character_set = "us-ascii"
> + # If no character encoding, but we find non-ASCII chars, assume bytes were UTF-8
> + elif len(self.bytes) != len(self.bytes.decode("us-ascii", "ignore")):
> + part.set_charset("utf-8")
> + self.bytes = part.get_payload(decode=True)
> + # Set the .string, but not a character set, as we don't know it for sure.
> + # This is mainly so the older generators won't barf.
> + self.string = self.bytes.decode("utf-8", "replace")
>
> def __repr__(self):
> return self.string
> @@ -200,8 +213,8 @@ class Body:
> def assign(self, new_string):
> self.string = new_string
>
> - def encode(self, charset="utf-8", errors="strict"):
> - return self.string.encode(charset, errors=errors)
> + def encode(self, encoding="utf-8", errors="strict"):
> + return self.string.encode(encoding=encoding, errors=errors)
>
> def unflow(self, convert_lf=False):
> """Unflows text of type format=flowed.
> @@ -405,7 +418,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> if generator:
> try:
> mid = plugins.generators.generate(
> - generator, msg, body, lid, attachments, raw_msg
> + generator,
> + msg,
> + body if body.character_set else body.bytes,
> + lid,
> + attachments,
> + raw_msg,
> )
> except Exception as err:
> if logger:
> @@ -431,6 +449,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> irt = ""
> all_mids = list(id_set) # Convert to list
> document_id = all_mids[0]
> +
> output_json = {
> "from_raw": msg_metadata["from"],
> "from": msg_metadata["from"],
> diff --git a/tools/plugins/generators.py b/tools/plugins/generators.py
> index 122633d..79ae9c9 100644
> --- a/tools/plugins/generators.py
> +++ b/tools/plugins/generators.py
> @@ -234,6 +234,8 @@ def medium(msg, body, lid, _attachments, _raw_msg):
> # as the archived-at may change from node to node (and will change if not in the raw mbox file)
> # Also the lid is not included in the hash, so the hash does not change if the lid is overridden
> #
> +
> +
> def cluster(msg, body, lid, attachments, _raw_msg):
> """
> Use data that is guaranteed to be the same across cluster setups
> @@ -268,16 +270,15 @@ def cluster(msg, body, lid, attachments, _raw_msg):
> # Use text body
> if not body: # Make sure body is not None, which will fail.
> body = ""
> - xbody = body.encode('utf-8', 'ignore')
> + xbody = body if type(body) is bytes else body.encode('utf-8', errors='ignore')
>
> # Crop out any trailing whitespace in body
> xbody = re.sub(b"\s+$", b"", xbody)
>
> # Use Message-Id (or '' if missing)
> - xbody += bytes(msg.get('Message-Id', ''), encoding='ascii')
> + xbody += bytes(msg.get('message-id', ''), encoding='ascii')
>
> # Use Date header. Don't use archived-at, as the archiver sets this if not present.
> - mdate = None
> mdatestring = "(null)" # Default to null, ONLY changed if replicable across imports
> try:
> mdate = email.utils.parsedate_tz(msg.get('date'))
>