You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ponymail.apache.org by hu...@apache.org on 2020/08/17 23:46:45 UTC
[incubator-ponymail-foal] branch master updated: Make a Body class for message bodies, add unflowing into it
This is an automated email from the ASF dual-hosted git repository.
humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-ponymail-foal.git
The following commit(s) were added to refs/heads/master by this push:
new 68af505 Make a Body class for message bodies, add unflowing into it
68af505 is described below
commit 68af50513c5c81fe066638539cd3ab12f4fc0572
Author: Daniel Gruno <hu...@apache.org>
AuthorDate: Tue Aug 18 01:43:55 2020 +0200
Make a Body class for message bodies, add unflowing into it
This should make it easier to work with message bodies.
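The Body class supports the standard string operations (str(), len() and
.encode()), and also adds an .unflow() method and metadata (content type,
character set, and whether the part is format=flowed). This allows us to
unflow later, when archiving, so as to not disturb older generators.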
---
tools/archiver.py | 135 ++++++++++++++++++++++----------------------
tools/plugins/generators.py | 8 +--
2 files changed, 70 insertions(+), 73 deletions(-)
diff --git a/tools/archiver.py b/tools/archiver.py
index 93fd668..ecea544 100755
--- a/tools/archiver.py
+++ b/tools/archiver.py
@@ -126,19 +126,6 @@ def parse_attachment(
return None, None
-def pm_charsets(msg: email.message.Message) -> typing.Set[str]:
- """
- Figures out and returns all character sets for a message or message part
- :param msg: The email or message part to analyze
- :return: all found charsets
- """
- charsets = set({})
- for c in msg.get_charsets():
- if c is not None:
- charsets.update([c])
- return charsets
-
-
def normalize_lid(lid: str) -> str: # N.B. Also used by import-mbox.py
""" Ensures that a List ID is in standard form, i.e. <a.b.c.d> """
# If of format "list name" <foo.bar.baz>
@@ -174,6 +161,46 @@ def message_attachments(msg: email.message.Message) -> typing.Tuple[list, dict]:
return attachments, contents
+class Body:
+ def __init__(self, part: email.message.MIMEPart):
+ self.content_type = part.get_content_type()
+ self.charsets = set([part.get_charset()]) # Part's charset
+ self.charsets.update(part.get_charsets()) # Parent charsets as fallback
+ self.character_set = "utf-8"
+ self.string = None
+ self.flowed = True if "format=flowed" in part.get("content-type", "") else False
+ contents = part.get_payload(decode=True)
+ if contents is not None:
+ for cs in self.charsets:
+ if cs:
+ try:
+ self.string = contents.decode(cs)
+ self.character_set = cs
+ except UnicodeDecodeError:
+ pass
+ if not self.string:
+ self.string = contents.decode("utf-8", errors="replace")
+
+ def __str__(self):
+ return self.string or "None"
+
+ def __len__(self):
+ return len(self.string or "")
+
+ def encode(self, charset="utf-8", errors="strict"):
+ return self.string.encode(charset, errors=errors)
+
+ def unflow(self):
+ if self.string:
+ if self.flowed:
+ return formatflowed.convertToWrapped(
+ self.string.encode(self.character_set, errors="ignore"),
+ wrap_fixed=False,
+ character_set=self.character_set,
+ )
+ return self.string
+
+
class Archiver(object): # N.B. Also used by import-mbox.py
"""The general archiver class. Compatible with MailMan3 archiver classes."""
@@ -216,9 +243,19 @@ class Archiver(object): # N.B. Also used by import-mbox.py
self.cropout = config.get("debug", {}).get("cropout")
if parse_html:
import html2text
+
self.html2text = html2text.html2text
- def message_body(self, msg: email.message.Message, verbose=False, ignore_body=None):
+ def message_body(
+ self, msg: email.message.Message, verbose=False, ignore_body=None
+ ) -> Body:
+ """
+ Fetches the proper text body from an email as an archiver.Body object
+ :param msg: The email or part of it to examine for proper body
+ :param verbose: Verbose output while parsing
+ :param ignore_body: Optional bodies to ignore while parsing
+ :return: archiver.Body object
+ """
body = None
first_html = None
for part in msg.walk():
@@ -230,73 +267,36 @@ class Archiver(object): # N.B. Also used by import-mbox.py
Note: cannot use break here because firstHTML is needed if len(body) <= 1
"""
try:
- if not body and part.get_content_type() == "text/plain":
- body = part.get_payload(decode=True)
- if not body and part.get_content_type() == "text/enriched":
- body = part.get_payload(decode=True)
+ if not body and part.get_content_type() in [
+ "text/plain",
+ "text/enriched",
+ ]:
+ body = Body(part)
elif (
self.html
and not first_html
and part.get_content_type() == "text/html"
):
- first_html = part.get_payload(decode=True)
+ first_html = Body(part)
except Exception as err:
print(err)
# this requires a GPL lib, user will have to install it themselves
if first_html and (
- not body
+ body is None
or len(body) <= 1
or (ignore_body and str(body).find(str(ignore_body)) != -1)
):
- body = self.html2text(
- first_html.decode("utf-8", "ignore")
- if type(first_html) is bytes
- else first_html
- )
-
- # See issue#463
- # This code will try at most one charset
- # If the decode fails, it will use utf-8
- if body is not None:
- for charset in pm_charsets(msg):
- try:
- body = body.decode(charset) if type(body) is bytes else body
- # at this point body can no longer be bytes
- except UnicodeDecodeError:
- body = (
- body.decode("utf-8", errors="replace")
- if type(body) is bytes
- else body
- )
- # at this point body can no longer be bytes
-
+ content_type = "text/html"
+ body = first_html
+ body.string = self.html2text(body.string)
return body
def format_flowed(self, body, msg_metadata):
- try:
- if (
- msg_metadata.get("content-type")
- and msg_metadata.get("content-type", "").find("format=flowed") != -1
- ):
- body = formatflowed.convertToWrapped(
- bytes(body, "utf-8"), character_set="utf-8"
- )
- if isinstance(body, str):
- body = body.encode("utf-8")
- except UnicodeEncodeError:
- try:
- body = body.decode(chardet.detect(body)["encoding"])
- except UnicodeDecodeError:
- try:
- body = body.decode("latin-1")
- except UnicodeDecodeError:
- try:
- if isinstance(body, str):
- body = body.encode("utf-8")
- except UnicodeEncodeError:
- body = None
- return body
+ if body and body.flowed:
+ return formatflowed.decode(body.encode("utf-8"))
+ else:
+ return body
# N.B. this is also called by import-mbox.py
def compute_updates(
@@ -376,7 +376,6 @@ class Archiver(object): # N.B. Also used by import-mbox.py
# message_date calculations are all done, prepare the index entry
date_as_string = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))
body = self.message_body(msg, verbose=args.verbose, ignore_body=args.ibody)
- body = self.format_flowed(body, msg_metadata)
attachments, contents = message_attachments(msg)
irt = ""
@@ -425,9 +424,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
"private": private,
"references": msg_metadata["references"],
"in-reply-to": irt,
- "body": body.decode("utf-8", "replace")
- if type(body) is bytes
- else body,
+ "body": body.unflow(),
"attachments": attachments,
}
diff --git a/tools/plugins/generators.py b/tools/plugins/generators.py
index 72e768c..949a3b6 100644
--- a/tools/plugins/generators.py
+++ b/tools/plugins/generators.py
@@ -202,7 +202,7 @@ def medium(msg, body, lid, _attachments, _raw_msg):
"""
# Use text body
- xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
+ xbody = body.encode('utf-8', 'ignore')
# Use List ID
xbody += bytes(lid, encoding='ascii')
# Use Date header
@@ -250,7 +250,7 @@ def medium_original(msg, body, lid, _attachments, _raw_msg):
"""
# Use text body
- xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
+ xbody = body.encode('utf-8', 'ignore')
# Use List ID
xbody += lid # WRONG: Should be: bytes(lid, 'ascii')
@@ -305,7 +305,7 @@ def cluster(msg, body, lid, attachments, _raw_msg):
# Use text body
if not body: # Make sure body is not None, which will fail.
body = ""
- xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
+ xbody = body.encode('utf-8', 'ignore')
# Crop out any trailing whitespace in body
xbody = re.sub(b"\s+$", b"", xbody)
@@ -370,7 +370,7 @@ def legacy(msg, body, lid, _attachments, _raw_msg):
except:
pass
mid = "%s@%s@%s" % (
- hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
+ hashlib.sha224(body if type(body) is bytes else body.encode('utf-8', 'ignore')).hexdigest(), uid_mdate, lid)
return mid