You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ponymail.apache.org by hu...@apache.org on 2020/09/09 07:49:35 UTC
[incubator-ponymail-foal] 01/02: fix unflow,
set a default charset as a constant at top.
This is an automated email from the ASF dual-hosted git repository.
humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-ponymail-foal.git
commit a950b55c03d07f20d8e62d303d4d3ba3297d1341
Author: Daniel Gruno <hu...@apache.org>
AuthorDate: Wed Sep 9 09:48:20 2020 +0200
fix unflow, set a default charset as a constant at top.
---
tools/archiver.py | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/tools/archiver.py b/tools/archiver.py
index c92d3c5..69dc4c1 100755
--- a/tools/archiver.py
+++ b/tools/archiver.py
@@ -62,6 +62,10 @@ import plugins.generators
import plugins.elastic
import elasticsearch
+# This is what we will default to if we are presented with emails without character sets and US-ASCII doesn't work.
+# UTF-8 is a superset encompassing all of US-ASCII, so should be safe to use and produce the most reliable results.
+DEFAULT_CHARACTER_SET = 'utf-8'
+
# Fetch config from same dir as archiver.py
config = plugins.ponymailconfig.PonymailConfig()
@@ -204,7 +208,7 @@ class Body:
# This is mainly so the older generators won't barf, as the generator will
# be fed the message body as a bytes object if no encoding is set, while
# the resulting metadoc will always use the string version.
- self.string = self.bytes.decode("utf-8", "replace")
+ self.string = self.bytes.decode(DEFAULT_CHARACTER_SET, "replace")
def __repr__(self):
return self.string
@@ -215,7 +219,7 @@ class Body:
def assign(self, new_string):
self.string = new_string
- def encode(self, encoding="utf-8", errors="strict"):
+ def encode(self, encoding=DEFAULT_CHARACTER_SET, errors="strict"):
return self.string.encode(encoding=encoding, errors=errors)
def unflow(self, convert_lf=False):
@@ -226,6 +230,8 @@ class Body:
"""
if self.string:
if self.flowed:
+ # Use the provided character set or fall back to our sane default.
+ character_set = self.character_set or DEFAULT_CHARACTER_SET
# Convert lone LF to CRLF if found
if convert_lf:
fixed_string = "\r\n".join(
@@ -235,9 +241,9 @@ class Body:
else:
fixed_string = self.string
flow_fixed = formatflowed.convertToWrapped(
- fixed_string.encode(self.character_set, errors="ignore"),
+ fixed_string.encode(character_set, errors="ignore"),
wrap_fixed=False,
- character_set=self.character_set,
+ character_set=character_set,
)
# If we "upconverted" from LF to CRLF, convert back after flow decoding
if convert_lf and conversion_was_needed:
Re: [incubator-ponymail-foal] 01/02: fix unflow, set a default
charset as a constant at top.
Posted by sebb <se...@gmail.com>.
On Wed, 9 Sep 2020 at 08:49, <hu...@apache.org> wrote:
>
> This is an automated email from the ASF dual-hosted git repository.
>
> humbedooh pushed a commit to branch master
> in repository https://gitbox.apache.org/repos/asf/incubator-ponymail-foal.git
>
> commit a950b55c03d07f20d8e62d303d4d3ba3297d1341
> Author: Daniel Gruno <hu...@apache.org>
> AuthorDate: Wed Sep 9 09:48:20 2020 +0200
>
> fix unflow, set a default charset as a constant at top.
> ---
> tools/archiver.py | 14 ++++++++++----
> 1 file changed, 10 insertions(+), 4 deletions(-)
>
> diff --git a/tools/archiver.py b/tools/archiver.py
> index c92d3c5..69dc4c1 100755
> --- a/tools/archiver.py
> +++ b/tools/archiver.py
> @@ -62,6 +62,10 @@ import plugins.generators
> import plugins.elastic
> import elasticsearch
>
> +# This is what we will default to if we are presented with emails without character sets and US-ASCII doesn't work.
> +# UTF-8 is a superset encompassing all of US-ASCII, so should be safe to use and produce the most reliable results.
If only.
It seems to me that mails with missing charsets that have non-ASCII
characters are generally going to be older mails, probably from before
the introduction of UTF-8.
It is not obvious to me that using UTF-8 will have the best outcome here.
I'm fine with UTF-8 as a default, but it's wrong to claim that it
works best or most reliably.
The most appropriate charset will depend on the corpus.
> +DEFAULT_CHARACTER_SET = 'utf-8'
> +
> # Fetch config from same dir as archiver.py
> config = plugins.ponymailconfig.PonymailConfig()
>
> @@ -204,7 +208,7 @@ class Body:
> # This is mainly so the older generators won't barf, as the generator will
> # be fed the message body as a bytes object if no encoding is set, while
> # the resulting metadoc will always use the string version.
> - self.string = self.bytes.decode("utf-8", "replace")
> + self.string = self.bytes.decode(DEFAULT_CHARACTER_SET, "replace")
>
> def __repr__(self):
> return self.string
> @@ -215,7 +219,7 @@ class Body:
> def assign(self, new_string):
> self.string = new_string
>
> - def encode(self, encoding="utf-8", errors="strict"):
> + def encode(self, encoding=DEFAULT_CHARACTER_SET, errors="strict"):
> return self.string.encode(encoding=encoding, errors=errors)
>
> def unflow(self, convert_lf=False):
> @@ -226,6 +230,8 @@ class Body:
> """
> if self.string:
> if self.flowed:
> + # Use the provided character set or fall back to our sane default.
> + character_set = self.character_set or DEFAULT_CHARACTER_SET
> # Convert lone LF to CRLF if found
> if convert_lf:
> fixed_string = "\r\n".join(
> @@ -235,9 +241,9 @@ class Body:
> else:
> fixed_string = self.string
> flow_fixed = formatflowed.convertToWrapped(
> - fixed_string.encode(self.character_set, errors="ignore"),
> + fixed_string.encode(character_set, errors="ignore"),
> wrap_fixed=False,
> - character_set=self.character_set,
> + character_set=character_set,
> )
> # If we "upconverted" from LF to CRLF, convert back after flow decoding
> if convert_lf and conversion_was_needed:
>