You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ponymail.apache.org by se...@apache.org on 2016/12/24 10:14:31 UTC

incubator-ponymail git commit: import-mbox.py fails to unescape >From lines

Repository: incubator-ponymail
Updated Branches:
  refs/heads/master 144e1b754 -> 1fc602425


import-mbox.py fails to unescape >From lines

This fixes #212


Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/1fc60242
Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/1fc60242
Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/1fc60242

Branch: refs/heads/master
Commit: 1fc602425dc21ff8db1da2ca85e41cbf87652733
Parents: 144e1b7
Author: Sebb <se...@apache.org>
Authored: Sat Dec 24 10:13:42 2016 +0000
Committer: Sebb <se...@apache.org>
Committed: Sat Dec 24 10:13:42 2016 +0000

----------------------------------------------------------------------
 CHANGELOG.md         |  1 +
 tools/import-mbox.py |  7 ++--
 tools/mboxo_patch.py | 85 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 91 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/1fc60242/CHANGELOG.md
----------------------------------------------------------------------
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 56d7705..b938955 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -91,6 +91,7 @@
 - elastic.get does not return if a document is not found but some callers overlook this (#137)
 - pcall() idiom to protect against elastic.lua exceptions is flawed (#162)
 - unhelpful error reporting for invalid Permalinks/Source links (#123)
+- import-mbox.py fails to unescape >From lines (#212)
 
 ## CHANGES in 0.9b:
 

http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/1fc60242/tools/import-mbox.py
----------------------------------------------------------------------
diff --git a/tools/import-mbox.py b/tools/import-mbox.py
index 07727e1..6ca43a5 100755
--- a/tools/import-mbox.py
+++ b/tools/import-mbox.py
@@ -35,6 +35,10 @@ import multiprocessing
 import tempfile
 import gzip
 
+# Temporary patch to fix Python email package limitation
+# It must be removed when the Python package is fixed
+from mboxo_patch import MboxoFactory
+
 try:
     from elasticsearch import Elasticsearch, helpers
     from formatflowed import convertToWrapped # only needed by archiver
@@ -142,7 +146,6 @@ class BulkThread(Thread):
                 print("%s: Warning: Could not bulk insert: %s into %s" % (self.id,err,self.dtype))
 #             print("%s: Inserted %u entries into %s" % (self.id, len(js_arr),self.dtype))
 
-
 class SlurpThread(Thread):
 
     def printid(self, message):
@@ -211,7 +214,7 @@ class SlurpThread(Thread):
                     except Exception as err:
                         self.printid("This wasn't a gzip file: %s" % err )
                 self.printid("Slurping %s" % filename)
-                messages = mailbox.mbox(tmpname)
+                messages = mailbox.mbox(tmpname, MboxoFactory)
 
             else:
                 ml = mla[0]

http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/1fc60242/tools/mboxo_patch.py
----------------------------------------------------------------------
diff --git a/tools/mboxo_patch.py b/tools/mboxo_patch.py
new file mode 100644
index 0000000..20b5443
--- /dev/null
+++ b/tools/mboxo_patch.py
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Byte stream reader to process mboxo style mailbox files.
+These are not currently handled by the Python email package.
+
+It replaces any occurrence of b'\n>From ' with b'\nFrom '
+
+The class handles matching across read boundaries.
+
+To use:
+
+from mboxo_patch import MboxoFactory
+...
+messages = mailbox.mbox(filename, MboxoFactory)
+"""
+import mailbox
+
+FROM_MANGLED  =b'\n>From '
+FROM_MANGLED_LEN=len(FROM_MANGLED)
+FROM_UNMANGLED=b'\nFrom '
+# We want to match the 7 bytes b'\n>From ' in the input stream.
+# However, this sequence can be split over multiple reads.
+# The split can occur anywhere after the leading b'\n'
+# and before the trailing b' '. If a buffer ends with one of these
+# prefixes, we keep that trailing part of the buffer for the next read.
+# The following are all the possible prefixes for a split:
+FROMS=(FROM_MANGLED[:-1],
+       FROM_MANGLED[:-2],
+       FROM_MANGLED[:-3],
+       FROM_MANGLED[:-4],
+       FROM_MANGLED[:-5],
+       FROM_MANGLED[:-6],
+       )
+
+class MboxoReader(mailbox._PartialFile):
+    def __init__(self, f, start=None, stop=None):
+        self.remain=0 # number of bytes to keep for next read
+        super().__init__(f._file, start=f._start, stop=f._stop)
+
+    # Override the read method to provide mboxo filtering
+    def _read(self, size, read_method):
+        # get the next chunk, resetting if necessary 
+        if self.remain != 0:
+            super().seek(whence=1, offset=-self.remain)
+        # ensure we get enough to match successfully when refilling
+        size = size if size > FROM_MANGLED_LEN else FROM_MANGLED_LEN
+        bytes = super()._read(size, read_method)
+        bufflen=len(bytes)
+        # did we get anything new?
+        if bufflen > self.remain:
+            # is there a potential cross-boundary match?
+            if bytes.endswith(FROMS):
+                # yes, work out what to keep
+                # N.B. rindex will fail if it cannot find the LF;
+                # this should be impossible
+                self.remain=bufflen - bytes.rindex(b'\n')
+            else:
+                # don't need to keep anything back
+                self.remain=0
+        else:
+            # EOF
+            self.remain=0
+        # we cannot use -0 to mean end of array...
+        end = bufflen if self.remain == 0 else -self.remain
+        # exclude the potential split match from the return
+        return bytes[:end].replace(FROM_MANGLED, FROM_UNMANGLED)
+
+class MboxoFactory(mailbox.mboxMessage):
+    def __init__(self, message=None):
+        super().__init__(message=MboxoReader(message))