You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ponymail.apache.org by hu...@apache.org on 2017/06/04 14:02:01 UTC

[1/4] incubator-ponymail git commit: be more comprehensive in generating IDs

Repository: incubator-ponymail
Updated Branches:
  refs/heads/master 767d8f8c1 -> c58b23127


be more comprehensive in generating IDs

Include whatever metadata we can in generating IDs to lessen the
risk of theoretical ID collisions.


Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/8b7ede85
Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/8b7ede85
Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/8b7ede85

Branch: refs/heads/master
Commit: 8b7ede85f03bd738d42e8e87f2302e4c8ddb0ad3
Parents: 767d8f8
Author: Daniel Gruno <hu...@apache.org>
Authored: Sun Jun 4 15:21:23 2017 +0200
Committer: Daniel Gruno <hu...@apache.org>
Committed: Sun Jun 4 15:21:23 2017 +0200

----------------------------------------------------------------------
 CHANGELOG.md      | 1 +
 tools/archiver.py | 3 +++
 2 files changed, 4 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/8b7ede85/CHANGELOG.md
----------------------------------------------------------------------
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 275b0df..be595de 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,5 @@
 ## CHANGES in 0.10:
+- more comprehensive ID generation mechanisms
 - private messages are now included in downloads if the user has access to them (#169, #108)
 - mbox export now generates valid From_ line (#190)
 - mbox export now escapes 'From ' lines in body text (#188)

http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/8b7ede85/tools/archiver.py
----------------------------------------------------------------------
diff --git a/tools/archiver.py b/tools/archiver.py
index 141a431..41933f7 100755
--- a/tools/archiver.py
+++ b/tools/archiver.py
@@ -324,6 +324,9 @@ class Archiver(object):
                     xbody += bytes(lid, encoding='ascii')
                     xbody += bytes(mdatestring, encoding='ascii')
                     mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
+                    if attachments:
+                        for a in attachments:
+                            xbody += bytes(a['hash'], encoding = 'ascii')
                 else:
                     # Or revert to the old way?
                     mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)


[3/4] incubator-ponymail git commit: make setup ask for generator mechanism

Posted by hu...@apache.org.
make setup ask for generator mechanism


Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/23966d82
Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/23966d82
Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/23966d82

Branch: refs/heads/master
Commit: 23966d825801f0465fd00c881e1edddb5e529826
Parents: e2d8103
Author: Daniel Gruno <hu...@apache.org>
Authored: Sun Jun 4 15:58:33 2017 +0200
Committer: Daniel Gruno <hu...@apache.org>
Committed: Sun Jun 4 15:58:33 2017 +0200

----------------------------------------------------------------------
 tools/setup.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/23966d82/tools/setup.py
----------------------------------------------------------------------
diff --git a/tools/setup.py b/tools/setup.py
index e65069a..d5ca140 100755
--- a/tools/setup.py
+++ b/tools/setup.py
@@ -94,6 +94,7 @@ dbname = ""
 mlserver = ""
 mldom = ""
 wc = ""
+genname = ""
 wce = False
 shards = 0
 replicas = -1
@@ -154,6 +155,20 @@ while wc == "":
     if wc.lower() == "y":
         wce = True
 
+while genname == "":
+    gens = ['legacy', 'medium', 'redundant', 'full']
+    print ("Please select a document ID generator:")
+    print("1  LEGACY: The original document generator for v/0.1-0.8 (no longer recommended)")
+    print("2  MEDIUM: The medium comprehensive generator for v/0.9 (no longer recommended)")
+    print("3  REDUNDANT: Near-full message digest, discard MTA trail (recommended for clustered setups)")
+    print("4  FULL: Full message digest with MTA trail (recommended for single-node setups).")
+    try:
+        gno = int(input("Please select a generator [1-4]: "))
+        if gno <= len(gens) and gens[gno-1]:
+            genname = gens[gno-1]
+    except ValueError:
+        pass
+    
 while shards < 1:
     try:
         shards = int(input("How many shards for the ElasticSearch index? "))
@@ -475,14 +490,14 @@ ssl:                    false
 #backup:                database name
 
 [archiver]
-#generator:             medium|full|other
+generator:              %s
 
 [debug]
 #cropout:               string to crop from list-id
 
 ###############################################################
             """ % (hostname, dbname, port, 
-                   'wait:                  active shard count' if ES_MAJOR == 5 else 'write:                 consistency level (default quorum)'))
+                   'wait:                  active shard count' if ES_MAJOR == 5 else 'write:                 consistency level (default quorum)', genname))
     f.close()
 
 config_path = "../site/api/lib"


Re: [2/4] incubator-ponymail git commit: split generators into a file of its own

Posted by Daniel Gruno <hu...@apache.org>.
On 06/05/2017 12:22 AM, sebb wrote:
> On 4 June 2017 at 15:02,  <hu...@apache.org> wrote:
>> split generators into a file of its own
>>
>> Also fix up generators:
>> - medium goes back to the way it was
>> - a new 'redundant' generator for cluster setups
>>
>>
>> Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
>> Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/e2d81036
>> Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/e2d81036
>> Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/e2d81036
>>
>> Branch: refs/heads/master
>> Commit: e2d8103635db012d13fc6af46d336c96be31d4c1
>> Parents: 8b7ede8
>> Author: Daniel Gruno <hu...@apache.org>
>> Authored: Sun Jun 4 15:45:18 2017 +0200
>> Committer: Daniel Gruno <hu...@apache.org>
>> Committed: Sun Jun 4 15:45:18 2017 +0200
>>
>> ----------------------------------------------------------------------
>>  tools/archiver.py   | 17 ++++-------
>>  tools/generators.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 80 insertions(+), 11 deletions(-)
>> ----------------------------------------------------------------------
>>
>>
>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/archiver.py
>> ----------------------------------------------------------------------
>> diff --git a/tools/archiver.py b/tools/archiver.py
>> index 41933f7..0966b13 100755
>> --- a/tools/archiver.py
>> +++ b/tools/archiver.py
>> @@ -58,6 +58,7 @@ import io
>>  import logging
>>  import traceback
>>  import sys
>> +import generators
>>
>>  # Fetch config
>>  path = os.path.dirname(os.path.realpath(__file__))
>> @@ -316,20 +317,14 @@ class Archiver(object):
>>          if body is not None or attachments:
>>              pmid = mid
>>              try:
>> -                # Use full message as bytes for mid?
>>                  if archiver_generator == "full":
>> -                    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
>> +                    mid = generators.full(msg, body, lid, attachments)
>>                  elif archiver_generator == "medium":
>> -                    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>> -                    xbody += bytes(lid, encoding='ascii')
>> -                    xbody += bytes(mdatestring, encoding='ascii')
>> -                    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>> -                    if attachments:
>> -                        for a in attachments:
>> -                            xbody += bytes(a['hash'], encoding = 'ascii')
>> +                    mid = generators.medium(msg, body, lid, attachments)
>> +                elif archiver_generator == "redundant":
>> +                    mid = generators.redundant(msg, body, lid, attachments)
>>                  else:
>> -                    # Or revert to the old way?
>> -                    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
>> +                    mid = generators.legacy(msg, body, lid, attachments)
>>              except Exception as err:
>>                  if logger:
>>                      logger.warn("Could not generate MID: %s" % err)
>>
>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/generators.py
>> ----------------------------------------------------------------------
>> diff --git a/tools/generators.py b/tools/generators.py
>> new file mode 100644
>> index 0000000..af566fc
>> --- /dev/null
>> +++ b/tools/generators.py
>> @@ -0,0 +1,74 @@
>> +#!/usr/bin/env/python3
>> +# -*- coding: utf-8 -*-
>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>> +# contributor license agreements.  See the NOTICE file distributed with
>> +# this work for additional information regarding copyright ownership.
>> +# The ASF licenses this file to You under the Apache License, Version 2.0
>> +# (the "License"); you may not use this file except in compliance with
>> +# the License.  You may obtain a copy of the License at
>> +#
>> +#     http://www.apache.org/licenses/LICENSE-2.0
>> +#
>> +# Unless required by applicable law or agreed to in writing, software
>> +# distributed under the License is distributed on an "AS IS" BASIS,
>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> +# See the License for the specific language governing permissions and
>> +# limitations under the License.
>> +
>> +"""
>> +This file contains the various ID generators for Pony Mail's archivers.
>> +"""
>> +
>> +import hashlib
>> +import email.utils
>> +
>> +# Full generator: uses the entire email (including server-dependent data)
>> +# This is the recommended generator for single-node setups.
>> +def full(msg, body, lid, attachments):
>> +    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
>> +    return mid
>> +
>> +# Medium: Standard generator
>> +def medium(msg, body, lid, attachments):
>> +    # Use text body
>> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>> +    # Use List ID
>> +    xbody += bytes(lid, encoding='ascii')
>> +    # Use Date header
>> +    xbody += bytes(mdatestring, encoding='ascii')
>> +    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>> +    return mid
>> +
>> +# Redundant: Use data that is guaranteed to be the same across redundant setups
>> +# This is the recommended generator for redundant cluster setups
>> +def redundant(msg, body, lid, attachments):
>> +    # Use text body
>> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>> +    # Use List ID
>> +    xbody += bytes(lid, encoding='ascii')
>> +    # Use Date header
>> +    xbody += bytes(mdatestring, encoding='ascii')
>> +    # Use sender
>> +    sender = msg.get('from', None)
>> +    if sender:
>> +        xbody += bytes(sender, encoding = 'ascii')
>> +    # Use subject
>> +    if subject:
>> +        xbody += bytes(subject, encoding = 'ascii')
>> +    # Use attachment hashes if present
>> +    if attachments:
>> +        for a in attachments:
>> +            xbody += bytes(a['hash'], encoding = 'ascii')
>> +    mid = "r%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>> +    return mid
>> +
>> +
>> +# Old school way of making IDs
>> +def legacy(msg, body, lid, attachments):
> 
> -1
> 
> AFAICT this is not exactly the same as the original code.
> 
>> +    mdate = email.utils.parsedate_tz(msg.get('date'))
>> +    uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is valid
> 
> What happens if either of the previous two lines throws an error?

Good catch! Fixed in 2802e2905.

> 
>> +    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
>> +    return mid
>> +
>> +
>> +
>>


Re: [2/4] incubator-ponymail git commit: split generators into a file of its own

Posted by sebb <se...@gmail.com>.
On 4 June 2017 at 15:02,  <hu...@apache.org> wrote:
> split generators into a file of its own
>
> Also fix up generators:
> - medium goes back to the way it was
> - a new 'redundant' generator for cluster setups
>
>
> Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
> Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/e2d81036
> Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/e2d81036
> Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/e2d81036
>
> Branch: refs/heads/master
> Commit: e2d8103635db012d13fc6af46d336c96be31d4c1
> Parents: 8b7ede8
> Author: Daniel Gruno <hu...@apache.org>
> Authored: Sun Jun 4 15:45:18 2017 +0200
> Committer: Daniel Gruno <hu...@apache.org>
> Committed: Sun Jun 4 15:45:18 2017 +0200
>
> ----------------------------------------------------------------------
>  tools/archiver.py   | 17 ++++-------
>  tools/generators.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 80 insertions(+), 11 deletions(-)
> ----------------------------------------------------------------------
>
>
> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/archiver.py
> ----------------------------------------------------------------------
> diff --git a/tools/archiver.py b/tools/archiver.py
> index 41933f7..0966b13 100755
> --- a/tools/archiver.py
> +++ b/tools/archiver.py
> @@ -58,6 +58,7 @@ import io
>  import logging
>  import traceback
>  import sys
> +import generators
>
>  # Fetch config
>  path = os.path.dirname(os.path.realpath(__file__))
> @@ -316,20 +317,14 @@ class Archiver(object):
>          if body is not None or attachments:
>              pmid = mid
>              try:
> -                # Use full message as bytes for mid?
>                  if archiver_generator == "full":
> -                    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
> +                    mid = generators.full(msg, body, lid, attachments)
>                  elif archiver_generator == "medium":
> -                    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
> -                    xbody += bytes(lid, encoding='ascii')
> -                    xbody += bytes(mdatestring, encoding='ascii')
> -                    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
> -                    if attachments:
> -                        for a in attachments:
> -                            xbody += bytes(a['hash'], encoding = 'ascii')
> +                    mid = generators.medium(msg, body, lid, attachments)
> +                elif archiver_generator == "redundant":
> +                    mid = generators.redundant(msg, body, lid, attachments)
>                  else:
> -                    # Or revert to the old way?
> -                    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
> +                    mid = generators.legacy(msg, body, lid, attachments)
>              except Exception as err:
>                  if logger:
>                      logger.warn("Could not generate MID: %s" % err)
>
> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/generators.py
> ----------------------------------------------------------------------
> diff --git a/tools/generators.py b/tools/generators.py
> new file mode 100644
> index 0000000..af566fc
> --- /dev/null
> +++ b/tools/generators.py
> @@ -0,0 +1,74 @@
> +#!/usr/bin/env/python3
> +# -*- coding: utf-8 -*-
> +# Licensed to the Apache Software Foundation (ASF) under one or more
> +# contributor license agreements.  See the NOTICE file distributed with
> +# this work for additional information regarding copyright ownership.
> +# The ASF licenses this file to You under the Apache License, Version 2.0
> +# (the "License"); you may not use this file except in compliance with
> +# the License.  You may obtain a copy of the License at
> +#
> +#     http://www.apache.org/licenses/LICENSE-2.0
> +#
> +# Unless required by applicable law or agreed to in writing, software
> +# distributed under the License is distributed on an "AS IS" BASIS,
> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> +# See the License for the specific language governing permissions and
> +# limitations under the License.
> +
> +"""
> +This file contains the various ID generators for Pony Mail's archivers.
> +"""
> +
> +import hashlib
> +import email.utils
> +
> +# Full generator: uses the entire email (including server-dependent data)
> +# This is the recommended generator for single-node setups.
> +def full(msg, body, lid, attachments):
> +    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
> +    return mid
> +
> +# Medium: Standard generator
> +def medium(msg, body, lid, attachments):
> +    # Use text body
> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
> +    # Use List ID
> +    xbody += bytes(lid, encoding='ascii')
> +    # Use Date header
> +    xbody += bytes(mdatestring, encoding='ascii')
> +    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
> +    return mid
> +
> +# Redundant: Use data that is guaranteed to be the same across redundant setups
> +# This is the recommended generator for redundant cluster setups
> +def redundant(msg, body, lid, attachments):
> +    # Use text body
> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
> +    # Use List ID
> +    xbody += bytes(lid, encoding='ascii')
> +    # Use Date header
> +    xbody += bytes(mdatestring, encoding='ascii')
> +    # Use sender
> +    sender = msg.get('from', None)
> +    if sender:
> +        xbody += bytes(sender, encoding = 'ascii')
> +    # Use subject
> +    if subject:
> +        xbody += bytes(subject, encoding = 'ascii')
> +    # Use attachment hashes if present
> +    if attachments:
> +        for a in attachments:
> +            xbody += bytes(a['hash'], encoding = 'ascii')
> +    mid = "r%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
> +    return mid
> +
> +
> +# Old school way of making IDs
> +def legacy(msg, body, lid, attachments):

-1

AFAICT this is not exactly the same as the original code.

> +    mdate = email.utils.parsedate_tz(msg.get('date'))
> +    uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is valid

What happens if either of the previous two lines throws an error?

> +    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
> +    return mid
> +
> +
> +
>

Re: [2/4] incubator-ponymail git commit: split generators into a file of its own

Posted by sebb <se...@gmail.com>.
On 4 June 2017 at 23:16, Daniel Gruno <hu...@apache.org> wrote:
> On 06/05/2017 12:07 AM, sebb wrote:
>> On 4 June 2017 at 15:02,  <hu...@apache.org> wrote:
>>> split generators into a file of its own
>>>
>>> Also fix up generators:
>>> - medium goes back to the way it was
>>
>> -1
>>
>> This is a very confusing change.
>>
>> The change to the medium generator should be reverted as a separate
>> commit, and the other changes added separately
>
> How would I go about dealing with that? I understand your objection to
> the commit style here, and I agree it should have been two separate
> commits, but I'm not sure I know how to rework that now.

Dunno, I'm not a Git expert.

>>
>>> - a new 'redundant' generator for cluster setups
>>>
>>>
>>> Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
>>> Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/e2d81036
>>> Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/e2d81036
>>> Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/e2d81036
>>>
>>> Branch: refs/heads/master
>>> Commit: e2d8103635db012d13fc6af46d336c96be31d4c1
>>> Parents: 8b7ede8
>>> Author: Daniel Gruno <hu...@apache.org>
>>> Authored: Sun Jun 4 15:45:18 2017 +0200
>>> Committer: Daniel Gruno <hu...@apache.org>
>>> Committed: Sun Jun 4 15:45:18 2017 +0200
>>>
>>> ----------------------------------------------------------------------
>>>  tools/archiver.py   | 17 ++++-------
>>>  tools/generators.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>  2 files changed, 80 insertions(+), 11 deletions(-)
>>> ----------------------------------------------------------------------
>>>
>>>
>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/archiver.py
>>> ----------------------------------------------------------------------
>>> diff --git a/tools/archiver.py b/tools/archiver.py
>>> index 41933f7..0966b13 100755
>>> --- a/tools/archiver.py
>>> +++ b/tools/archiver.py
>>> @@ -58,6 +58,7 @@ import io
>>>  import logging
>>>  import traceback
>>>  import sys
>>> +import generators
>>>
>>>  # Fetch config
>>>  path = os.path.dirname(os.path.realpath(__file__))
>>> @@ -316,20 +317,14 @@ class Archiver(object):
>>>          if body is not None or attachments:
>>>              pmid = mid
>>>              try:
>>> -                # Use full message as bytes for mid?
>>>                  if archiver_generator == "full":
>>> -                    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
>>> +                    mid = generators.full(msg, body, lid, attachments)
>>>                  elif archiver_generator == "medium":
>>> -                    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>>> -                    xbody += bytes(lid, encoding='ascii')
>>> -                    xbody += bytes(mdatestring, encoding='ascii')
>>> -                    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>>> -                    if attachments:
>>> -                        for a in attachments:
>>> -                            xbody += bytes(a['hash'], encoding = 'ascii')
>>> +                    mid = generators.medium(msg, body, lid, attachments)
>>> +                elif archiver_generator == "redundant":
>>> +                    mid = generators.redundant(msg, body, lid, attachments)
>>>                  else:
>>> -                    # Or revert to the old way?
>>> -                    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
>>> +                    mid = generators.legacy(msg, body, lid, attachments)
>>>              except Exception as err:
>>>                  if logger:
>>>                      logger.warn("Could not generate MID: %s" % err)
>>>
>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/generators.py
>>> ----------------------------------------------------------------------
>>> diff --git a/tools/generators.py b/tools/generators.py
>>> new file mode 100644
>>> index 0000000..af566fc
>>> --- /dev/null
>>> +++ b/tools/generators.py
>>> @@ -0,0 +1,74 @@
>>> +#!/usr/bin/env/python3
>>> +# -*- coding: utf-8 -*-
>>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>>> +# contributor license agreements.  See the NOTICE file distributed with
>>> +# this work for additional information regarding copyright ownership.
>>> +# The ASF licenses this file to You under the Apache License, Version 2.0
>>> +# (the "License"); you may not use this file except in compliance with
>>> +# the License.  You may obtain a copy of the License at
>>> +#
>>> +#     http://www.apache.org/licenses/LICENSE-2.0
>>> +#
>>> +# Unless required by applicable law or agreed to in writing, software
>>> +# distributed under the License is distributed on an "AS IS" BASIS,
>>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>> +# See the License for the specific language governing permissions and
>>> +# limitations under the License.
>>> +
>>> +"""
>>> +This file contains the various ID generators for Pony Mail's archivers.
>>> +"""
>>> +
>>> +import hashlib
>>> +import email.utils
>>> +
>>> +# Full generator: uses the entire email (including server-dependent data)
>>> +# This is the recommended generator for single-node setups.
>>> +def full(msg, body, lid, attachments):
>>> +    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
>>> +    return mid
>>> +
>>> +# Medium: Standard generator
>>> +def medium(msg, body, lid, attachments):
>>> +    # Use text body
>>> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>>> +    # Use List ID
>>> +    xbody += bytes(lid, encoding='ascii')
>>> +    # Use Date header
>>> +    xbody += bytes(mdatestring, encoding='ascii')
>>> +    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>>> +    return mid
>>> +
>>> +# Redundant: Use data that is guaranteed to be the same across redundant setups
>>> +# This is the recommended generator for redundant cluster setups
>>> +def redundant(msg, body, lid, attachments):
>>> +    # Use text body
>>> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>>> +    # Use List ID
>>> +    xbody += bytes(lid, encoding='ascii')
>>> +    # Use Date header
>>> +    xbody += bytes(mdatestring, encoding='ascii')
>>> +    # Use sender
>>> +    sender = msg.get('from', None)
>>> +    if sender:
>>> +        xbody += bytes(sender, encoding = 'ascii')
>>> +    # Use subject
>>> +    if subject:
>>> +        xbody += bytes(subject, encoding = 'ascii')
>>> +    # Use attachment hashes if present
>>> +    if attachments:
>>> +        for a in attachments:
>>> +            xbody += bytes(a['hash'], encoding = 'ascii')
>>> +    mid = "r%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>>> +    return mid
>>> +
>>> +
>>> +# Old school way of making IDs
>>> +def legacy(msg, body, lid, attachments):
>>> +    mdate = email.utils.parsedate_tz(msg.get('date'))
>>> +    uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is valid
>>> +    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
>>> +    return mid
>>> +
>>> +
>>> +
>>>
>

Re: [2/4] incubator-ponymail git commit: split generators into a file of its own

Posted by Daniel Gruno <hu...@apache.org>.
On 06/05/2017 12:07 AM, sebb wrote:
> On 4 June 2017 at 15:02,  <hu...@apache.org> wrote:
>> split generators into a file of its own
>>
>> Also fix up generators:
>> - medium goes back to the way it was
> 
> -1
> 
> This is a very confusing change.
> 
> The change to the medium generator should be reverted as a separate
> commit, and the other changes added separately

How would I go about dealing with that? I understand your objection to
the commit style here, and I agree it should have been two separate
commits, but I'm not sure I know how to rework that now.

> 
>> - a new 'redundant' generator for cluster setups
>>
>>
>> Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
>> Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/e2d81036
>> Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/e2d81036
>> Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/e2d81036
>>
>> Branch: refs/heads/master
>> Commit: e2d8103635db012d13fc6af46d336c96be31d4c1
>> Parents: 8b7ede8
>> Author: Daniel Gruno <hu...@apache.org>
>> Authored: Sun Jun 4 15:45:18 2017 +0200
>> Committer: Daniel Gruno <hu...@apache.org>
>> Committed: Sun Jun 4 15:45:18 2017 +0200
>>
>> ----------------------------------------------------------------------
>>  tools/archiver.py   | 17 ++++-------
>>  tools/generators.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 80 insertions(+), 11 deletions(-)
>> ----------------------------------------------------------------------
>>
>>
>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/archiver.py
>> ----------------------------------------------------------------------
>> diff --git a/tools/archiver.py b/tools/archiver.py
>> index 41933f7..0966b13 100755
>> --- a/tools/archiver.py
>> +++ b/tools/archiver.py
>> @@ -58,6 +58,7 @@ import io
>>  import logging
>>  import traceback
>>  import sys
>> +import generators
>>
>>  # Fetch config
>>  path = os.path.dirname(os.path.realpath(__file__))
>> @@ -316,20 +317,14 @@ class Archiver(object):
>>          if body is not None or attachments:
>>              pmid = mid
>>              try:
>> -                # Use full message as bytes for mid?
>>                  if archiver_generator == "full":
>> -                    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
>> +                    mid = generators.full(msg, body, lid, attachments)
>>                  elif archiver_generator == "medium":
>> -                    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>> -                    xbody += bytes(lid, encoding='ascii')
>> -                    xbody += bytes(mdatestring, encoding='ascii')
>> -                    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>> -                    if attachments:
>> -                        for a in attachments:
>> -                            xbody += bytes(a['hash'], encoding = 'ascii')
>> +                    mid = generators.medium(msg, body, lid, attachments)
>> +                elif archiver_generator == "redundant":
>> +                    mid = generators.redundant(msg, body, lid, attachments)
>>                  else:
>> -                    # Or revert to the old way?
>> -                    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
>> +                    mid = generators.legacy(msg, body, lid, attachments)
>>              except Exception as err:
>>                  if logger:
>>                      logger.warn("Could not generate MID: %s" % err)
>>
>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/generators.py
>> ----------------------------------------------------------------------
>> diff --git a/tools/generators.py b/tools/generators.py
>> new file mode 100644
>> index 0000000..af566fc
>> --- /dev/null
>> +++ b/tools/generators.py
>> @@ -0,0 +1,74 @@
>> +#!/usr/bin/env/python3
>> +# -*- coding: utf-8 -*-
>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>> +# contributor license agreements.  See the NOTICE file distributed with
>> +# this work for additional information regarding copyright ownership.
>> +# The ASF licenses this file to You under the Apache License, Version 2.0
>> +# (the "License"); you may not use this file except in compliance with
>> +# the License.  You may obtain a copy of the License at
>> +#
>> +#     http://www.apache.org/licenses/LICENSE-2.0
>> +#
>> +# Unless required by applicable law or agreed to in writing, software
>> +# distributed under the License is distributed on an "AS IS" BASIS,
>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> +# See the License for the specific language governing permissions and
>> +# limitations under the License.
>> +
>> +"""
>> +This file contains the various ID generators for Pony Mail's archivers.
>> +"""
>> +
>> +import hashlib
>> +import email.utils
>> +
>> +# Full generator: uses the entire email (including sever-depenent data)
>> +# This is the recommended generator for single-node setups.
>> +def full(msg, body, lid, attachments):
>> +    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
>> +    return mid
>> +
>> +# Medium: Standard generator
>> +def medium(msg, body, lid, attachments):
>> +    # Use text body
>> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>> +    # Use List ID
>> +    xbody += bytes(lid, encoding='ascii')
>> +    # Use Date header
>> +    xbody += bytes(mdatestring, encoding='ascii')
>> +    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>> +    return mid
>> +
>> +# Redundant: Use data that is guaranteed to be the same across redundant setups
>> +# This is the recommended generator for redundant cluster setups
>> +def redundant(msg, body, lid, attachments):
>> +    # Use text body
>> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>> +    # Use List ID
>> +    xbody += bytes(lid, encoding='ascii')
>> +    # Use Date header
>> +    xbody += bytes(mdatestring, encoding='ascii')
>> +    # Use sender
>> +    sender = msg.get('from', None)
>> +    if sender:
>> +        xbody += bytes(sender, encoding = 'ascii')
>> +    # Use subject
>> +    if subject:
>> +        xbody += bytes(subject, encoding = 'ascii')
>> +    # Use attachment hashes if present
>> +    if attachments:
>> +        for a in attachments:
>> +            xbody += bytes(a['hash'], encoding = 'ascii')
>> +    mid = "r%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>> +    return mid
>> +
>> +
>> +# Old school way of making IDs
>> +def legacy(msg, body, lid, attachments):
>> +    mdate = email.utils.parsedate_tz(msg.get('date'))
>> +    uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is valid
>> +    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
>> +    return mid
>> +
>> +
>> +
>>


Re: [2/4] incubator-ponymail git commit: split generators into a file of its own

Posted by sebb <se...@gmail.com>.
On 4 June 2017 at 15:02,  <hu...@apache.org> wrote:
> split generators into a file of its own
>
> Also fix up generators:
> - medium goes back to the way it was

-1

This is a very confusing change.

The change to the medium generator should be reverted as a separate
commit, and the other changes should be added separately.

> - a new 'redundant' generator for cluster setups
>
>
> Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
> Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/e2d81036
> Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/e2d81036
> Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/e2d81036
>
> Branch: refs/heads/master
> Commit: e2d8103635db012d13fc6af46d336c96be31d4c1
> Parents: 8b7ede8
> Author: Daniel Gruno <hu...@apache.org>
> Authored: Sun Jun 4 15:45:18 2017 +0200
> Committer: Daniel Gruno <hu...@apache.org>
> Committed: Sun Jun 4 15:45:18 2017 +0200
>
> ----------------------------------------------------------------------
>  tools/archiver.py   | 17 ++++-------
>  tools/generators.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 80 insertions(+), 11 deletions(-)
> ----------------------------------------------------------------------
>
>
> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/archiver.py
> ----------------------------------------------------------------------
> diff --git a/tools/archiver.py b/tools/archiver.py
> index 41933f7..0966b13 100755
> --- a/tools/archiver.py
> +++ b/tools/archiver.py
> @@ -58,6 +58,7 @@ import io
>  import logging
>  import traceback
>  import sys
> +import generators
>
>  # Fetch config
>  path = os.path.dirname(os.path.realpath(__file__))
> @@ -316,20 +317,14 @@ class Archiver(object):
>          if body is not None or attachments:
>              pmid = mid
>              try:
> -                # Use full message as bytes for mid?
>                  if archiver_generator == "full":
> -                    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
> +                    mid = generators.full(msg, body, lid, attachments)
>                  elif archiver_generator == "medium":
> -                    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
> -                    xbody += bytes(lid, encoding='ascii')
> -                    xbody += bytes(mdatestring, encoding='ascii')
> -                    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
> -                    if attachments:
> -                        for a in attachments:
> -                            xbody += bytes(a['hash'], encoding = 'ascii')
> +                    mid = generators.medium(msg, body, lid, attachments)
> +                elif archiver_generator == "redundant":
> +                    mid = generators.redundant(msg, body, lid, attachments)
>                  else:
> -                    # Or revert to the old way?
> -                    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
> +                    mid = generators.legacy(msg, body, lid, attachments)
>              except Exception as err:
>                  if logger:
>                      logger.warn("Could not generate MID: %s" % err)
>
> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/generators.py
> ----------------------------------------------------------------------
> diff --git a/tools/generators.py b/tools/generators.py
> new file mode 100644
> index 0000000..af566fc
> --- /dev/null
> +++ b/tools/generators.py
> @@ -0,0 +1,74 @@
> +#!/usr/bin/env/python3
> +# -*- coding: utf-8 -*-
> +# Licensed to the Apache Software Foundation (ASF) under one or more
> +# contributor license agreements.  See the NOTICE file distributed with
> +# this work for additional information regarding copyright ownership.
> +# The ASF licenses this file to You under the Apache License, Version 2.0
> +# (the "License"); you may not use this file except in compliance with
> +# the License.  You may obtain a copy of the License at
> +#
> +#     http://www.apache.org/licenses/LICENSE-2.0
> +#
> +# Unless required by applicable law or agreed to in writing, software
> +# distributed under the License is distributed on an "AS IS" BASIS,
> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> +# See the License for the specific language governing permissions and
> +# limitations under the License.
> +
> +"""
> +This file contains the various ID generators for Pony Mail's archivers.
> +"""
> +
> +import hashlib
> +import email.utils
> +
> +# Full generator: uses the entire email (including sever-depenent data)
> +# This is the recommended generator for single-node setups.
> +def full(msg, body, lid, attachments):
> +    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
> +    return mid
> +
> +# Medium: Standard generator
> +def medium(msg, body, lid, attachments):
> +    # Use text body
> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
> +    # Use List ID
> +    xbody += bytes(lid, encoding='ascii')
> +    # Use Date header
> +    xbody += bytes(mdatestring, encoding='ascii')
> +    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
> +    return mid
> +
> +# Redundant: Use data that is guaranteed to be the same across redundant setups
> +# This is the recommended generator for redundant cluster setups
> +def redundant(msg, body, lid, attachments):
> +    # Use text body
> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
> +    # Use List ID
> +    xbody += bytes(lid, encoding='ascii')
> +    # Use Date header
> +    xbody += bytes(mdatestring, encoding='ascii')
> +    # Use sender
> +    sender = msg.get('from', None)
> +    if sender:
> +        xbody += bytes(sender, encoding = 'ascii')
> +    # Use subject
> +    if subject:
> +        xbody += bytes(subject, encoding = 'ascii')
> +    # Use attachment hashes if present
> +    if attachments:
> +        for a in attachments:
> +            xbody += bytes(a['hash'], encoding = 'ascii')
> +    mid = "r%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
> +    return mid
> +
> +
> +# Old school way of making IDs
> +def legacy(msg, body, lid, attachments):
> +    mdate = email.utils.parsedate_tz(msg.get('date'))
> +    uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is valid
> +    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
> +    return mid
> +
> +
> +
>

Re: [2/4] incubator-ponymail git commit: split generators into a file of its own

Posted by sebb <se...@gmail.com>.
On 5 June 2017 at 10:04, Daniel Gruno <hu...@apache.org> wrote:
> On 06/05/2017 11:01 AM, sebb wrote:
>> On 5 June 2017 at 09:17, Daniel Gruno <hu...@apache.org> wrote:
>>> I missed a git add in the last commit, sorry. Will add and recommit now.
>>
>> Have you tested that the change is complete?
>>
>> I'm still getting an error.
>
> Tested it with a bunch of mbox files, some with, some without headers,
> subjects, senders etc. All seemed to work.
>
> What is the specific error you are getting, and which generator are you
> using?

I am using the medium generator.

NameError: name 'msg_metadata' is not defined

>>
>>> On 06/05/2017 01:57 AM, sebb wrote:
>>>> On 4 June 2017 at 15:02,  <hu...@apache.org> wrote:
>>>>> split generators into a file of its own
>>>>>
>>>>> Also fix up generators:
>>>>> - medium goes back to the way it was
>>>>> - a new 'redundant' generator for cluster setups
>>>>>
>>>>>
>>>>> Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
>>>>> Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/e2d81036
>>>>> Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/e2d81036
>>>>> Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/e2d81036
>>>>>
>>>>> Branch: refs/heads/master
>>>>> Commit: e2d8103635db012d13fc6af46d336c96be31d4c1
>>>>> Parents: 8b7ede8
>>>>> Author: Daniel Gruno <hu...@apache.org>
>>>>> Authored: Sun Jun 4 15:45:18 2017 +0200
>>>>> Committer: Daniel Gruno <hu...@apache.org>
>>>>> Committed: Sun Jun 4 15:45:18 2017 +0200
>>>>>
>>>>> ----------------------------------------------------------------------
>>>>>  tools/archiver.py   | 17 ++++-------
>>>>>  tools/generators.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>  2 files changed, 80 insertions(+), 11 deletions(-)
>>>>> ----------------------------------------------------------------------
>>>>>
>>>>>
>>>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/archiver.py
>>>>> ----------------------------------------------------------------------
>>>>> diff --git a/tools/archiver.py b/tools/archiver.py
>>>>> index 41933f7..0966b13 100755
>>>>> --- a/tools/archiver.py
>>>>> +++ b/tools/archiver.py
>>>>> @@ -58,6 +58,7 @@ import io
>>>>>  import logging
>>>>>  import traceback
>>>>>  import sys
>>>>> +import generators
>>>>>
>>>>>  # Fetch config
>>>>>  path = os.path.dirname(os.path.realpath(__file__))
>>>>> @@ -316,20 +317,14 @@ class Archiver(object):
>>>>>          if body is not None or attachments:
>>>>>              pmid = mid
>>>>>              try:
>>>>> -                # Use full message as bytes for mid?
>>>>>                  if archiver_generator == "full":
>>>>> -                    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
>>>>> +                    mid = generators.full(msg, body, lid, attachments)
>>>>>                  elif archiver_generator == "medium":
>>>>> -                    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>>>>> -                    xbody += bytes(lid, encoding='ascii')
>>>>> -                    xbody += bytes(mdatestring, encoding='ascii')
>>>>> -                    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>>>>> -                    if attachments:
>>>>> -                        for a in attachments:
>>>>> -                            xbody += bytes(a['hash'], encoding = 'ascii')
>>>>> +                    mid = generators.medium(msg, body, lid, attachments)
>>>>> +                elif archiver_generator == "redundant":
>>>>> +                    mid = generators.redundant(msg, body, lid, attachments)
>>>>>                  else:
>>>>> -                    # Or revert to the old way?
>>>>> -                    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
>>>>> +                    mid = generators.legacy(msg, body, lid, attachments)
>>>>>              except Exception as err:
>>>>>                  if logger:
>>>>>                      logger.warn("Could not generate MID: %s" % err)
>>>>>
>>>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/generators.py
>>>>> ----------------------------------------------------------------------
>>>>> diff --git a/tools/generators.py b/tools/generators.py
>>>>> new file mode 100644
>>>>> index 0000000..af566fc
>>>>> --- /dev/null
>>>>> +++ b/tools/generators.py
>>>>> @@ -0,0 +1,74 @@
>>>>> +#!/usr/bin/env/python3
>>>>> +# -*- coding: utf-8 -*-
>>>>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>>>>> +# contributor license agreements.  See the NOTICE file distributed with
>>>>> +# this work for additional information regarding copyright ownership.
>>>>> +# The ASF licenses this file to You under the Apache License, Version 2.0
>>>>> +# (the "License"); you may not use this file except in compliance with
>>>>> +# the License.  You may obtain a copy of the License at
>>>>> +#
>>>>> +#     http://www.apache.org/licenses/LICENSE-2.0
>>>>> +#
>>>>> +# Unless required by applicable law or agreed to in writing, software
>>>>> +# distributed under the License is distributed on an "AS IS" BASIS,
>>>>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>>>> +# See the License for the specific language governing permissions and
>>>>> +# limitations under the License.
>>>>> +
>>>>> +"""
>>>>> +This file contains the various ID generators for Pony Mail's archivers.
>>>>> +"""
>>>>> +
>>>>> +import hashlib
>>>>> +import email.utils
>>>>> +
>>>>> +# Full generator: uses the entire email (including sever-depenent data)
>>>>> +# This is the recommended generator for single-node setups.
>>>>> +def full(msg, body, lid, attachments):
>>>>> +    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
>>>>> +    return mid
>>>>> +
>>>>> +# Medium: Standard generator
>>>>> +def medium(msg, body, lid, attachments):
>>>>> +    # Use text body
>>>>> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>>>>> +    # Use List ID
>>>>> +    xbody += bytes(lid, encoding='ascii')
>>>>> +    # Use Date header
>>>>> +    xbody += bytes(mdatestring, encoding='ascii')
>>>>
>>>> mdatestring is not defined
>>>>
>>>>> +    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>>>>> +    return mid
>>>>> +
>>>>> +# Redundant: Use data that is guaranteed to be the same across redundant setups
>>>>> +# This is the recommended generator for redundant cluster setups
>>>>> +def redundant(msg, body, lid, attachments):
>>>>> +    # Use text body
>>>>> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>>>>> +    # Use List ID
>>>>> +    xbody += bytes(lid, encoding='ascii')
>>>>> +    # Use Date header
>>>>> +    xbody += bytes(mdatestring, encoding='ascii')
>>>>
>>>> mdatestring is not defined
>>>>
>>>>> +    # Use sender
>>>>> +    sender = msg.get('from', None)
>>>>> +    if sender:
>>>>> +        xbody += bytes(sender, encoding = 'ascii')
>>>>> +    # Use subject
>>>>> +    if subject:
>>>>> +        xbody += bytes(subject, encoding = 'ascii')
>>>>> +    # Use attachment hashes if present
>>>>> +    if attachments:
>>>>> +        for a in attachments:
>>>>> +            xbody += bytes(a['hash'], encoding = 'ascii')
>>>>> +    mid = "r%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>>>>> +    return mid
>>>>> +
>>>>> +
>>>>> +# Old school way of making IDs
>>>>> +def legacy(msg, body, lid, attachments):
>>>>> +    mdate = email.utils.parsedate_tz(msg.get('date'))
>>>>> +    uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is valid
>>>>> +    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
>>>>> +    return mid
>>>>> +
>>>>> +
>>>>> +
>>>>
>>>> Have the generators been tested?
>>>>
>>>
>

Re: [2/4] incubator-ponymail git commit: split generators into a file of its own

Posted by Daniel Gruno <hu...@apache.org>.
On 06/05/2017 11:01 AM, sebb wrote:
> On 5 June 2017 at 09:17, Daniel Gruno <hu...@apache.org> wrote:
>> I missed a git add in the last commit, sorry. Will add and recommit now.
> 
> Have you tested that the change is complete?
> 
> I'm still getting an error.

Tested it with a bunch of mbox files, some with, some without headers,
subjects, senders etc. All seemed to work.

What is the specific error you are getting, and which generator are you
using?

> 
>> On 06/05/2017 01:57 AM, sebb wrote:
>>> On 4 June 2017 at 15:02,  <hu...@apache.org> wrote:
>>>> split generators into a file of its own
>>>>
>>>> Also fix up generators:
>>>> - medium goes back to the way it was
>>>> - a new 'redundant' generator for cluster setups
>>>>
>>>>
>>>> Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
>>>> Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/e2d81036
>>>> Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/e2d81036
>>>> Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/e2d81036
>>>>
>>>> Branch: refs/heads/master
>>>> Commit: e2d8103635db012d13fc6af46d336c96be31d4c1
>>>> Parents: 8b7ede8
>>>> Author: Daniel Gruno <hu...@apache.org>
>>>> Authored: Sun Jun 4 15:45:18 2017 +0200
>>>> Committer: Daniel Gruno <hu...@apache.org>
>>>> Committed: Sun Jun 4 15:45:18 2017 +0200
>>>>
>>>> ----------------------------------------------------------------------
>>>>  tools/archiver.py   | 17 ++++-------
>>>>  tools/generators.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  2 files changed, 80 insertions(+), 11 deletions(-)
>>>> ----------------------------------------------------------------------
>>>>
>>>>
>>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/archiver.py
>>>> ----------------------------------------------------------------------
>>>> diff --git a/tools/archiver.py b/tools/archiver.py
>>>> index 41933f7..0966b13 100755
>>>> --- a/tools/archiver.py
>>>> +++ b/tools/archiver.py
>>>> @@ -58,6 +58,7 @@ import io
>>>>  import logging
>>>>  import traceback
>>>>  import sys
>>>> +import generators
>>>>
>>>>  # Fetch config
>>>>  path = os.path.dirname(os.path.realpath(__file__))
>>>> @@ -316,20 +317,14 @@ class Archiver(object):
>>>>          if body is not None or attachments:
>>>>              pmid = mid
>>>>              try:
>>>> -                # Use full message as bytes for mid?
>>>>                  if archiver_generator == "full":
>>>> -                    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
>>>> +                    mid = generators.full(msg, body, lid, attachments)
>>>>                  elif archiver_generator == "medium":
>>>> -                    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>>>> -                    xbody += bytes(lid, encoding='ascii')
>>>> -                    xbody += bytes(mdatestring, encoding='ascii')
>>>> -                    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>>>> -                    if attachments:
>>>> -                        for a in attachments:
>>>> -                            xbody += bytes(a['hash'], encoding = 'ascii')
>>>> +                    mid = generators.medium(msg, body, lid, attachments)
>>>> +                elif archiver_generator == "redundant":
>>>> +                    mid = generators.redundant(msg, body, lid, attachments)
>>>>                  else:
>>>> -                    # Or revert to the old way?
>>>> -                    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
>>>> +                    mid = generators.legacy(msg, body, lid, attachments)
>>>>              except Exception as err:
>>>>                  if logger:
>>>>                      logger.warn("Could not generate MID: %s" % err)
>>>>
>>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/generators.py
>>>> ----------------------------------------------------------------------
>>>> diff --git a/tools/generators.py b/tools/generators.py
>>>> new file mode 100644
>>>> index 0000000..af566fc
>>>> --- /dev/null
>>>> +++ b/tools/generators.py
>>>> @@ -0,0 +1,74 @@
>>>> +#!/usr/bin/env/python3
>>>> +# -*- coding: utf-8 -*-
>>>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>>>> +# contributor license agreements.  See the NOTICE file distributed with
>>>> +# this work for additional information regarding copyright ownership.
>>>> +# The ASF licenses this file to You under the Apache License, Version 2.0
>>>> +# (the "License"); you may not use this file except in compliance with
>>>> +# the License.  You may obtain a copy of the License at
>>>> +#
>>>> +#     http://www.apache.org/licenses/LICENSE-2.0
>>>> +#
>>>> +# Unless required by applicable law or agreed to in writing, software
>>>> +# distributed under the License is distributed on an "AS IS" BASIS,
>>>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>>> +# See the License for the specific language governing permissions and
>>>> +# limitations under the License.
>>>> +
>>>> +"""
>>>> +This file contains the various ID generators for Pony Mail's archivers.
>>>> +"""
>>>> +
>>>> +import hashlib
>>>> +import email.utils
>>>> +
>>>> +# Full generator: uses the entire email (including sever-depenent data)
>>>> +# This is the recommended generator for single-node setups.
>>>> +def full(msg, body, lid, attachments):
>>>> +    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
>>>> +    return mid
>>>> +
>>>> +# Medium: Standard generator
>>>> +def medium(msg, body, lid, attachments):
>>>> +    # Use text body
>>>> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>>>> +    # Use List ID
>>>> +    xbody += bytes(lid, encoding='ascii')
>>>> +    # Use Date header
>>>> +    xbody += bytes(mdatestring, encoding='ascii')
>>>
>>> mdatestring is not defined
>>>
>>>> +    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>>>> +    return mid
>>>> +
>>>> +# Redundant: Use data that is guaranteed to be the same across redundant setups
>>>> +# This is the recommended generator for redundant cluster setups
>>>> +def redundant(msg, body, lid, attachments):
>>>> +    # Use text body
>>>> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>>>> +    # Use List ID
>>>> +    xbody += bytes(lid, encoding='ascii')
>>>> +    # Use Date header
>>>> +    xbody += bytes(mdatestring, encoding='ascii')
>>>
>>> mdatestring is not defined
>>>
>>>> +    # Use sender
>>>> +    sender = msg.get('from', None)
>>>> +    if sender:
>>>> +        xbody += bytes(sender, encoding = 'ascii')
>>>> +    # Use subject
>>>> +    if subject:
>>>> +        xbody += bytes(subject, encoding = 'ascii')
>>>> +    # Use attachment hashes if present
>>>> +    if attachments:
>>>> +        for a in attachments:
>>>> +            xbody += bytes(a['hash'], encoding = 'ascii')
>>>> +    mid = "r%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>>>> +    return mid
>>>> +
>>>> +
>>>> +# Old school way of making IDs
>>>> +def legacy(msg, body, lid, attachments):
>>>> +    mdate = email.utils.parsedate_tz(msg.get('date'))
>>>> +    uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is valid
>>>> +    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
>>>> +    return mid
>>>> +
>>>> +
>>>> +
>>>
>>> Have the generators been tested?
>>>
>>


Re: [2/4] incubator-ponymail git commit: split generators into a file of its own

Posted by sebb <se...@gmail.com>.
On 5 June 2017 at 09:17, Daniel Gruno <hu...@apache.org> wrote:
> I missed a git add in the last commit, sorry. Will add and recommit now.

Have you tested that the change is complete?

I'm still getting an error.

> On 06/05/2017 01:57 AM, sebb wrote:
>> On 4 June 2017 at 15:02,  <hu...@apache.org> wrote:
>>> split generators into a file of its own
>>>
>>> Also fix up generators:
>>> - medium goes back to the way it was
>>> - a new 'redundant' generator for cluster setups
>>>
>>>
>>> Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
>>> Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/e2d81036
>>> Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/e2d81036
>>> Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/e2d81036
>>>
>>> Branch: refs/heads/master
>>> Commit: e2d8103635db012d13fc6af46d336c96be31d4c1
>>> Parents: 8b7ede8
>>> Author: Daniel Gruno <hu...@apache.org>
>>> Authored: Sun Jun 4 15:45:18 2017 +0200
>>> Committer: Daniel Gruno <hu...@apache.org>
>>> Committed: Sun Jun 4 15:45:18 2017 +0200
>>>
>>> ----------------------------------------------------------------------
>>>  tools/archiver.py   | 17 ++++-------
>>>  tools/generators.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>  2 files changed, 80 insertions(+), 11 deletions(-)
>>> ----------------------------------------------------------------------
>>>
>>>
>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/archiver.py
>>> ----------------------------------------------------------------------
>>> diff --git a/tools/archiver.py b/tools/archiver.py
>>> index 41933f7..0966b13 100755
>>> --- a/tools/archiver.py
>>> +++ b/tools/archiver.py
>>> @@ -58,6 +58,7 @@ import io
>>>  import logging
>>>  import traceback
>>>  import sys
>>> +import generators
>>>
>>>  # Fetch config
>>>  path = os.path.dirname(os.path.realpath(__file__))
>>> @@ -316,20 +317,14 @@ class Archiver(object):
>>>          if body is not None or attachments:
>>>              pmid = mid
>>>              try:
>>> -                # Use full message as bytes for mid?
>>>                  if archiver_generator == "full":
>>> -                    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
>>> +                    mid = generators.full(msg, body, lid, attachments)
>>>                  elif archiver_generator == "medium":
>>> -                    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>>> -                    xbody += bytes(lid, encoding='ascii')
>>> -                    xbody += bytes(mdatestring, encoding='ascii')
>>> -                    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>>> -                    if attachments:
>>> -                        for a in attachments:
>>> -                            xbody += bytes(a['hash'], encoding = 'ascii')
>>> +                    mid = generators.medium(msg, body, lid, attachments)
>>> +                elif archiver_generator == "redundant":
>>> +                    mid = generators.redundant(msg, body, lid, attachments)
>>>                  else:
>>> -                    # Or revert to the old way?
>>> -                    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
>>> +                    mid = generators.legacy(msg, body, lid, attachments)
>>>              except Exception as err:
>>>                  if logger:
>>>                      logger.warn("Could not generate MID: %s" % err)
>>>
>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/generators.py
>>> ----------------------------------------------------------------------
>>> diff --git a/tools/generators.py b/tools/generators.py
>>> new file mode 100644
>>> index 0000000..af566fc
>>> --- /dev/null
>>> +++ b/tools/generators.py
>>> @@ -0,0 +1,74 @@
>>> +#!/usr/bin/env/python3
>>> +# -*- coding: utf-8 -*-
>>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>>> +# contributor license agreements.  See the NOTICE file distributed with
>>> +# this work for additional information regarding copyright ownership.
>>> +# The ASF licenses this file to You under the Apache License, Version 2.0
>>> +# (the "License"); you may not use this file except in compliance with
>>> +# the License.  You may obtain a copy of the License at
>>> +#
>>> +#     http://www.apache.org/licenses/LICENSE-2.0
>>> +#
>>> +# Unless required by applicable law or agreed to in writing, software
>>> +# distributed under the License is distributed on an "AS IS" BASIS,
>>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>> +# See the License for the specific language governing permissions and
>>> +# limitations under the License.
>>> +
>>> +"""
>>> +This file contains the various ID generators for Pony Mail's archivers.
>>> +"""
>>> +
>>> +import hashlib
>>> +import email.utils
>>> +
>>> +# Full generator: uses the entire email (including sever-depenent data)
>>> +# This is the recommended generator for single-node setups.
>>> +def full(msg, body, lid, attachments):
>>> +    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
>>> +    return mid
>>> +
>>> +# Medium: Standard generator
>>> +def medium(msg, body, lid, attachments):
>>> +    # Use text body
>>> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>>> +    # Use List ID
>>> +    xbody += bytes(lid, encoding='ascii')
>>> +    # Use Date header
>>> +    xbody += bytes(mdatestring, encoding='ascii')
>>
>> mdatestring is not defined
>>
>>> +    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>>> +    return mid
>>> +
>>> +# Redundant: Use data that is guaranteed to be the same across redundant setups
>>> +# This is the recommended generator for redundant cluster setups
>>> +def redundant(msg, body, lid, attachments):
>>> +    # Use text body
>>> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>>> +    # Use List ID
>>> +    xbody += bytes(lid, encoding='ascii')
>>> +    # Use Date header
>>> +    xbody += bytes(mdatestring, encoding='ascii')
>>
>> mdatestring is not defined
>>
>>> +    # Use sender
>>> +    sender = msg.get('from', None)
>>> +    if sender:
>>> +        xbody += bytes(sender, encoding = 'ascii')
>>> +    # Use subject
>>> +    if subject:
>>> +        xbody += bytes(subject, encoding = 'ascii')
>>> +    # Use attachment hashes if present
>>> +    if attachments:
>>> +        for a in attachments:
>>> +            xbody += bytes(a['hash'], encoding = 'ascii')
>>> +    mid = "r%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>>> +    return mid
>>> +
>>> +
>>> +# Old school way of making IDs
>>> +def legacy(msg, body, lid, attachments):
>>> +    mdate = email.utils.parsedate_tz(msg.get('date'))
>>> +    uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is valid
>>> +    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
>>> +    return mid
>>> +
>>> +
>>> +
>>
>> Have the generators been tested?
>>
>

Re: [2/4] incubator-ponymail git commit: split generators into a file of its own

Posted by Daniel Gruno <hu...@apache.org>.
I missed a git add in the last commit, sorry. Will add and recommit now.

On 06/05/2017 01:57 AM, sebb wrote:
> On 4 June 2017 at 15:02,  <hu...@apache.org> wrote:
>> split generators into a file of its own
>>
>> Also fix up generators:
>> - medium goes back to the way it was
>> - a new 'redundant' generator for cluster setups
>>
>>
>> Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
>> Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/e2d81036
>> Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/e2d81036
>> Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/e2d81036
>>
>> Branch: refs/heads/master
>> Commit: e2d8103635db012d13fc6af46d336c96be31d4c1
>> Parents: 8b7ede8
>> Author: Daniel Gruno <hu...@apache.org>
>> Authored: Sun Jun 4 15:45:18 2017 +0200
>> Committer: Daniel Gruno <hu...@apache.org>
>> Committed: Sun Jun 4 15:45:18 2017 +0200
>>
>> ----------------------------------------------------------------------
>>  tools/archiver.py   | 17 ++++-------
>>  tools/generators.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 80 insertions(+), 11 deletions(-)
>> ----------------------------------------------------------------------
>>
>>
>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/archiver.py
>> ----------------------------------------------------------------------
>> diff --git a/tools/archiver.py b/tools/archiver.py
>> index 41933f7..0966b13 100755
>> --- a/tools/archiver.py
>> +++ b/tools/archiver.py
>> @@ -58,6 +58,7 @@ import io
>>  import logging
>>  import traceback
>>  import sys
>> +import generators
>>
>>  # Fetch config
>>  path = os.path.dirname(os.path.realpath(__file__))
>> @@ -316,20 +317,14 @@ class Archiver(object):
>>          if body is not None or attachments:
>>              pmid = mid
>>              try:
>> -                # Use full message as bytes for mid?
>>                  if archiver_generator == "full":
>> -                    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
>> +                    mid = generators.full(msg, body, lid, attachments)
>>                  elif archiver_generator == "medium":
>> -                    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>> -                    xbody += bytes(lid, encoding='ascii')
>> -                    xbody += bytes(mdatestring, encoding='ascii')
>> -                    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>> -                    if attachments:
>> -                        for a in attachments:
>> -                            xbody += bytes(a['hash'], encoding = 'ascii')
>> +                    mid = generators.medium(msg, body, lid, attachments)
>> +                elif archiver_generator == "redundant":
>> +                    mid = generators.redundant(msg, body, lid, attachments)
>>                  else:
>> -                    # Or revert to the old way?
>> -                    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
>> +                    mid = generators.legacy(msg, body, lid, attachments)
>>              except Exception as err:
>>                  if logger:
>>                      logger.warn("Could not generate MID: %s" % err)
>>
>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/generators.py
>> ----------------------------------------------------------------------
>> diff --git a/tools/generators.py b/tools/generators.py
>> new file mode 100644
>> index 0000000..af566fc
>> --- /dev/null
>> +++ b/tools/generators.py
>> @@ -0,0 +1,74 @@
>> +#!/usr/bin/env/python3
>> +# -*- coding: utf-8 -*-
>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>> +# contributor license agreements.  See the NOTICE file distributed with
>> +# this work for additional information regarding copyright ownership.
>> +# The ASF licenses this file to You under the Apache License, Version 2.0
>> +# (the "License"); you may not use this file except in compliance with
>> +# the License.  You may obtain a copy of the License at
>> +#
>> +#     http://www.apache.org/licenses/LICENSE-2.0
>> +#
>> +# Unless required by applicable law or agreed to in writing, software
>> +# distributed under the License is distributed on an "AS IS" BASIS,
>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> +# See the License for the specific language governing permissions and
>> +# limitations under the License.
>> +
>> +"""
>> +This file contains the various ID generators for Pony Mail's archivers.
>> +"""
>> +
>> +import hashlib
>> +import email.utils
>> +
>> +# Full generator: uses the entire email (including sever-depenent data)
>> +# This is the recommended generator for single-node setups.
>> +def full(msg, body, lid, attachments):
>> +    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
>> +    return mid
>> +
>> +# Medium: Standard generator
>> +def medium(msg, body, lid, attachments):
>> +    # Use text body
>> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>> +    # Use List ID
>> +    xbody += bytes(lid, encoding='ascii')
>> +    # Use Date header
>> +    xbody += bytes(mdatestring, encoding='ascii')
> 
> mdatestring is not defined
> 
>> +    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>> +    return mid
>> +
>> +# Redundant: Use data that is guaranteed to be the same across redundant setups
>> +# This is the recommended generator for redundant cluster setups
>> +def redundant(msg, body, lid, attachments):
>> +    # Use text body
>> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
>> +    # Use List ID
>> +    xbody += bytes(lid, encoding='ascii')
>> +    # Use Date header
>> +    xbody += bytes(mdatestring, encoding='ascii')
> 
> mdatestring is not defined
> 
>> +    # Use sender
>> +    sender = msg.get('from', None)
>> +    if sender:
>> +        xbody += bytes(sender, encoding = 'ascii')
>> +    # Use subject
>> +    if subject:
>> +        xbody += bytes(subject, encoding = 'ascii')
>> +    # Use attachment hashes if present
>> +    if attachments:
>> +        for a in attachments:
>> +            xbody += bytes(a['hash'], encoding = 'ascii')
>> +    mid = "r%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>> +    return mid
>> +
>> +
>> +# Old school way of making IDs
>> +def legacy(msg, body, lid, attachments):
>> +    mdate = email.utils.parsedate_tz(msg.get('date'))
>> +    uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is valid
>> +    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
>> +    return mid
>> +
>> +
>> +
> 
> Have the generators been tested?
> 


Re: [2/4] incubator-ponymail git commit: split generators into a file of its own

Posted by sebb <se...@gmail.com>.
On 4 June 2017 at 15:02,  <hu...@apache.org> wrote:
> split generators into a file of its own
>
> Also fix up generators:
> - medium goes back to the way it was
> - a new 'redundant' generator for cluster setups
>
>
> Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
> Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/e2d81036
> Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/e2d81036
> Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/e2d81036
>
> Branch: refs/heads/master
> Commit: e2d8103635db012d13fc6af46d336c96be31d4c1
> Parents: 8b7ede8
> Author: Daniel Gruno <hu...@apache.org>
> Authored: Sun Jun 4 15:45:18 2017 +0200
> Committer: Daniel Gruno <hu...@apache.org>
> Committed: Sun Jun 4 15:45:18 2017 +0200
>
> ----------------------------------------------------------------------
>  tools/archiver.py   | 17 ++++-------
>  tools/generators.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 80 insertions(+), 11 deletions(-)
> ----------------------------------------------------------------------
>
>
> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/archiver.py
> ----------------------------------------------------------------------
> diff --git a/tools/archiver.py b/tools/archiver.py
> index 41933f7..0966b13 100755
> --- a/tools/archiver.py
> +++ b/tools/archiver.py
> @@ -58,6 +58,7 @@ import io
>  import logging
>  import traceback
>  import sys
> +import generators
>
>  # Fetch config
>  path = os.path.dirname(os.path.realpath(__file__))
> @@ -316,20 +317,14 @@ class Archiver(object):
>          if body is not None or attachments:
>              pmid = mid
>              try:
> -                # Use full message as bytes for mid?
>                  if archiver_generator == "full":
> -                    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
> +                    mid = generators.full(msg, body, lid, attachments)
>                  elif archiver_generator == "medium":
> -                    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
> -                    xbody += bytes(lid, encoding='ascii')
> -                    xbody += bytes(mdatestring, encoding='ascii')
> -                    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
> -                    if attachments:
> -                        for a in attachments:
> -                            xbody += bytes(a['hash'], encoding = 'ascii')
> +                    mid = generators.medium(msg, body, lid, attachments)
> +                elif archiver_generator == "redundant":
> +                    mid = generators.redundant(msg, body, lid, attachments)
>                  else:
> -                    # Or revert to the old way?
> -                    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
> +                    mid = generators.legacy(msg, body, lid, attachments)
>              except Exception as err:
>                  if logger:
>                      logger.warn("Could not generate MID: %s" % err)
>
> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/generators.py
> ----------------------------------------------------------------------
> diff --git a/tools/generators.py b/tools/generators.py
> new file mode 100644
> index 0000000..af566fc
> --- /dev/null
> +++ b/tools/generators.py
> @@ -0,0 +1,74 @@
> +#!/usr/bin/env/python3
> +# -*- coding: utf-8 -*-
> +# Licensed to the Apache Software Foundation (ASF) under one or more
> +# contributor license agreements.  See the NOTICE file distributed with
> +# this work for additional information regarding copyright ownership.
> +# The ASF licenses this file to You under the Apache License, Version 2.0
> +# (the "License"); you may not use this file except in compliance with
> +# the License.  You may obtain a copy of the License at
> +#
> +#     http://www.apache.org/licenses/LICENSE-2.0
> +#
> +# Unless required by applicable law or agreed to in writing, software
> +# distributed under the License is distributed on an "AS IS" BASIS,
> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> +# See the License for the specific language governing permissions and
> +# limitations under the License.
> +
> +"""
> +This file contains the various ID generators for Pony Mail's archivers.
> +"""
> +
> +import hashlib
> +import email.utils
> +
> +# Full generator: uses the entire email (including sever-depenent data)
> +# This is the recommended generator for single-node setups.
> +def full(msg, body, lid, attachments):
> +    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
> +    return mid
> +
> +# Medium: Standard generator
> +def medium(msg, body, lid, attachments):
> +    # Use text body
> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
> +    # Use List ID
> +    xbody += bytes(lid, encoding='ascii')
> +    # Use Date header
> +    xbody += bytes(mdatestring, encoding='ascii')

mdatestring is not defined

> +    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
> +    return mid
> +
> +# Redundant: Use data that is guaranteed to be the same across redundant setups
> +# This is the recommended generator for redundant cluster setups
> +def redundant(msg, body, lid, attachments):
> +    # Use text body
> +    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
> +    # Use List ID
> +    xbody += bytes(lid, encoding='ascii')
> +    # Use Date header
> +    xbody += bytes(mdatestring, encoding='ascii')

mdatestring is not defined

> +    # Use sender
> +    sender = msg.get('from', None)
> +    if sender:
> +        xbody += bytes(sender, encoding = 'ascii')
> +    # Use subject
> +    if subject:
> +        xbody += bytes(subject, encoding = 'ascii')
> +    # Use attachment hashes if present
> +    if attachments:
> +        for a in attachments:
> +            xbody += bytes(a['hash'], encoding = 'ascii')
> +    mid = "r%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
> +    return mid
> +
> +
> +# Old school way of making IDs
> +def legacy(msg, body, lid, attachments):
> +    mdate = email.utils.parsedate_tz(msg.get('date'))
> +    uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is valid
> +    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
> +    return mid
> +
> +
> +

Have the generators been tested?

[2/4] incubator-ponymail git commit: split generators into a file of its own

Posted by hu...@apache.org.
split generators into a file of its own

Also fix up generators:
- medium goes back to the way it was
- a new 'redundant' generator for cluster setups


Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/e2d81036
Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/e2d81036
Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/e2d81036

Branch: refs/heads/master
Commit: e2d8103635db012d13fc6af46d336c96be31d4c1
Parents: 8b7ede8
Author: Daniel Gruno <hu...@apache.org>
Authored: Sun Jun 4 15:45:18 2017 +0200
Committer: Daniel Gruno <hu...@apache.org>
Committed: Sun Jun 4 15:45:18 2017 +0200

----------------------------------------------------------------------
 tools/archiver.py   | 17 ++++-------
 tools/generators.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/archiver.py
----------------------------------------------------------------------
diff --git a/tools/archiver.py b/tools/archiver.py
index 41933f7..0966b13 100755
--- a/tools/archiver.py
+++ b/tools/archiver.py
@@ -58,6 +58,7 @@ import io
 import logging
 import traceback
 import sys
+import generators
 
 # Fetch config
 path = os.path.dirname(os.path.realpath(__file__))
@@ -316,20 +317,14 @@ class Archiver(object):
         if body is not None or attachments:
             pmid = mid
             try:
-                # Use full message as bytes for mid?
                 if archiver_generator == "full":
-                    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
+                    mid = generators.full(msg, body, lid, attachments)
                 elif archiver_generator == "medium":
-                    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
-                    xbody += bytes(lid, encoding='ascii')
-                    xbody += bytes(mdatestring, encoding='ascii')
-                    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
-                    if attachments:
-                        for a in attachments:
-                            xbody += bytes(a['hash'], encoding = 'ascii')
+                    mid = generators.medium(msg, body, lid, attachments)
+                elif archiver_generator == "redundant":
+                    mid = generators.redundant(msg, body, lid, attachments)
                 else:
-                    # Or revert to the old way?
-                    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
+                    mid = generators.legacy(msg, body, lid, attachments)
             except Exception as err:
                 if logger:
                     logger.warn("Could not generate MID: %s" % err)

http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/generators.py
----------------------------------------------------------------------
diff --git a/tools/generators.py b/tools/generators.py
new file mode 100644
index 0000000..af566fc
--- /dev/null
+++ b/tools/generators.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env/python3
+# -*- coding: utf-8 -*-
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This file contains the various ID generators for Pony Mail's archivers.
+"""
+
+import hashlib
+import email.utils
+
+# Full generator: uses the entire email (including sever-depenent data)
+# This is the recommended generator for single-node setups.
+def full(msg, body, lid, attachments):
+    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
+    return mid
+
+# Medium: Standard generator
+def medium(msg, body, lid, attachments):
+    # Use text body
+    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
+    # Use List ID
+    xbody += bytes(lid, encoding='ascii')
+    # Use Date header
+    xbody += bytes(mdatestring, encoding='ascii')
+    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
+    return mid
+
+# Redundant: Use data that is guaranteed to be the same across redundant setups
+# This is the recommended generator for redundant cluster setups
+def redundant(msg, body, lid, attachments):
+    # Use text body
+    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
+    # Use List ID
+    xbody += bytes(lid, encoding='ascii')
+    # Use Date header
+    xbody += bytes(mdatestring, encoding='ascii')
+    # Use sender
+    sender = msg.get('from', None)
+    if sender:
+        xbody += bytes(sender, encoding = 'ascii')
+    # Use subject
+    if subject:
+        xbody += bytes(subject, encoding = 'ascii')
+    # Use attachment hashes if present
+    if attachments:
+        for a in attachments:
+            xbody += bytes(a['hash'], encoding = 'ascii')
+    mid = "r%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
+    return mid
+
+
+# Old school way of making IDs
+def legacy(msg, body, lid, attachments):
+    mdate = email.utils.parsedate_tz(msg.get('date'))
+    uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is valid
+    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
+    return mid
+
+
+    


[4/4] incubator-ponymail git commit: update chglog

Posted by hu...@apache.org.
update chglog


Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/c58b2312
Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/c58b2312
Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/c58b2312

Branch: refs/heads/master
Commit: c58b23127db123916ce169b6ee12990a71c6d66e
Parents: 23966d8
Author: Daniel Gruno <hu...@apache.org>
Authored: Sun Jun 4 15:59:49 2017 +0200
Committer: Daniel Gruno <hu...@apache.org>
Committed: Sun Jun 4 15:59:49 2017 +0200

----------------------------------------------------------------------
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/c58b2312/CHANGELOG.md
----------------------------------------------------------------------
diff --git a/CHANGELOG.md b/CHANGELOG.md
index be595de..bbc497f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,5 @@
 ## CHANGES in 0.10:
+- ID generators have now been split into a separate library (generators.py)
 - more comprehensive ID generation mechanisms
 - private messages are now included in downloads if the user has access to them (#169, #108)
 - mbox export now generates valid From_ line (#190)