You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@ponymail.apache.org by sebb <se...@gmail.com> on 2020/08/14 14:55:18 UTC

Re: [incubator-ponymail] branch master updated: Pull main operation into main(), linting/PEP conformity fixes

This commit broke the functioning of the --generator option for the archiver.

Please fix.

On Mon, 10 Aug 2020 at 17:13, <hu...@apache.org> wrote:
>
> This is an automated email from the ASF dual-hosted git repository.
>
> humbedooh pushed a commit to branch master
> in repository https://gitbox.apache.org/repos/asf/incubator-ponymail.git
>
>
> The following commit(s) were added to refs/heads/master by this push:
>      new 95beb51  Pull main operation into main(), linting/PEP conformity fixes
> 95beb51 is described below
>
> commit 95beb51158b58bcb9fdb1371af7699b72598ac34
> Author: Daniel Gruno <hu...@apache.org>
> AuthorDate: Mon Aug 10 18:13:05 2020 +0200
>
>     Pull main operation into main(), linting/PEP conformity fixes
>
>     Also prepping for adding ES 7.x support.
>     import-mbox will need to be updated shortly
> ---
>  tools/archiver.py | 123 +++++++++++++++++++++++++-----------------------------
>  1 file changed, 58 insertions(+), 65 deletions(-)
>
> diff --git a/tools/archiver.py b/tools/archiver.py
> index b97db76..2c50f19 100755
> --- a/tools/archiver.py
> +++ b/tools/archiver.py
> @@ -62,23 +62,19 @@ import uuid
>  import json
>  import certifi
>  import urllib.parse
> +import argparse
> +import netaddr
>
>  # Fetch config
>  config = PonymailConfig()
>  auth = None
> -parseHTML = False
> -iBody = None  # N.B. Also used by import-mbox.py
> -args=None
> -dumpDir = None
> -aURL = None
>
>  if config.has_section('mailman') and config.has_option('mailman', 'plugin'):
>      from zope.interface import implementer
>      from mailman.interfaces.archiver import IArchiver
>      from mailman.interfaces.archiver import ArchivePolicy
>      logger = logging.getLogger("mailman.archiver")
> -elif __name__ == '__main__':
> -    import argparse
> +
>
>  if config.has_option('archiver', 'baseurl'):
>      aURL = config.get('archiver', 'baseurl')
> @@ -102,8 +98,7 @@ def parse_attachment(part):
>          if cdtype == "attachment" or cdtype == 'inline':
>              fd = part.get_payload(decode=True)
>              # Allow for empty string
> -            if fd is None:
> -                return None, None
> +            if fd is None: return None, None
>              filename = part.get_filename()
>              if filename:
>                  print("Found attachment: %s" % filename)
> @@ -160,7 +155,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>
>      """ Intercept index calls and fix up consistency argument """
>      def index(self, **kwargs):
> -        if ES_MAJOR in [5,6]:
> +        if ES_MAJOR in [5,6,7]:
>              if kwargs.pop('consistency', None): # drop the key if present
>                  if self.wait_for_active_shards: # replace with wait if defined
>                      kwargs['wait_for_active_shards'] = self.wait_for_active_shards
> @@ -168,10 +163,11 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>              **kwargs
>          )
>
> -    def __init__(self, parseHTML=False):
> +    def __init__(self, generator='full', parse_html=False, dump_dir=None):
>          """ Just initialize ES. """
> -        self.html = parseHTML
> -        if parseHTML:
> +        self.html = parse_html
> +        self.generator = generator
> +        if parse_html:
>              import html2text
>              self.html2text = html2text.html2text
>          self.dbname = config.get("elasticsearch", "dbname")
> @@ -180,7 +176,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>          self.consistency = config.get('elasticsearch', 'write', fallback='quorum')
>          if ES_MAJOR == 2:
>              pass
> -        elif ES_MAJOR in [5,6]:
> +        elif ES_MAJOR in [5,6,7]:
>              self.wait_for_active_shards = config.get('elasticsearch', 'wait', fallback=1)
>          else:
>              raise Exception("Unexpected elasticsearch version ", ES_VERSION)
> @@ -209,14 +205,14 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>              }
>              )
>          # If we have a dump dir, we can risk failing the connection.
> -        if dumpDir:
> +        if dump_dir:
>              try:
>                  self.es = Elasticsearch(dbs,
>                      max_retries=5,
>                      retry_on_timeout=True
>                      )
>              except:
> -                print("ES connection failed, but dumponfail specified, dumping to %s" % dumpDir)
> +                print("ES connection failed, but dumponfail specified, dumping to %s" % dump_dir)
>          else:
>              self.es = Elasticsearch(dbs,
>                  max_retries=5,
> @@ -233,13 +229,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>                  contents[part_meta['hash']] = part_file
>          return attachments, contents
>
> -
> -    def msgbody(self, msg):
> +    def msgbody(self, msg, verbose=False, ignore_body=None):
>          body = None
>          firstHTML = None
>          for part in msg.walk():
>              # can be called from importer
> -            if args and args.verbose:
> +            if verbose:
>                  print("Content-Type: %s" % part.get_content_type())
>              """
>                  Find the first body part and the first HTML part
> @@ -251,12 +246,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>                  if not body and part.get_content_type() == 'text/enriched':
>                      body = part.get_payload(decode=True)
>                  elif self.html and not firstHTML and part.get_content_type() == 'text/html':
> -                    firstHTML = part.get_payload(decode=True)
> +                    first_html = part.get_payload(decode=True)
>              except Exception as err:
>                  print(err)
>
>          # this requires a GPL lib, user will have to install it themselves
> -        if firstHTML and (not body or len(body) <= 1 or (iBody and str(body).find(str(iBody)) != -1)):
> +        if firstHTML and (not body or len(body) <= 1 or (ignore_body and str(body).find(str(ignore_body)) != -1)):
>              body = self.html2text(firstHTML.decode("utf-8", 'ignore') if type(firstHTML) is bytes else firstHTML)
>
>          # See issue#463
> @@ -273,7 +268,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>          return body
>
>      # N.B. this is also called by import-mbox.py
> -    def compute_updates(self, lid, private, msg):
> +    def compute_updates(self, args, lid, private, msg):
>          """Determine what needs to be sent to the archiver.
>
>          :param lid: The list id
> @@ -324,7 +319,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>          # mdate calculations are all done, prepare the index entry
>          epoch = email.utils.mktime_tz(mdate)
>          mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))
> -        body = self.msgbody(msg)
> +        body = self.msgbody(msg, verbose=args.verbose, ignore_body=args.ibody)
>          try:
>              if 'content-type' in msg_metadata and msg_metadata['content-type'].find("flowed") != -1:
>                  body = convertToWrapped(body, character_set="utf-8")
> @@ -352,8 +347,9 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>              except Exception as err:
>                  if logger:
>                      # N.B. use .get just in case there is no message-id
> -                    logger.warning("Could not generate MID: %s. MSGID: %s", err, msg_metadata.get('message-id','?'))
> +                    logger.warning("Could not generate MID: %s. MSGID: %s", err, msg_metadata.get('message-id', '?'))
>                  mid = pmid
> +
>              if 'in-reply-to' in msg_metadata:
>                  try:
>                      try:
> @@ -383,7 +379,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>
>          return  ojson, contents, msg_metadata, irt
>
> -    def archive_message(self, mlist, msg, raw_msg):
> +    def archive_message(self, args, mlist, msg, raw_message):
>          """Send the message to the archiver.
>
>          :param mlist: The IMailingList object.
> @@ -402,7 +398,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>          elif hasattr(mlist, 'archive_policy') and mlist.archive_policy is not ArchivePolicy.public:
>              private = True
>
> -        ojson, contents, msg_metadata, irt = self.compute_updates(lid, private, msg)
> +        ojson, contents, msg_metadata, irt = self.compute_updates(args, lid, private, msg)
>          if not ojson:
>              _id = msg.get('message-id') or msg.get('Subject') or msg.get("Date")
>              raise Exception("Could not parse message %s for %s" % (_id,lid))
> @@ -438,23 +434,23 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>                  consistency = self.consistency,
>                  body = {
>                      "message-id": msg_metadata['message-id'],
> -                    "source": self.mbox_source(raw_msg)
> +                    "source": self.mbox_source(raw_message)
>                  }
>              )
>          # If we have a dump dir and ES failed, push to dump dir instead as a JSON object
>          # We'll leave it to another process to pick up the slack.
>          except Exception as err:
> -            if dumpDir:
> +            if args.dump:
>                  print("Pushing to ES failed, but dumponfail specified, dumping JSON docs")
>                  uid = uuid.uuid4()
> -                mboxPath = os.path.join(dumpDir, "%s.json" % uid)
> +                mboxPath = os.path.join(args.dump, "%s.json" % uid)
>                  with open(mboxPath, "w") as f:
>                      json.dump({
>                          'id': ojson['mid'],
>                          'mbox': ojson,
>                          'mbox_source': {
>                              "message-id": msg_metadata['message-id'],
> -                            "source": self.mbox_source(raw_msg)
> +                            "source": self.mbox_source(raw_message)
>                          },
>                          'attachments': contents
>                      },f , indent = 2)
> @@ -488,8 +484,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>              if dm:
>                  cid = dm.group(1)
>                  mid = dm.group(2)
> -                if self.es.exists(index = self.dbname, doc_type = 'account', id = cid):
> -                    doc = self.es.get(index = self.dbname, doc_type = 'account', id = cid)
> +                if self.es.exists(index=self.dbname, doc_type='account', id=cid):
> +                    doc = self.es.get(index=self.dbname, doc_type='account', id=cid)
>                      if doc:
>                          oldrefs.append(cid)
>                          # N.B. no index is supplied, so ES will generate one
> @@ -571,7 +567,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>          """
>          return None
>
> -if __name__ == '__main__':
> +
> +def main():
>      parser = argparse.ArgumentParser(description='Command line options.')
>      parser.add_argument('--lid', dest='lid', type=str, nargs=1,
>                         help='Alternate specific list ID')
> @@ -593,16 +590,15 @@ if __name__ == '__main__':
>                         help='Try to convert HTML to text if no text/plain message is found')
>      parser.add_argument('--dry', dest='dry', action='store_true',
>                         help='Do not save emails to elasticsearch, only test parsing')
> +    parser.add_argument('--ignorebody', dest='ibody', type=str, nargs=1,
> +                        help='Optional email bodies to treat as empty (in conjunction with --html2text)')
>      parser.add_argument('--dumponfail', dest='dump',
> -                       help='If pushing to ElasticSearch fails, dump documents in JSON format to this directory and fail silently.')
> +                       help='If pushing to ElasticSearch fails, dump documents in JSON format to this directory and '
> +                            'fail silently.')
>      parser.add_argument('--generator', dest='generator',
>                         help='Override the generator.')
>      args = parser.parse_args()
>
> -    if args.html2text:
> -        parseHTML = True
> -    if args.dump:
> -        dumpDir = args.dump
>      if args.verbose:
>          logging.basicConfig(stream=sys.stdout, level=logging.INFO)
>      else:
> @@ -610,39 +606,34 @@ if __name__ == '__main__':
>          # Also eliminates: 'Undecodable raw error response from server:' warning message
>          logging.getLogger("elasticsearch").setLevel(logging.ERROR)
>
> -    if args.generator:
> -        archiver_generator = args.generator
> -
> -    archie = Archiver(parseHTML = parseHTML)
> +    archie = Archiver(generator=args.generator or archiver_generator, parse_html=args.html2text)
>      # use binary input so parser can use appropriate charset
>      input_stream = sys.stdin.buffer
>
>      try:
> -        msgstring = input_stream.read()
> +        raw_message = input_stream.read()
>          try:
> -            msg = email.message_from_bytes(msgstring)
> +            msg = email.message_from_bytes(raw_message)
>          except Exception as err:
>              print("STDIN parser exception: %s" % err)
>
>          # We're reading from STDIN, so let's fake an MM3 call
>          ispublic = True
> -        ignorefrom = None
> -        allowfrom = None
>
>          if args.altheader:
> -            altheader = args.altheader[0]
> -            if altheader in msg:
> +            alt_header = args.altheader[0]
> +            if alt_header in msg:
>                  try:
> -                    msg.replace_header('List-ID', msg.get(altheader))
> +                    msg.replace_header('List-ID', msg.get(alt_header))
>                  except:
> -                    msg.add_header('list-id', msg.get(altheader))
> +                    msg.add_header('list-id', msg.get(alt_header))
>          elif 'altheader' in sys.argv:
> -            altheader = sys.argv[len(sys.argv)-1]
> -            if altheader in msg:
> +            alt_header = sys.argv[len(sys.argv)-1]
> +            if alt_header in msg:
>                  try:
> -                    msg.replace_header('List-ID', msg.get(altheader))
> +                    msg.replace_header('List-ID', msg.get(alt_header))
>                  except:
> -                    msg.add_header('list-id', msg.get(altheader))
> +                    msg.add_header('list-id', msg.get(alt_header))
>
>          # Set specific LID?
>          if args.lid and len(args.lid[0]) > 3:
> @@ -654,21 +645,21 @@ if __name__ == '__main__':
>
>          #Ignore based on --ignore flag?
>          if args.ignorefrom:
> -            ignorefrom = args.ignorefrom[0]
> -            if fnmatch.fnmatch(msg.get("from"), ignorefrom) or (msg.get("list-id") and fnmatch.fnmatch(msg.get("list-id"), ignorefrom)):
> +            ignore_from = args.ignorefrom[0]
> +            if fnmatch.fnmatch(msg.get("from"), ignore_from) or (msg.get("list-id") and fnmatch.fnmatch(msg.get("list-id"), ignore_from)):
>                  print("Ignoring message as instructed by --ignore flag")
>                  sys.exit(0)
>
>          # Check CIDR if need be
>          if args.allowfrom:
> -            from netaddr import IPNetwork, IPAddress
> -            c = IPNetwork(args.allowfrom[0])
> +
> +            c = netaddr.IPNetwork(args.allowfrom[0])
>              good = False
>              for line in msg.get_all('received') or []:
>                  m = re.search(r"from .+\[(.+)\]", line)
>                  if m:
>                      try:
> -                        ip = IPAddress(m.group(1))
> +                        ip = netaddr.IPAddress(m.group(1))
>                          if ip in c:
>                              good = True
>                              msg.add_header("ip-whitelisted", "yes")
> @@ -681,19 +672,18 @@ if __name__ == '__main__':
>          # Replace date header with $now?
>          if args.makedate:
>              msg.replace_header('date', email.utils.formatdate())
> -
> +        is_public = True
>          if args.private:
> -            ispublic = False
> +            is_public = False
>          if 'list-id' in msg:
>              if not msg.get('archived-at'):
>                  msg.add_header('archived-at', email.utils.formatdate())
> -            list_data = namedtuple('importmsg', ['list_id', 'archive_public'])(list_id = msg.get('list-id'), archive_public=ispublic)
> +            list_data = namedtuple('importmsg', ['list_id', 'archive_public'])(list_id=msg.get('list-id'),
> +                                                                               archive_public=is_public)
>
>              try:
> -                lid, mid = archie.archive_message(list_data, msg, msgstring)
> +                lid, mid = archie.archive_message(args, list_data, msg, raw_message)
>                  print("%s: Done archiving to %s as %s!" % (email.utils.formatdate(), lid, mid))
> -                if aURL:
> -                    print("Published as: %s/thread.html/%s" % (aURL, urllib.parse.quote(mid)))
>              except Exception as err:
>                  if args.verbose:
>                      traceback.print_exc()
> @@ -711,3 +701,6 @@ if __name__ == '__main__':
>              print("Could not parse email: %s (@ %s)" % (err, line))
>              sys.exit(-1)
>
> +
> +if __name__ == '__main__':
> +    main()
>

Re: [incubator-ponymail] branch master updated: Pull main operation into main(), linting/PEP conformity fixes

Posted by sebb <se...@gmail.com>.
On Sat, 15 Aug 2020 at 14:14, Daniel Gruno <hu...@apache.org> wrote:
>
> On 15/08/2020 14.50, sebb wrote:
> > There was another incompatible change:
> >
> > The Archiver __init__ method keyword parameter parse_html was renamed
> > to parseHtml.
> > This breaks compatibility with previous versions, making testing of
> > older versions much harder.
> > It's probably OK to add new keyword parameters, but please don't make
> > arbitrary changes to existing parameters.
>
> I think it's the other way around - parseHTML was renamed to parse_html.

Oops, yes it was the other way round.

> It wasn't completely arbitrary, it was to conform with PEP8 guidelines
> and generally harmonizing the way we type arguments in PM.
> I can get the test suite to work around this, not a problem.
> I'd much rather have a version-aware unit tester than having to stick to

OK, but it will need ongoing maintenance.

> the same parameters for all eternity, as that may hinder future
> development. One example being the DKIM PR where the raw message body
> now needs to get passed through (which arguably is much better than
> relying on .as_bytes()).
>
> >
> > On Fri, 14 Aug 2020 at 18:47, Daniel Gruno <hu...@apache.org> wrote:
> >>
> >> Apologies, should be in working condition again now.
> >>
> >> On 14/08/2020 16.55, sebb wrote:
> >>> This commit broke the functioning of the --generator option for the archiver.
> >>>
> >>> Please fix.
> >>>
> >>> On Mon, 10 Aug 2020 at 17:13, <hu...@apache.org> wrote:
> >>>>
> >>>> This is an automated email from the ASF dual-hosted git repository.
> >>>>
> >>>> humbedooh pushed a commit to branch master
> >>>> in repository https://gitbox.apache.org/repos/asf/incubator-ponymail.git
> >>>>
> >>>>
> >>>> The following commit(s) were added to refs/heads/master by this push:
> >>>>        new 95beb51  Pull main operation into main(), linting/PEP conformity fixes
> >>>> 95beb51 is described below
> >>>>
> >>>> commit 95beb51158b58bcb9fdb1371af7699b72598ac34
> >>>> Author: Daniel Gruno <hu...@apache.org>
> >>>> AuthorDate: Mon Aug 10 18:13:05 2020 +0200
> >>>>
> >>>>       Pull main operation into main(), linting/PEP conformity fixes
> >>>>
> >>>>       Also prepping for adding ES 7.x support.
> >>>>       import-mbox will need to be updated shortly
> >>>> ---
> >>>>    tools/archiver.py | 123 +++++++++++++++++++++++++-----------------------------
> >>>>    1 file changed, 58 insertions(+), 65 deletions(-)
> >>>>
> >>>> diff --git a/tools/archiver.py b/tools/archiver.py
> >>>> index b97db76..2c50f19 100755
> >>>> --- a/tools/archiver.py
> >>>> +++ b/tools/archiver.py
> >>>> @@ -62,23 +62,19 @@ import uuid
> >>>>    import json
> >>>>    import certifi
> >>>>    import urllib.parse
> >>>> +import argparse
> >>>> +import netaddr
> >>>>
> >>>>    # Fetch config
> >>>>    config = PonymailConfig()
> >>>>    auth = None
> >>>> -parseHTML = False
> >>>> -iBody = None  # N.B. Also used by import-mbox.py
> >>>> -args=None
> >>>> -dumpDir = None
> >>>> -aURL = None
> >>>>
> >>>>    if config.has_section('mailman') and config.has_option('mailman', 'plugin'):
> >>>>        from zope.interface import implementer
> >>>>        from mailman.interfaces.archiver import IArchiver
> >>>>        from mailman.interfaces.archiver import ArchivePolicy
> >>>>        logger = logging.getLogger("mailman.archiver")
> >>>> -elif __name__ == '__main__':
> >>>> -    import argparse
> >>>> +
> >>>>
> >>>>    if config.has_option('archiver', 'baseurl'):
> >>>>        aURL = config.get('archiver', 'baseurl')
> >>>> @@ -102,8 +98,7 @@ def parse_attachment(part):
> >>>>            if cdtype == "attachment" or cdtype == 'inline':
> >>>>                fd = part.get_payload(decode=True)
> >>>>                # Allow for empty string
> >>>> -            if fd is None:
> >>>> -                return None, None
> >>>> +            if fd is None: return None, None
> >>>>                filename = part.get_filename()
> >>>>                if filename:
> >>>>                    print("Found attachment: %s" % filename)
> >>>> @@ -160,7 +155,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>
> >>>>        """ Intercept index calls and fix up consistency argument """
> >>>>        def index(self, **kwargs):
> >>>> -        if ES_MAJOR in [5,6]:
> >>>> +        if ES_MAJOR in [5,6,7]:
> >>>>                if kwargs.pop('consistency', None): # drop the key if present
> >>>>                    if self.wait_for_active_shards: # replace with wait if defined
> >>>>                        kwargs['wait_for_active_shards'] = self.wait_for_active_shards
> >>>> @@ -168,10 +163,11 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>                **kwargs
> >>>>            )
> >>>>
> >>>> -    def __init__(self, parseHTML=False):
> >>>> +    def __init__(self, generator='full', parse_html=False, dump_dir=None):
> >>>>            """ Just initialize ES. """
> >>>> -        self.html = parseHTML
> >>>> -        if parseHTML:
> >>>> +        self.html = parse_html
> >>>> +        self.generator = generator
> >>>> +        if parse_html:
> >>>>                import html2text
> >>>>                self.html2text = html2text.html2text
> >>>>            self.dbname = config.get("elasticsearch", "dbname")
> >>>> @@ -180,7 +176,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>            self.consistency = config.get('elasticsearch', 'write', fallback='quorum')
> >>>>            if ES_MAJOR == 2:
> >>>>                pass
> >>>> -        elif ES_MAJOR in [5,6]:
> >>>> +        elif ES_MAJOR in [5,6,7]:
> >>>>                self.wait_for_active_shards = config.get('elasticsearch', 'wait', fallback=1)
> >>>>            else:
> >>>>                raise Exception("Unexpected elasticsearch version ", ES_VERSION)
> >>>> @@ -209,14 +205,14 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>                }
> >>>>                )
> >>>>            # If we have a dump dir, we can risk failing the connection.
> >>>> -        if dumpDir:
> >>>> +        if dump_dir:
> >>>>                try:
> >>>>                    self.es = Elasticsearch(dbs,
> >>>>                        max_retries=5,
> >>>>                        retry_on_timeout=True
> >>>>                        )
> >>>>                except:
> >>>> -                print("ES connection failed, but dumponfail specified, dumping to %s" % dumpDir)
> >>>> +                print("ES connection failed, but dumponfail specified, dumping to %s" % dump_dir)
> >>>>            else:
> >>>>                self.es = Elasticsearch(dbs,
> >>>>                    max_retries=5,
> >>>> @@ -233,13 +229,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>                    contents[part_meta['hash']] = part_file
> >>>>            return attachments, contents
> >>>>
> >>>> -
> >>>> -    def msgbody(self, msg):
> >>>> +    def msgbody(self, msg, verbose=False, ignore_body=None):
> >>>>            body = None
> >>>>            firstHTML = None
> >>>>            for part in msg.walk():
> >>>>                # can be called from importer
> >>>> -            if args and args.verbose:
> >>>> +            if verbose:
> >>>>                    print("Content-Type: %s" % part.get_content_type())
> >>>>                """
> >>>>                    Find the first body part and the first HTML part
> >>>> @@ -251,12 +246,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>                    if not body and part.get_content_type() == 'text/enriched':
> >>>>                        body = part.get_payload(decode=True)
> >>>>                    elif self.html and not firstHTML and part.get_content_type() == 'text/html':
> >>>> -                    firstHTML = part.get_payload(decode=True)
> >>>> +                    first_html = part.get_payload(decode=True)
> >>>>                except Exception as err:
> >>>>                    print(err)
> >>>>
> >>>>            # this requires a GPL lib, user will have to install it themselves
> >>>> -        if firstHTML and (not body or len(body) <= 1 or (iBody and str(body).find(str(iBody)) != -1)):
> >>>> +        if firstHTML and (not body or len(body) <= 1 or (ignore_body and str(body).find(str(ignore_body)) != -1)):
> >>>>                body = self.html2text(firstHTML.decode("utf-8", 'ignore') if type(firstHTML) is bytes else firstHTML)
> >>>>
> >>>>            # See issue#463
> >>>> @@ -273,7 +268,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>            return body
> >>>>
> >>>>        # N.B. this is also called by import-mbox.py
> >>>> -    def compute_updates(self, lid, private, msg):
> >>>> +    def compute_updates(self, args, lid, private, msg):
> >>>>            """Determine what needs to be sent to the archiver.
> >>>>
> >>>>            :param lid: The list id
> >>>> @@ -324,7 +319,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>            # mdate calculations are all done, prepare the index entry
> >>>>            epoch = email.utils.mktime_tz(mdate)
> >>>>            mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))
> >>>> -        body = self.msgbody(msg)
> >>>> +        body = self.msgbody(msg, verbose=args.verbose, ignore_body=args.ibody)
> >>>>            try:
> >>>>                if 'content-type' in msg_metadata and msg_metadata['content-type'].find("flowed") != -1:
> >>>>                    body = convertToWrapped(body, character_set="utf-8")
> >>>> @@ -352,8 +347,9 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>                except Exception as err:
> >>>>                    if logger:
> >>>>                        # N.B. use .get just in case there is no message-id
> >>>> -                    logger.warning("Could not generate MID: %s. MSGID: %s", err, msg_metadata.get('message-id','?'))
> >>>> +                    logger.warning("Could not generate MID: %s. MSGID: %s", err, msg_metadata.get('message-id', '?'))
> >>>>                    mid = pmid
> >>>> +
> >>>>                if 'in-reply-to' in msg_metadata:
> >>>>                    try:
> >>>>                        try:
> >>>> @@ -383,7 +379,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>
> >>>>            return  ojson, contents, msg_metadata, irt
> >>>>
> >>>> -    def archive_message(self, mlist, msg, raw_msg):
> >>>> +    def archive_message(self, args, mlist, msg, raw_message):
> >>>>            """Send the message to the archiver.
> >>>>
> >>>>            :param mlist: The IMailingList object.
> >>>> @@ -402,7 +398,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>            elif hasattr(mlist, 'archive_policy') and mlist.archive_policy is not ArchivePolicy.public:
> >>>>                private = True
> >>>>
> >>>> -        ojson, contents, msg_metadata, irt = self.compute_updates(lid, private, msg)
> >>>> +        ojson, contents, msg_metadata, irt = self.compute_updates(args, lid, private, msg)
> >>>>            if not ojson:
> >>>>                _id = msg.get('message-id') or msg.get('Subject') or msg.get("Date")
> >>>>                raise Exception("Could not parse message %s for %s" % (_id,lid))
> >>>> @@ -438,23 +434,23 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>                    consistency = self.consistency,
> >>>>                    body = {
> >>>>                        "message-id": msg_metadata['message-id'],
> >>>> -                    "source": self.mbox_source(raw_msg)
> >>>> +                    "source": self.mbox_source(raw_message)
> >>>>                    }
> >>>>                )
> >>>>            # If we have a dump dir and ES failed, push to dump dir instead as a JSON object
> >>>>            # We'll leave it to another process to pick up the slack.
> >>>>            except Exception as err:
> >>>> -            if dumpDir:
> >>>> +            if args.dump:
> >>>>                    print("Pushing to ES failed, but dumponfail specified, dumping JSON docs")
> >>>>                    uid = uuid.uuid4()
> >>>> -                mboxPath = os.path.join(dumpDir, "%s.json" % uid)
> >>>> +                mboxPath = os.path.join(args.dump, "%s.json" % uid)
> >>>>                    with open(mboxPath, "w") as f:
> >>>>                        json.dump({
> >>>>                            'id': ojson['mid'],
> >>>>                            'mbox': ojson,
> >>>>                            'mbox_source': {
> >>>>                                "message-id": msg_metadata['message-id'],
> >>>> -                            "source": self.mbox_source(raw_msg)
> >>>> +                            "source": self.mbox_source(raw_message)
> >>>>                            },
> >>>>                            'attachments': contents
> >>>>                        },f , indent = 2)
> >>>> @@ -488,8 +484,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>                if dm:
> >>>>                    cid = dm.group(1)
> >>>>                    mid = dm.group(2)
> >>>> -                if self.es.exists(index = self.dbname, doc_type = 'account', id = cid):
> >>>> -                    doc = self.es.get(index = self.dbname, doc_type = 'account', id = cid)
> >>>> +                if self.es.exists(index=self.dbname, doc_type='account', id=cid):
> >>>> +                    doc = self.es.get(index=self.dbname, doc_type='account', id=cid)
> >>>>                        if doc:
> >>>>                            oldrefs.append(cid)
> >>>>                            # N.B. no index is supplied, so ES will generate one
> >>>> @@ -571,7 +567,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>            """
> >>>>            return None
> >>>>
> >>>> -if __name__ == '__main__':
> >>>> +
> >>>> +def main():
> >>>>        parser = argparse.ArgumentParser(description='Command line options.')
> >>>>        parser.add_argument('--lid', dest='lid', type=str, nargs=1,
> >>>>                           help='Alternate specific list ID')
> >>>> @@ -593,16 +590,15 @@ if __name__ == '__main__':
> >>>>                           help='Try to convert HTML to text if no text/plain message is found')
> >>>>        parser.add_argument('--dry', dest='dry', action='store_true',
> >>>>                           help='Do not save emails to elasticsearch, only test parsing')
> >>>> +    parser.add_argument('--ignorebody', dest='ibody', type=str, nargs=1,
> >>>> +                        help='Optional email bodies to treat as empty (in conjunction with --html2text)')
> >>>>        parser.add_argument('--dumponfail', dest='dump',
> >>>> -                       help='If pushing to ElasticSearch fails, dump documents in JSON format to this directory and fail silently.')
> >>>> +                       help='If pushing to ElasticSearch fails, dump documents in JSON format to this directory and '
> >>>> +                            'fail silently.')
> >>>>        parser.add_argument('--generator', dest='generator',
> >>>>                           help='Override the generator.')
> >>>>        args = parser.parse_args()
> >>>>
> >>>> -    if args.html2text:
> >>>> -        parseHTML = True
> >>>> -    if args.dump:
> >>>> -        dumpDir = args.dump
> >>>>        if args.verbose:
> >>>>            logging.basicConfig(stream=sys.stdout, level=logging.INFO)
> >>>>        else:
> >>>> @@ -610,39 +606,34 @@ if __name__ == '__main__':
> >>>>            # Also eliminates: 'Undecodable raw error response from server:' warning message
> >>>>            logging.getLogger("elasticsearch").setLevel(logging.ERROR)
> >>>>
> >>>> -    if args.generator:
> >>>> -        archiver_generator = args.generator
> >>>> -
> >>>> -    archie = Archiver(parseHTML = parseHTML)
> >>>> +    archie = Archiver(generator=args.generator or archiver_generator, parse_html=args.html2text)
> >>>>        # use binary input so parser can use appropriate charset
> >>>>        input_stream = sys.stdin.buffer
> >>>>
> >>>>        try:
> >>>> -        msgstring = input_stream.read()
> >>>> +        raw_message = input_stream.read()
> >>>>            try:
> >>>> -            msg = email.message_from_bytes(msgstring)
> >>>> +            msg = email.message_from_bytes(raw_message)
> >>>>            except Exception as err:
> >>>>                print("STDIN parser exception: %s" % err)
> >>>>
> >>>>            # We're reading from STDIN, so let's fake an MM3 call
> >>>>            ispublic = True
> >>>> -        ignorefrom = None
> >>>> -        allowfrom = None
> >>>>
> >>>>            if args.altheader:
> >>>> -            altheader = args.altheader[0]
> >>>> -            if altheader in msg:
> >>>> +            alt_header = args.altheader[0]
> >>>> +            if alt_header in msg:
> >>>>                    try:
> >>>> -                    msg.replace_header('List-ID', msg.get(altheader))
> >>>> +                    msg.replace_header('List-ID', msg.get(alt_header))
> >>>>                    except:
> >>>> -                    msg.add_header('list-id', msg.get(altheader))
> >>>> +                    msg.add_header('list-id', msg.get(alt_header))
> >>>>            elif 'altheader' in sys.argv:
> >>>> -            altheader = sys.argv[len(sys.argv)-1]
> >>>> -            if altheader in msg:
> >>>> +            alt_header = sys.argv[len(sys.argv)-1]
> >>>> +            if alt_header in msg:
> >>>>                    try:
> >>>> -                    msg.replace_header('List-ID', msg.get(altheader))
> >>>> +                    msg.replace_header('List-ID', msg.get(alt_header))
> >>>>                    except:
> >>>> -                    msg.add_header('list-id', msg.get(altheader))
> >>>> +                    msg.add_header('list-id', msg.get(alt_header))
> >>>>
> >>>>            # Set specific LID?
> >>>>            if args.lid and len(args.lid[0]) > 3:
> >>>> @@ -654,21 +645,21 @@ if __name__ == '__main__':
> >>>>
> >>>>            #Ignore based on --ignore flag?
> >>>>            if args.ignorefrom:
> >>>> -            ignorefrom = args.ignorefrom[0]
> >>>> -            if fnmatch.fnmatch(msg.get("from"), ignorefrom) or (msg.get("list-id") and fnmatch.fnmatch(msg.get("list-id"), ignorefrom)):
> >>>> +            ignore_from = args.ignorefrom[0]
> >>>> +            if fnmatch.fnmatch(msg.get("from"), ignore_from) or (msg.get("list-id") and fnmatch.fnmatch(msg.get("list-id"), ignore_from)):
> >>>>                    print("Ignoring message as instructed by --ignore flag")
> >>>>                    sys.exit(0)
> >>>>
> >>>>            # Check CIDR if need be
> >>>>            if args.allowfrom:
> >>>> -            from netaddr import IPNetwork, IPAddress
> >>>> -            c = IPNetwork(args.allowfrom[0])
> >>>> +
> >>>> +            c = netaddr.IPNetwork(args.allowfrom[0])
> >>>>                good = False
> >>>>                for line in msg.get_all('received') or []:
> >>>>                    m = re.search(r"from .+\[(.+)\]", line)
> >>>>                    if m:
> >>>>                        try:
> >>>> -                        ip = IPAddress(m.group(1))
> >>>> +                        ip = netaddr.IPAddress(m.group(1))
> >>>>                            if ip in c:
> >>>>                                good = True
> >>>>                                msg.add_header("ip-whitelisted", "yes")
> >>>> @@ -681,19 +672,18 @@ if __name__ == '__main__':
> >>>>            # Replace date header with $now?
> >>>>            if args.makedate:
> >>>>                msg.replace_header('date', email.utils.formatdate())
> >>>> -
> >>>> +        is_public = True
> >>>>            if args.private:
> >>>> -            ispublic = False
> >>>> +            is_public = False
> >>>>            if 'list-id' in msg:
> >>>>                if not msg.get('archived-at'):
> >>>>                    msg.add_header('archived-at', email.utils.formatdate())
> >>>> -            list_data = namedtuple('importmsg', ['list_id', 'archive_public'])(list_id = msg.get('list-id'), archive_public=ispublic)
> >>>> +            list_data = namedtuple('importmsg', ['list_id', 'archive_public'])(list_id=msg.get('list-id'),
> >>>> +                                                                               archive_public=is_public)
> >>>>
> >>>>                try:
> >>>> -                lid, mid = archie.archive_message(list_data, msg, msgstring)
> >>>> +                lid, mid = archie.archive_message(args, list_data, msg, raw_message)
> >>>>                    print("%s: Done archiving to %s as %s!" % (email.utils.formatdate(), lid, mid))
> >>>> -                if aURL:
> >>>> -                    print("Published as: %s/thread.html/%s" % (aURL, urllib.parse.quote(mid)))
> >>>>                except Exception as err:
> >>>>                    if args.verbose:
> >>>>                        traceback.print_exc()
> >>>> @@ -711,3 +701,6 @@ if __name__ == '__main__':
> >>>>                print("Could not parse email: %s (@ %s)" % (err, line))
> >>>>                sys.exit(-1)
> >>>>
> >>>> +
> >>>> +if __name__ == '__main__':
> >>>> +    main()
> >>>>
> >>
>

Re: [incubator-ponymail] branch master updated: Pull main operation into main(), linting/PEP conformity fixes

Posted by Daniel Gruno <hu...@apache.org>.
On 15/08/2020 14.50, sebb wrote:
> There was another incompatible change:
> 
> The Archiver __init__ method keyword parameter parse_html was renamed
> to parseHtml.
> This breaks compatibility with previous versions, making testing of
> older versions much harder.
> It's probably OK to add new keyword parameters, but please don't make
> arbitrary changes to existing parameters.

I think it's the other way around - parseHTML was renamed to parse_html.
It wasn't completely arbitrary, it was to conform with PEP8 guidelines 
and generally harmonizing the way we type arguments in PM.

I can get the test suite to work around this, not a problem.
I'd much rather have a version-aware unit tester than having to stick to 
the same parameters for all eternity, as that may hinder future 
development. One example being the DKIM PR where the raw message body 
now needs to get passed through (which arguably is much better than 
relying on .as_bytes()).

> 
> On Fri, 14 Aug 2020 at 18:47, Daniel Gruno <hu...@apache.org> wrote:
>>
>> Apologies, should be in working condition again now.
>>
>> On 14/08/2020 16.55, sebb wrote:
>>> This commit broke the functioning of the --generator option for the archiver.
>>>
>>> Please fix.
>>>
>>> On Mon, 10 Aug 2020 at 17:13, <hu...@apache.org> wrote:
>>>>
>>>> This is an automated email from the ASF dual-hosted git repository.
>>>>
>>>> humbedooh pushed a commit to branch master
>>>> in repository https://gitbox.apache.org/repos/asf/incubator-ponymail.git
>>>>
>>>>
>>>> The following commit(s) were added to refs/heads/master by this push:
>>>>        new 95beb51  Pull main operation into main(), linting/PEP conformity fixes
>>>> 95beb51 is described below
>>>>
>>>> commit 95beb51158b58bcb9fdb1371af7699b72598ac34
>>>> Author: Daniel Gruno <hu...@apache.org>
>>>> AuthorDate: Mon Aug 10 18:13:05 2020 +0200
>>>>
>>>>       Pull main operation into main(), linting/PEP conformity fixes
>>>>
>>>>       Also prepping for adding ES 7.x support.
>>>>       import-mbox will need to be updated shortly
>>>> ---
>>>>    tools/archiver.py | 123 +++++++++++++++++++++++++-----------------------------
>>>>    1 file changed, 58 insertions(+), 65 deletions(-)
>>>>
>>>> diff --git a/tools/archiver.py b/tools/archiver.py
>>>> index b97db76..2c50f19 100755
>>>> --- a/tools/archiver.py
>>>> +++ b/tools/archiver.py
>>>> @@ -62,23 +62,19 @@ import uuid
>>>>    import json
>>>>    import certifi
>>>>    import urllib.parse
>>>> +import argparse
>>>> +import netaddr
>>>>
>>>>    # Fetch config
>>>>    config = PonymailConfig()
>>>>    auth = None
>>>> -parseHTML = False
>>>> -iBody = None  # N.B. Also used by import-mbox.py
>>>> -args=None
>>>> -dumpDir = None
>>>> -aURL = None
>>>>
>>>>    if config.has_section('mailman') and config.has_option('mailman', 'plugin'):
>>>>        from zope.interface import implementer
>>>>        from mailman.interfaces.archiver import IArchiver
>>>>        from mailman.interfaces.archiver import ArchivePolicy
>>>>        logger = logging.getLogger("mailman.archiver")
>>>> -elif __name__ == '__main__':
>>>> -    import argparse
>>>> +
>>>>
>>>>    if config.has_option('archiver', 'baseurl'):
>>>>        aURL = config.get('archiver', 'baseurl')
>>>> @@ -102,8 +98,7 @@ def parse_attachment(part):
>>>>            if cdtype == "attachment" or cdtype == 'inline':
>>>>                fd = part.get_payload(decode=True)
>>>>                # Allow for empty string
>>>> -            if fd is None:
>>>> -                return None, None
>>>> +            if fd is None: return None, None
>>>>                filename = part.get_filename()
>>>>                if filename:
>>>>                    print("Found attachment: %s" % filename)
>>>> @@ -160,7 +155,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>>>
>>>>        """ Intercept index calls and fix up consistency argument """
>>>>        def index(self, **kwargs):
>>>> -        if ES_MAJOR in [5,6]:
>>>> +        if ES_MAJOR in [5,6,7]:
>>>>                if kwargs.pop('consistency', None): # drop the key if present
>>>>                    if self.wait_for_active_shards: # replace with wait if defined
>>>>                        kwargs['wait_for_active_shards'] = self.wait_for_active_shards
>>>> @@ -168,10 +163,11 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>>>                **kwargs
>>>>            )
>>>>
>>>> -    def __init__(self, parseHTML=False):
>>>> +    def __init__(self, generator='full', parse_html=False, dump_dir=None):
>>>>            """ Just initialize ES. """
>>>> -        self.html = parseHTML
>>>> -        if parseHTML:
>>>> +        self.html = parse_html
>>>> +        self.generator = generator
>>>> +        if parse_html:
>>>>                import html2text
>>>>                self.html2text = html2text.html2text
>>>>            self.dbname = config.get("elasticsearch", "dbname")
>>>> @@ -180,7 +176,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>>>            self.consistency = config.get('elasticsearch', 'write', fallback='quorum')
>>>>            if ES_MAJOR == 2:
>>>>                pass
>>>> -        elif ES_MAJOR in [5,6]:
>>>> +        elif ES_MAJOR in [5,6,7]:
>>>>                self.wait_for_active_shards = config.get('elasticsearch', 'wait', fallback=1)
>>>>            else:
>>>>                raise Exception("Unexpected elasticsearch version ", ES_VERSION)
>>>> @@ -209,14 +205,14 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>>>                }
>>>>                )
>>>>            # If we have a dump dir, we can risk failing the connection.
>>>> -        if dumpDir:
>>>> +        if dump_dir:
>>>>                try:
>>>>                    self.es = Elasticsearch(dbs,
>>>>                        max_retries=5,
>>>>                        retry_on_timeout=True
>>>>                        )
>>>>                except:
>>>> -                print("ES connection failed, but dumponfail specified, dumping to %s" % dumpDir)
>>>> +                print("ES connection failed, but dumponfail specified, dumping to %s" % dump_dir)
>>>>            else:
>>>>                self.es = Elasticsearch(dbs,
>>>>                    max_retries=5,
>>>> @@ -233,13 +229,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>>>                    contents[part_meta['hash']] = part_file
>>>>            return attachments, contents
>>>>
>>>> -
>>>> -    def msgbody(self, msg):
>>>> +    def msgbody(self, msg, verbose=False, ignore_body=None):
>>>>            body = None
>>>>            firstHTML = None
>>>>            for part in msg.walk():
>>>>                # can be called from importer
>>>> -            if args and args.verbose:
>>>> +            if verbose:
>>>>                    print("Content-Type: %s" % part.get_content_type())
>>>>                """
>>>>                    Find the first body part and the first HTML part
>>>> @@ -251,12 +246,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>>>                    if not body and part.get_content_type() == 'text/enriched':
>>>>                        body = part.get_payload(decode=True)
>>>>                    elif self.html and not firstHTML and part.get_content_type() == 'text/html':
>>>> -                    firstHTML = part.get_payload(decode=True)
>>>> +                    first_html = part.get_payload(decode=True)
>>>>                except Exception as err:
>>>>                    print(err)
>>>>
>>>>            # this requires a GPL lib, user will have to install it themselves
>>>> -        if firstHTML and (not body or len(body) <= 1 or (iBody and str(body).find(str(iBody)) != -1)):
>>>> +        if firstHTML and (not body or len(body) <= 1 or (ignore_body and str(body).find(str(ignore_body)) != -1)):
>>>>                body = self.html2text(firstHTML.decode("utf-8", 'ignore') if type(firstHTML) is bytes else firstHTML)
>>>>
>>>>            # See issue#463
>>>> @@ -273,7 +268,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>>>            return body
>>>>
>>>>        # N.B. this is also called by import-mbox.py
>>>> -    def compute_updates(self, lid, private, msg):
>>>> +    def compute_updates(self, args, lid, private, msg):
>>>>            """Determine what needs to be sent to the archiver.
>>>>
>>>>            :param lid: The list id
>>>> @@ -324,7 +319,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>>>            # mdate calculations are all done, prepare the index entry
>>>>            epoch = email.utils.mktime_tz(mdate)
>>>>            mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))
>>>> -        body = self.msgbody(msg)
>>>> +        body = self.msgbody(msg, verbose=args.verbose, ignore_body=args.ibody)
>>>>            try:
>>>>                if 'content-type' in msg_metadata and msg_metadata['content-type'].find("flowed") != -1:
>>>>                    body = convertToWrapped(body, character_set="utf-8")
>>>> @@ -352,8 +347,9 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>>>                except Exception as err:
>>>>                    if logger:
>>>>                        # N.B. use .get just in case there is no message-id
>>>> -                    logger.warning("Could not generate MID: %s. MSGID: %s", err, msg_metadata.get('message-id','?'))
>>>> +                    logger.warning("Could not generate MID: %s. MSGID: %s", err, msg_metadata.get('message-id', '?'))
>>>>                    mid = pmid
>>>> +
>>>>                if 'in-reply-to' in msg_metadata:
>>>>                    try:
>>>>                        try:
>>>> @@ -383,7 +379,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>>>
>>>>            return  ojson, contents, msg_metadata, irt
>>>>
>>>> -    def archive_message(self, mlist, msg, raw_msg):
>>>> +    def archive_message(self, args, mlist, msg, raw_message):
>>>>            """Send the message to the archiver.
>>>>
>>>>            :param mlist: The IMailingList object.
>>>> @@ -402,7 +398,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>>>            elif hasattr(mlist, 'archive_policy') and mlist.archive_policy is not ArchivePolicy.public:
>>>>                private = True
>>>>
>>>> -        ojson, contents, msg_metadata, irt = self.compute_updates(lid, private, msg)
>>>> +        ojson, contents, msg_metadata, irt = self.compute_updates(args, lid, private, msg)
>>>>            if not ojson:
>>>>                _id = msg.get('message-id') or msg.get('Subject') or msg.get("Date")
>>>>                raise Exception("Could not parse message %s for %s" % (_id,lid))
>>>> @@ -438,23 +434,23 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>>>                    consistency = self.consistency,
>>>>                    body = {
>>>>                        "message-id": msg_metadata['message-id'],
>>>> -                    "source": self.mbox_source(raw_msg)
>>>> +                    "source": self.mbox_source(raw_message)
>>>>                    }
>>>>                )
>>>>            # If we have a dump dir and ES failed, push to dump dir instead as a JSON object
>>>>            # We'll leave it to another process to pick up the slack.
>>>>            except Exception as err:
>>>> -            if dumpDir:
>>>> +            if args.dump:
>>>>                    print("Pushing to ES failed, but dumponfail specified, dumping JSON docs")
>>>>                    uid = uuid.uuid4()
>>>> -                mboxPath = os.path.join(dumpDir, "%s.json" % uid)
>>>> +                mboxPath = os.path.join(args.dump, "%s.json" % uid)
>>>>                    with open(mboxPath, "w") as f:
>>>>                        json.dump({
>>>>                            'id': ojson['mid'],
>>>>                            'mbox': ojson,
>>>>                            'mbox_source': {
>>>>                                "message-id": msg_metadata['message-id'],
>>>> -                            "source": self.mbox_source(raw_msg)
>>>> +                            "source": self.mbox_source(raw_message)
>>>>                            },
>>>>                            'attachments': contents
>>>>                        },f , indent = 2)
>>>> @@ -488,8 +484,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>>>                if dm:
>>>>                    cid = dm.group(1)
>>>>                    mid = dm.group(2)
>>>> -                if self.es.exists(index = self.dbname, doc_type = 'account', id = cid):
>>>> -                    doc = self.es.get(index = self.dbname, doc_type = 'account', id = cid)
>>>> +                if self.es.exists(index=self.dbname, doc_type='account', id=cid):
>>>> +                    doc = self.es.get(index=self.dbname, doc_type='account', id=cid)
>>>>                        if doc:
>>>>                            oldrefs.append(cid)
>>>>                            # N.B. no index is supplied, so ES will generate one
>>>> @@ -571,7 +567,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>>>            """
>>>>            return None
>>>>
>>>> -if __name__ == '__main__':
>>>> +
>>>> +def main():
>>>>        parser = argparse.ArgumentParser(description='Command line options.')
>>>>        parser.add_argument('--lid', dest='lid', type=str, nargs=1,
>>>>                           help='Alternate specific list ID')
>>>> @@ -593,16 +590,15 @@ if __name__ == '__main__':
>>>>                           help='Try to convert HTML to text if no text/plain message is found')
>>>>        parser.add_argument('--dry', dest='dry', action='store_true',
>>>>                           help='Do not save emails to elasticsearch, only test parsing')
>>>> +    parser.add_argument('--ignorebody', dest='ibody', type=str, nargs=1,
>>>> +                        help='Optional email bodies to treat as empty (in conjunction with --html2text)')
>>>>        parser.add_argument('--dumponfail', dest='dump',
>>>> -                       help='If pushing to ElasticSearch fails, dump documents in JSON format to this directory and fail silently.')
>>>> +                       help='If pushing to ElasticSearch fails, dump documents in JSON format to this directory and '
>>>> +                            'fail silently.')
>>>>        parser.add_argument('--generator', dest='generator',
>>>>                           help='Override the generator.')
>>>>        args = parser.parse_args()
>>>>
>>>> -    if args.html2text:
>>>> -        parseHTML = True
>>>> -    if args.dump:
>>>> -        dumpDir = args.dump
>>>>        if args.verbose:
>>>>            logging.basicConfig(stream=sys.stdout, level=logging.INFO)
>>>>        else:
>>>> @@ -610,39 +606,34 @@ if __name__ == '__main__':
>>>>            # Also eliminates: 'Undecodable raw error response from server:' warning message
>>>>            logging.getLogger("elasticsearch").setLevel(logging.ERROR)
>>>>
>>>> -    if args.generator:
>>>> -        archiver_generator = args.generator
>>>> -
>>>> -    archie = Archiver(parseHTML = parseHTML)
>>>> +    archie = Archiver(generator=args.generator or archiver_generator, parse_html=args.html2text)
>>>>        # use binary input so parser can use appropriate charset
>>>>        input_stream = sys.stdin.buffer
>>>>
>>>>        try:
>>>> -        msgstring = input_stream.read()
>>>> +        raw_message = input_stream.read()
>>>>            try:
>>>> -            msg = email.message_from_bytes(msgstring)
>>>> +            msg = email.message_from_bytes(raw_message)
>>>>            except Exception as err:
>>>>                print("STDIN parser exception: %s" % err)
>>>>
>>>>            # We're reading from STDIN, so let's fake an MM3 call
>>>>            ispublic = True
>>>> -        ignorefrom = None
>>>> -        allowfrom = None
>>>>
>>>>            if args.altheader:
>>>> -            altheader = args.altheader[0]
>>>> -            if altheader in msg:
>>>> +            alt_header = args.altheader[0]
>>>> +            if alt_header in msg:
>>>>                    try:
>>>> -                    msg.replace_header('List-ID', msg.get(altheader))
>>>> +                    msg.replace_header('List-ID', msg.get(alt_header))
>>>>                    except:
>>>> -                    msg.add_header('list-id', msg.get(altheader))
>>>> +                    msg.add_header('list-id', msg.get(alt_header))
>>>>            elif 'altheader' in sys.argv:
>>>> -            altheader = sys.argv[len(sys.argv)-1]
>>>> -            if altheader in msg:
>>>> +            alt_header = sys.argv[len(sys.argv)-1]
>>>> +            if alt_header in msg:
>>>>                    try:
>>>> -                    msg.replace_header('List-ID', msg.get(altheader))
>>>> +                    msg.replace_header('List-ID', msg.get(alt_header))
>>>>                    except:
>>>> -                    msg.add_header('list-id', msg.get(altheader))
>>>> +                    msg.add_header('list-id', msg.get(alt_header))
>>>>
>>>>            # Set specific LID?
>>>>            if args.lid and len(args.lid[0]) > 3:
>>>> @@ -654,21 +645,21 @@ if __name__ == '__main__':
>>>>
>>>>            #Ignore based on --ignore flag?
>>>>            if args.ignorefrom:
>>>> -            ignorefrom = args.ignorefrom[0]
>>>> -            if fnmatch.fnmatch(msg.get("from"), ignorefrom) or (msg.get("list-id") and fnmatch.fnmatch(msg.get("list-id"), ignorefrom)):
>>>> +            ignore_from = args.ignorefrom[0]
>>>> +            if fnmatch.fnmatch(msg.get("from"), ignore_from) or (msg.get("list-id") and fnmatch.fnmatch(msg.get("list-id"), ignore_from)):
>>>>                    print("Ignoring message as instructed by --ignore flag")
>>>>                    sys.exit(0)
>>>>
>>>>            # Check CIDR if need be
>>>>            if args.allowfrom:
>>>> -            from netaddr import IPNetwork, IPAddress
>>>> -            c = IPNetwork(args.allowfrom[0])
>>>> +
>>>> +            c = netaddr.IPNetwork(args.allowfrom[0])
>>>>                good = False
>>>>                for line in msg.get_all('received') or []:
>>>>                    m = re.search(r"from .+\[(.+)\]", line)
>>>>                    if m:
>>>>                        try:
>>>> -                        ip = IPAddress(m.group(1))
>>>> +                        ip = netaddr.IPAddress(m.group(1))
>>>>                            if ip in c:
>>>>                                good = True
>>>>                                msg.add_header("ip-whitelisted", "yes")
>>>> @@ -681,19 +672,18 @@ if __name__ == '__main__':
>>>>            # Replace date header with $now?
>>>>            if args.makedate:
>>>>                msg.replace_header('date', email.utils.formatdate())
>>>> -
>>>> +        is_public = True
>>>>            if args.private:
>>>> -            ispublic = False
>>>> +            is_public = False
>>>>            if 'list-id' in msg:
>>>>                if not msg.get('archived-at'):
>>>>                    msg.add_header('archived-at', email.utils.formatdate())
>>>> -            list_data = namedtuple('importmsg', ['list_id', 'archive_public'])(list_id = msg.get('list-id'), archive_public=ispublic)
>>>> +            list_data = namedtuple('importmsg', ['list_id', 'archive_public'])(list_id=msg.get('list-id'),
>>>> +                                                                               archive_public=is_public)
>>>>
>>>>                try:
>>>> -                lid, mid = archie.archive_message(list_data, msg, msgstring)
>>>> +                lid, mid = archie.archive_message(args, list_data, msg, raw_message)
>>>>                    print("%s: Done archiving to %s as %s!" % (email.utils.formatdate(), lid, mid))
>>>> -                if aURL:
>>>> -                    print("Published as: %s/thread.html/%s" % (aURL, urllib.parse.quote(mid)))
>>>>                except Exception as err:
>>>>                    if args.verbose:
>>>>                        traceback.print_exc()
>>>> @@ -711,3 +701,6 @@ if __name__ == '__main__':
>>>>                print("Could not parse email: %s (@ %s)" % (err, line))
>>>>                sys.exit(-1)
>>>>
>>>> +
>>>> +if __name__ == '__main__':
>>>> +    main()
>>>>
>>


Re: [incubator-ponymail] branch master updated: Pull main operation into main(), linting/PEP conformity fixes

Posted by sebb <se...@gmail.com>.
There was another incompatible change:

The Archiver __init__ method keyword parameter parse_html was renamed
to parseHtml.
This breaks compatibility with previous versions, making testing of
older versions much harder.
It's probably OK to add new keyword parameters, but please don't make
arbitrary changes to existing parameters.

On Fri, 14 Aug 2020 at 18:47, Daniel Gruno <hu...@apache.org> wrote:
>
> Apologies, should be in working condition again now.
>
> On 14/08/2020 16.55, sebb wrote:
> > This commit broke the functioning of the --generator option for the archiver.
> >
> > Please fix.
> >
> > On Mon, 10 Aug 2020 at 17:13, <hu...@apache.org> wrote:
> >>
> >> This is an automated email from the ASF dual-hosted git repository.
> >>
> >> humbedooh pushed a commit to branch master
> >> in repository https://gitbox.apache.org/repos/asf/incubator-ponymail.git
> >>
> >>
> >> The following commit(s) were added to refs/heads/master by this push:
> >>       new 95beb51  Pull main operation into main(), linting/PEP conformity fixes
> >> 95beb51 is described below
> >>
> >> commit 95beb51158b58bcb9fdb1371af7699b72598ac34
> >> Author: Daniel Gruno <hu...@apache.org>
> >> AuthorDate: Mon Aug 10 18:13:05 2020 +0200
> >>
> >>      Pull main operation into main(), linting/PEP conformity fixes
> >>
> >>      Also prepping for adding ES 7.x support.
> >>      import-mbox will need to be updated shortly
> >> ---
> >>   tools/archiver.py | 123 +++++++++++++++++++++++++-----------------------------
> >>   1 file changed, 58 insertions(+), 65 deletions(-)
> >>
> >> diff --git a/tools/archiver.py b/tools/archiver.py
> >> index b97db76..2c50f19 100755
> >> --- a/tools/archiver.py
> >> +++ b/tools/archiver.py
> >> @@ -62,23 +62,19 @@ import uuid
> >>   import json
> >>   import certifi
> >>   import urllib.parse
> >> +import argparse
> >> +import netaddr
> >>
> >>   # Fetch config
> >>   config = PonymailConfig()
> >>   auth = None
> >> -parseHTML = False
> >> -iBody = None  # N.B. Also used by import-mbox.py
> >> -args=None
> >> -dumpDir = None
> >> -aURL = None
> >>
> >>   if config.has_section('mailman') and config.has_option('mailman', 'plugin'):
> >>       from zope.interface import implementer
> >>       from mailman.interfaces.archiver import IArchiver
> >>       from mailman.interfaces.archiver import ArchivePolicy
> >>       logger = logging.getLogger("mailman.archiver")
> >> -elif __name__ == '__main__':
> >> -    import argparse
> >> +
> >>
> >>   if config.has_option('archiver', 'baseurl'):
> >>       aURL = config.get('archiver', 'baseurl')
> >> @@ -102,8 +98,7 @@ def parse_attachment(part):
> >>           if cdtype == "attachment" or cdtype == 'inline':
> >>               fd = part.get_payload(decode=True)
> >>               # Allow for empty string
> >> -            if fd is None:
> >> -                return None, None
> >> +            if fd is None: return None, None
> >>               filename = part.get_filename()
> >>               if filename:
> >>                   print("Found attachment: %s" % filename)
> >> @@ -160,7 +155,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>
> >>       """ Intercept index calls and fix up consistency argument """
> >>       def index(self, **kwargs):
> >> -        if ES_MAJOR in [5,6]:
> >> +        if ES_MAJOR in [5,6,7]:
> >>               if kwargs.pop('consistency', None): # drop the key if present
> >>                   if self.wait_for_active_shards: # replace with wait if defined
> >>                       kwargs['wait_for_active_shards'] = self.wait_for_active_shards
> >> @@ -168,10 +163,11 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>               **kwargs
> >>           )
> >>
> >> -    def __init__(self, parseHTML=False):
> >> +    def __init__(self, generator='full', parse_html=False, dump_dir=None):
> >>           """ Just initialize ES. """
> >> -        self.html = parseHTML
> >> -        if parseHTML:
> >> +        self.html = parse_html
> >> +        self.generator = generator
> >> +        if parse_html:
> >>               import html2text
> >>               self.html2text = html2text.html2text
> >>           self.dbname = config.get("elasticsearch", "dbname")
> >> @@ -180,7 +176,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>           self.consistency = config.get('elasticsearch', 'write', fallback='quorum')
> >>           if ES_MAJOR == 2:
> >>               pass
> >> -        elif ES_MAJOR in [5,6]:
> >> +        elif ES_MAJOR in [5,6,7]:
> >>               self.wait_for_active_shards = config.get('elasticsearch', 'wait', fallback=1)
> >>           else:
> >>               raise Exception("Unexpected elasticsearch version ", ES_VERSION)
> >> @@ -209,14 +205,14 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>               }
> >>               )
> >>           # If we have a dump dir, we can risk failing the connection.
> >> -        if dumpDir:
> >> +        if dump_dir:
> >>               try:
> >>                   self.es = Elasticsearch(dbs,
> >>                       max_retries=5,
> >>                       retry_on_timeout=True
> >>                       )
> >>               except:
> >> -                print("ES connection failed, but dumponfail specified, dumping to %s" % dumpDir)
> >> +                print("ES connection failed, but dumponfail specified, dumping to %s" % dump_dir)
> >>           else:
> >>               self.es = Elasticsearch(dbs,
> >>                   max_retries=5,
> >> @@ -233,13 +229,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>                   contents[part_meta['hash']] = part_file
> >>           return attachments, contents
> >>
> >> -
> >> -    def msgbody(self, msg):
> >> +    def msgbody(self, msg, verbose=False, ignore_body=None):
> >>           body = None
> >>           firstHTML = None
> >>           for part in msg.walk():
> >>               # can be called from importer
> >> -            if args and args.verbose:
> >> +            if verbose:
> >>                   print("Content-Type: %s" % part.get_content_type())
> >>               """
> >>                   Find the first body part and the first HTML part
> >> @@ -251,12 +246,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>                   if not body and part.get_content_type() == 'text/enriched':
> >>                       body = part.get_payload(decode=True)
> >>                   elif self.html and not firstHTML and part.get_content_type() == 'text/html':
> >> -                    firstHTML = part.get_payload(decode=True)
> >> +                    first_html = part.get_payload(decode=True)
> >>               except Exception as err:
> >>                   print(err)
> >>
> >>           # this requires a GPL lib, user will have to install it themselves
> >> -        if firstHTML and (not body or len(body) <= 1 or (iBody and str(body).find(str(iBody)) != -1)):
> >> +        if firstHTML and (not body or len(body) <= 1 or (ignore_body and str(body).find(str(ignore_body)) != -1)):
> >>               body = self.html2text(firstHTML.decode("utf-8", 'ignore') if type(firstHTML) is bytes else firstHTML)
> >>
> >>           # See issue#463
> >> @@ -273,7 +268,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>           return body
> >>
> >>       # N.B. this is also called by import-mbox.py
> >> -    def compute_updates(self, lid, private, msg):
> >> +    def compute_updates(self, args, lid, private, msg):
> >>           """Determine what needs to be sent to the archiver.
> >>
> >>           :param lid: The list id
> >> @@ -324,7 +319,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>           # mdate calculations are all done, prepare the index entry
> >>           epoch = email.utils.mktime_tz(mdate)
> >>           mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))
> >> -        body = self.msgbody(msg)
> >> +        body = self.msgbody(msg, verbose=args.verbose, ignore_body=args.ibody)
> >>           try:
> >>               if 'content-type' in msg_metadata and msg_metadata['content-type'].find("flowed") != -1:
> >>                   body = convertToWrapped(body, character_set="utf-8")
> >> @@ -352,8 +347,9 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>               except Exception as err:
> >>                   if logger:
> >>                       # N.B. use .get just in case there is no message-id
> >> -                    logger.warning("Could not generate MID: %s. MSGID: %s", err, msg_metadata.get('message-id','?'))
> >> +                    logger.warning("Could not generate MID: %s. MSGID: %s", err, msg_metadata.get('message-id', '?'))
> >>                   mid = pmid
> >> +
> >>               if 'in-reply-to' in msg_metadata:
> >>                   try:
> >>                       try:
> >> @@ -383,7 +379,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>
> >>           return  ojson, contents, msg_metadata, irt
> >>
> >> -    def archive_message(self, mlist, msg, raw_msg):
> >> +    def archive_message(self, args, mlist, msg, raw_message):
> >>           """Send the message to the archiver.
> >>
> >>           :param mlist: The IMailingList object.
> >> @@ -402,7 +398,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>           elif hasattr(mlist, 'archive_policy') and mlist.archive_policy is not ArchivePolicy.public:
> >>               private = True
> >>
> >> -        ojson, contents, msg_metadata, irt = self.compute_updates(lid, private, msg)
> >> +        ojson, contents, msg_metadata, irt = self.compute_updates(args, lid, private, msg)
> >>           if not ojson:
> >>               _id = msg.get('message-id') or msg.get('Subject') or msg.get("Date")
> >>               raise Exception("Could not parse message %s for %s" % (_id,lid))
> >> @@ -438,23 +434,23 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>                   consistency = self.consistency,
> >>                   body = {
> >>                       "message-id": msg_metadata['message-id'],
> >> -                    "source": self.mbox_source(raw_msg)
> >> +                    "source": self.mbox_source(raw_message)
> >>                   }
> >>               )
> >>           # If we have a dump dir and ES failed, push to dump dir instead as a JSON object
> >>           # We'll leave it to another process to pick up the slack.
> >>           except Exception as err:
> >> -            if dumpDir:
> >> +            if args.dump:
> >>                   print("Pushing to ES failed, but dumponfail specified, dumping JSON docs")
> >>                   uid = uuid.uuid4()
> >> -                mboxPath = os.path.join(dumpDir, "%s.json" % uid)
> >> +                mboxPath = os.path.join(args.dump, "%s.json" % uid)
> >>                   with open(mboxPath, "w") as f:
> >>                       json.dump({
> >>                           'id': ojson['mid'],
> >>                           'mbox': ojson,
> >>                           'mbox_source': {
> >>                               "message-id": msg_metadata['message-id'],
> >> -                            "source": self.mbox_source(raw_msg)
> >> +                            "source": self.mbox_source(raw_message)
> >>                           },
> >>                           'attachments': contents
> >>                       },f , indent = 2)
> >> @@ -488,8 +484,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>               if dm:
> >>                   cid = dm.group(1)
> >>                   mid = dm.group(2)
> >> -                if self.es.exists(index = self.dbname, doc_type = 'account', id = cid):
> >> -                    doc = self.es.get(index = self.dbname, doc_type = 'account', id = cid)
> >> +                if self.es.exists(index=self.dbname, doc_type='account', id=cid):
> >> +                    doc = self.es.get(index=self.dbname, doc_type='account', id=cid)
> >>                       if doc:
> >>                           oldrefs.append(cid)
> >>                           # N.B. no index is supplied, so ES will generate one
> >> @@ -571,7 +567,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>           """
> >>           return None
> >>
> >> -if __name__ == '__main__':
> >> +
> >> +def main():
> >>       parser = argparse.ArgumentParser(description='Command line options.')
> >>       parser.add_argument('--lid', dest='lid', type=str, nargs=1,
> >>                          help='Alternate specific list ID')
> >> @@ -593,16 +590,15 @@ if __name__ == '__main__':
> >>                          help='Try to convert HTML to text if no text/plain message is found')
> >>       parser.add_argument('--dry', dest='dry', action='store_true',
> >>                          help='Do not save emails to elasticsearch, only test parsing')
> >> +    parser.add_argument('--ignorebody', dest='ibody', type=str, nargs=1,
> >> +                        help='Optional email bodies to treat as empty (in conjunction with --html2text)')
> >>       parser.add_argument('--dumponfail', dest='dump',
> >> -                       help='If pushing to ElasticSearch fails, dump documents in JSON format to this directory and fail silently.')
> >> +                       help='If pushing to ElasticSearch fails, dump documents in JSON format to this directory and '
> >> +                            'fail silently.')
> >>       parser.add_argument('--generator', dest='generator',
> >>                          help='Override the generator.')
> >>       args = parser.parse_args()
> >>
> >> -    if args.html2text:
> >> -        parseHTML = True
> >> -    if args.dump:
> >> -        dumpDir = args.dump
> >>       if args.verbose:
> >>           logging.basicConfig(stream=sys.stdout, level=logging.INFO)
> >>       else:
> >> @@ -610,39 +606,34 @@ if __name__ == '__main__':
> >>           # Also eliminates: 'Undecodable raw error response from server:' warning message
> >>           logging.getLogger("elasticsearch").setLevel(logging.ERROR)
> >>
> >> -    if args.generator:
> >> -        archiver_generator = args.generator
> >> -
> >> -    archie = Archiver(parseHTML = parseHTML)
> >> +    archie = Archiver(generator=args.generator or archiver_generator, parse_html=args.html2text)
> >>       # use binary input so parser can use appropriate charset
> >>       input_stream = sys.stdin.buffer
> >>
> >>       try:
> >> -        msgstring = input_stream.read()
> >> +        raw_message = input_stream.read()
> >>           try:
> >> -            msg = email.message_from_bytes(msgstring)
> >> +            msg = email.message_from_bytes(raw_message)
> >>           except Exception as err:
> >>               print("STDIN parser exception: %s" % err)
> >>
> >>           # We're reading from STDIN, so let's fake an MM3 call
> >>           ispublic = True
> >> -        ignorefrom = None
> >> -        allowfrom = None
> >>
> >>           if args.altheader:
> >> -            altheader = args.altheader[0]
> >> -            if altheader in msg:
> >> +            alt_header = args.altheader[0]
> >> +            if alt_header in msg:
> >>                   try:
> >> -                    msg.replace_header('List-ID', msg.get(altheader))
> >> +                    msg.replace_header('List-ID', msg.get(alt_header))
> >>                   except:
> >> -                    msg.add_header('list-id', msg.get(altheader))
> >> +                    msg.add_header('list-id', msg.get(alt_header))
> >>           elif 'altheader' in sys.argv:
> >> -            altheader = sys.argv[len(sys.argv)-1]
> >> -            if altheader in msg:
> >> +            alt_header = sys.argv[len(sys.argv)-1]
> >> +            if alt_header in msg:
> >>                   try:
> >> -                    msg.replace_header('List-ID', msg.get(altheader))
> >> +                    msg.replace_header('List-ID', msg.get(alt_header))
> >>                   except:
> >> -                    msg.add_header('list-id', msg.get(altheader))
> >> +                    msg.add_header('list-id', msg.get(alt_header))
> >>
> >>           # Set specific LID?
> >>           if args.lid and len(args.lid[0]) > 3:
> >> @@ -654,21 +645,21 @@ if __name__ == '__main__':
> >>
> >>           #Ignore based on --ignore flag?
> >>           if args.ignorefrom:
> >> -            ignorefrom = args.ignorefrom[0]
> >> -            if fnmatch.fnmatch(msg.get("from"), ignorefrom) or (msg.get("list-id") and fnmatch.fnmatch(msg.get("list-id"), ignorefrom)):
> >> +            ignore_from = args.ignorefrom[0]
> >> +            if fnmatch.fnmatch(msg.get("from"), ignore_from) or (msg.get("list-id") and fnmatch.fnmatch(msg.get("list-id"), ignore_from)):
> >>                   print("Ignoring message as instructed by --ignore flag")
> >>                   sys.exit(0)
> >>
> >>           # Check CIDR if need be
> >>           if args.allowfrom:
> >> -            from netaddr import IPNetwork, IPAddress
> >> -            c = IPNetwork(args.allowfrom[0])
> >> +
> >> +            c = netaddr.IPNetwork(args.allowfrom[0])
> >>               good = False
> >>               for line in msg.get_all('received') or []:
> >>                   m = re.search(r"from .+\[(.+)\]", line)
> >>                   if m:
> >>                       try:
> >> -                        ip = IPAddress(m.group(1))
> >> +                        ip = netaddr.IPAddress(m.group(1))
> >>                           if ip in c:
> >>                               good = True
> >>                               msg.add_header("ip-whitelisted", "yes")
> >> @@ -681,19 +672,18 @@ if __name__ == '__main__':
> >>           # Replace date header with $now?
> >>           if args.makedate:
> >>               msg.replace_header('date', email.utils.formatdate())
> >> -
> >> +        is_public = True
> >>           if args.private:
> >> -            ispublic = False
> >> +            is_public = False
> >>           if 'list-id' in msg:
> >>               if not msg.get('archived-at'):
> >>                   msg.add_header('archived-at', email.utils.formatdate())
> >> -            list_data = namedtuple('importmsg', ['list_id', 'archive_public'])(list_id = msg.get('list-id'), archive_public=ispublic)
> >> +            list_data = namedtuple('importmsg', ['list_id', 'archive_public'])(list_id=msg.get('list-id'),
> >> +                                                                               archive_public=is_public)
> >>
> >>               try:
> >> -                lid, mid = archie.archive_message(list_data, msg, msgstring)
> >> +                lid, mid = archie.archive_message(args, list_data, msg, raw_message)
> >>                   print("%s: Done archiving to %s as %s!" % (email.utils.formatdate(), lid, mid))
> >> -                if aURL:
> >> -                    print("Published as: %s/thread.html/%s" % (aURL, urllib.parse.quote(mid)))
> >>               except Exception as err:
> >>                   if args.verbose:
> >>                       traceback.print_exc()
> >> @@ -711,3 +701,6 @@ if __name__ == '__main__':
> >>               print("Could not parse email: %s (@ %s)" % (err, line))
> >>               sys.exit(-1)
> >>
> >> +
> >> +if __name__ == '__main__':
> >> +    main()
> >>
>

Re: [incubator-ponymail] branch master updated: Pull main operation into main(), linting/PEP conformity fixes

Posted by Daniel Gruno <hu...@apache.org>.
Apologies, should be in working condition again now.

On 14/08/2020 16.55, sebb wrote:
> This commit broke the functioning of the --generator option for the archiver.
> 
> Please fix.
> 
> On Mon, 10 Aug 2020 at 17:13, <hu...@apache.org> wrote:
>>
>> This is an automated email from the ASF dual-hosted git repository.
>>
>> humbedooh pushed a commit to branch master
>> in repository https://gitbox.apache.org/repos/asf/incubator-ponymail.git
>>
>>
>> The following commit(s) were added to refs/heads/master by this push:
>>       new 95beb51  Pull main operation into main(), linting/PEP conformity fixes
>> 95beb51 is described below
>>
>> commit 95beb51158b58bcb9fdb1371af7699b72598ac34
>> Author: Daniel Gruno <hu...@apache.org>
>> AuthorDate: Mon Aug 10 18:13:05 2020 +0200
>>
>>      Pull main operation into main(), linting/PEP conformity fixes
>>
>>      Also prepping for adding ES 7.x support.
>>      import-mbox will need to be updated shortly
>> ---
>>   tools/archiver.py | 123 +++++++++++++++++++++++++-----------------------------
>>   1 file changed, 58 insertions(+), 65 deletions(-)
>>
>> diff --git a/tools/archiver.py b/tools/archiver.py
>> index b97db76..2c50f19 100755
>> --- a/tools/archiver.py
>> +++ b/tools/archiver.py
>> @@ -62,23 +62,19 @@ import uuid
>>   import json
>>   import certifi
>>   import urllib.parse
>> +import argparse
>> +import netaddr
>>
>>   # Fetch config
>>   config = PonymailConfig()
>>   auth = None
>> -parseHTML = False
>> -iBody = None  # N.B. Also used by import-mbox.py
>> -args=None
>> -dumpDir = None
>> -aURL = None
>>
>>   if config.has_section('mailman') and config.has_option('mailman', 'plugin'):
>>       from zope.interface import implementer
>>       from mailman.interfaces.archiver import IArchiver
>>       from mailman.interfaces.archiver import ArchivePolicy
>>       logger = logging.getLogger("mailman.archiver")
>> -elif __name__ == '__main__':
>> -    import argparse
>> +
>>
>>   if config.has_option('archiver', 'baseurl'):
>>       aURL = config.get('archiver', 'baseurl')
>> @@ -102,8 +98,7 @@ def parse_attachment(part):
>>           if cdtype == "attachment" or cdtype == 'inline':
>>               fd = part.get_payload(decode=True)
>>               # Allow for empty string
>> -            if fd is None:
>> -                return None, None
>> +            if fd is None: return None, None
>>               filename = part.get_filename()
>>               if filename:
>>                   print("Found attachment: %s" % filename)
>> @@ -160,7 +155,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>
>>       """ Intercept index calls and fix up consistency argument """
>>       def index(self, **kwargs):
>> -        if ES_MAJOR in [5,6]:
>> +        if ES_MAJOR in [5,6,7]:
>>               if kwargs.pop('consistency', None): # drop the key if present
>>                   if self.wait_for_active_shards: # replace with wait if defined
>>                       kwargs['wait_for_active_shards'] = self.wait_for_active_shards
>> @@ -168,10 +163,11 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>               **kwargs
>>           )
>>
>> -    def __init__(self, parseHTML=False):
>> +    def __init__(self, generator='full', parse_html=False, dump_dir=None):
>>           """ Just initialize ES. """
>> -        self.html = parseHTML
>> -        if parseHTML:
>> +        self.html = parse_html
>> +        self.generator = generator
>> +        if parse_html:
>>               import html2text
>>               self.html2text = html2text.html2text
>>           self.dbname = config.get("elasticsearch", "dbname")
>> @@ -180,7 +176,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>           self.consistency = config.get('elasticsearch', 'write', fallback='quorum')
>>           if ES_MAJOR == 2:
>>               pass
>> -        elif ES_MAJOR in [5,6]:
>> +        elif ES_MAJOR in [5,6,7]:
>>               self.wait_for_active_shards = config.get('elasticsearch', 'wait', fallback=1)
>>           else:
>>               raise Exception("Unexpected elasticsearch version ", ES_VERSION)
>> @@ -209,14 +205,14 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>               }
>>               )
>>           # If we have a dump dir, we can risk failing the connection.
>> -        if dumpDir:
>> +        if dump_dir:
>>               try:
>>                   self.es = Elasticsearch(dbs,
>>                       max_retries=5,
>>                       retry_on_timeout=True
>>                       )
>>               except:
>> -                print("ES connection failed, but dumponfail specified, dumping to %s" % dumpDir)
>> +                print("ES connection failed, but dumponfail specified, dumping to %s" % dump_dir)
>>           else:
>>               self.es = Elasticsearch(dbs,
>>                   max_retries=5,
>> @@ -233,13 +229,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>                   contents[part_meta['hash']] = part_file
>>           return attachments, contents
>>
>> -
>> -    def msgbody(self, msg):
>> +    def msgbody(self, msg, verbose=False, ignore_body=None):
>>           body = None
>>           firstHTML = None
>>           for part in msg.walk():
>>               # can be called from importer
>> -            if args and args.verbose:
>> +            if verbose:
>>                   print("Content-Type: %s" % part.get_content_type())
>>               """
>>                   Find the first body part and the first HTML part
>> @@ -251,12 +246,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>                   if not body and part.get_content_type() == 'text/enriched':
>>                       body = part.get_payload(decode=True)
>>                   elif self.html and not firstHTML and part.get_content_type() == 'text/html':
>> -                    firstHTML = part.get_payload(decode=True)
>> +                    first_html = part.get_payload(decode=True)
>>               except Exception as err:
>>                   print(err)
>>
>>           # this requires a GPL lib, user will have to install it themselves
>> -        if firstHTML and (not body or len(body) <= 1 or (iBody and str(body).find(str(iBody)) != -1)):
>> +        if firstHTML and (not body or len(body) <= 1 or (ignore_body and str(body).find(str(ignore_body)) != -1)):
>>               body = self.html2text(firstHTML.decode("utf-8", 'ignore') if type(firstHTML) is bytes else firstHTML)
>>
>>           # See issue#463
>> @@ -273,7 +268,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>           return body
>>
>>       # N.B. this is also called by import-mbox.py
>> -    def compute_updates(self, lid, private, msg):
>> +    def compute_updates(self, args, lid, private, msg):
>>           """Determine what needs to be sent to the archiver.
>>
>>           :param lid: The list id
>> @@ -324,7 +319,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>           # mdate calculations are all done, prepare the index entry
>>           epoch = email.utils.mktime_tz(mdate)
>>           mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))
>> -        body = self.msgbody(msg)
>> +        body = self.msgbody(msg, verbose=args.verbose, ignore_body=args.ibody)
>>           try:
>>               if 'content-type' in msg_metadata and msg_metadata['content-type'].find("flowed") != -1:
>>                   body = convertToWrapped(body, character_set="utf-8")
>> @@ -352,8 +347,9 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>               except Exception as err:
>>                   if logger:
>>                       # N.B. use .get just in case there is no message-id
>> -                    logger.warning("Could not generate MID: %s. MSGID: %s", err, msg_metadata.get('message-id','?'))
>> +                    logger.warning("Could not generate MID: %s. MSGID: %s", err, msg_metadata.get('message-id', '?'))
>>                   mid = pmid
>> +
>>               if 'in-reply-to' in msg_metadata:
>>                   try:
>>                       try:
>> @@ -383,7 +379,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>
>>           return  ojson, contents, msg_metadata, irt
>>
>> -    def archive_message(self, mlist, msg, raw_msg):
>> +    def archive_message(self, args, mlist, msg, raw_message):
>>           """Send the message to the archiver.
>>
>>           :param mlist: The IMailingList object.
>> @@ -402,7 +398,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>           elif hasattr(mlist, 'archive_policy') and mlist.archive_policy is not ArchivePolicy.public:
>>               private = True
>>
>> -        ojson, contents, msg_metadata, irt = self.compute_updates(lid, private, msg)
>> +        ojson, contents, msg_metadata, irt = self.compute_updates(args, lid, private, msg)
>>           if not ojson:
>>               _id = msg.get('message-id') or msg.get('Subject') or msg.get("Date")
>>               raise Exception("Could not parse message %s for %s" % (_id,lid))
>> @@ -438,23 +434,23 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>                   consistency = self.consistency,
>>                   body = {
>>                       "message-id": msg_metadata['message-id'],
>> -                    "source": self.mbox_source(raw_msg)
>> +                    "source": self.mbox_source(raw_message)
>>                   }
>>               )
>>           # If we have a dump dir and ES failed, push to dump dir instead as a JSON object
>>           # We'll leave it to another process to pick up the slack.
>>           except Exception as err:
>> -            if dumpDir:
>> +            if args.dump:
>>                   print("Pushing to ES failed, but dumponfail specified, dumping JSON docs")
>>                   uid = uuid.uuid4()
>> -                mboxPath = os.path.join(dumpDir, "%s.json" % uid)
>> +                mboxPath = os.path.join(args.dump, "%s.json" % uid)
>>                   with open(mboxPath, "w") as f:
>>                       json.dump({
>>                           'id': ojson['mid'],
>>                           'mbox': ojson,
>>                           'mbox_source': {
>>                               "message-id": msg_metadata['message-id'],
>> -                            "source": self.mbox_source(raw_msg)
>> +                            "source": self.mbox_source(raw_message)
>>                           },
>>                           'attachments': contents
>>                       },f , indent = 2)
>> @@ -488,8 +484,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>               if dm:
>>                   cid = dm.group(1)
>>                   mid = dm.group(2)
>> -                if self.es.exists(index = self.dbname, doc_type = 'account', id = cid):
>> -                    doc = self.es.get(index = self.dbname, doc_type = 'account', id = cid)
>> +                if self.es.exists(index=self.dbname, doc_type='account', id=cid):
>> +                    doc = self.es.get(index=self.dbname, doc_type='account', id=cid)
>>                       if doc:
>>                           oldrefs.append(cid)
>>                           # N.B. no index is supplied, so ES will generate one
>> @@ -571,7 +567,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>>           """
>>           return None
>>
>> -if __name__ == '__main__':
>> +
>> +def main():
>>       parser = argparse.ArgumentParser(description='Command line options.')
>>       parser.add_argument('--lid', dest='lid', type=str, nargs=1,
>>                          help='Alternate specific list ID')
>> @@ -593,16 +590,15 @@ if __name__ == '__main__':
>>                          help='Try to convert HTML to text if no text/plain message is found')
>>       parser.add_argument('--dry', dest='dry', action='store_true',
>>                          help='Do not save emails to elasticsearch, only test parsing')
>> +    parser.add_argument('--ignorebody', dest='ibody', type=str, nargs=1,
>> +                        help='Optional email bodies to treat as empty (in conjunction with --html2text)')
>>       parser.add_argument('--dumponfail', dest='dump',
>> -                       help='If pushing to ElasticSearch fails, dump documents in JSON format to this directory and fail silently.')
>> +                       help='If pushing to ElasticSearch fails, dump documents in JSON format to this directory and '
>> +                            'fail silently.')
>>       parser.add_argument('--generator', dest='generator',
>>                          help='Override the generator.')
>>       args = parser.parse_args()
>>
>> -    if args.html2text:
>> -        parseHTML = True
>> -    if args.dump:
>> -        dumpDir = args.dump
>>       if args.verbose:
>>           logging.basicConfig(stream=sys.stdout, level=logging.INFO)
>>       else:
>> @@ -610,39 +606,34 @@ if __name__ == '__main__':
>>           # Also eliminates: 'Undecodable raw error response from server:' warning message
>>           logging.getLogger("elasticsearch").setLevel(logging.ERROR)
>>
>> -    if args.generator:
>> -        archiver_generator = args.generator
>> -
>> -    archie = Archiver(parseHTML = parseHTML)
>> +    archie = Archiver(generator=args.generator or archiver_generator, parse_html=args.html2text)
>>       # use binary input so parser can use appropriate charset
>>       input_stream = sys.stdin.buffer
>>
>>       try:
>> -        msgstring = input_stream.read()
>> +        raw_message = input_stream.read()
>>           try:
>> -            msg = email.message_from_bytes(msgstring)
>> +            msg = email.message_from_bytes(raw_message)
>>           except Exception as err:
>>               print("STDIN parser exception: %s" % err)
>>
>>           # We're reading from STDIN, so let's fake an MM3 call
>>           ispublic = True
>> -        ignorefrom = None
>> -        allowfrom = None
>>
>>           if args.altheader:
>> -            altheader = args.altheader[0]
>> -            if altheader in msg:
>> +            alt_header = args.altheader[0]
>> +            if alt_header in msg:
>>                   try:
>> -                    msg.replace_header('List-ID', msg.get(altheader))
>> +                    msg.replace_header('List-ID', msg.get(alt_header))
>>                   except:
>> -                    msg.add_header('list-id', msg.get(altheader))
>> +                    msg.add_header('list-id', msg.get(alt_header))
>>           elif 'altheader' in sys.argv:
>> -            altheader = sys.argv[len(sys.argv)-1]
>> -            if altheader in msg:
>> +            alt_header = sys.argv[len(sys.argv)-1]
>> +            if alt_header in msg:
>>                   try:
>> -                    msg.replace_header('List-ID', msg.get(altheader))
>> +                    msg.replace_header('List-ID', msg.get(alt_header))
>>                   except:
>> -                    msg.add_header('list-id', msg.get(altheader))
>> +                    msg.add_header('list-id', msg.get(alt_header))
>>
>>           # Set specific LID?
>>           if args.lid and len(args.lid[0]) > 3:
>> @@ -654,21 +645,21 @@ if __name__ == '__main__':
>>
>>           #Ignore based on --ignore flag?
>>           if args.ignorefrom:
>> -            ignorefrom = args.ignorefrom[0]
>> -            if fnmatch.fnmatch(msg.get("from"), ignorefrom) or (msg.get("list-id") and fnmatch.fnmatch(msg.get("list-id"), ignorefrom)):
>> +            ignore_from = args.ignorefrom[0]
>> +            if fnmatch.fnmatch(msg.get("from"), ignore_from) or (msg.get("list-id") and fnmatch.fnmatch(msg.get("list-id"), ignore_from)):
>>                   print("Ignoring message as instructed by --ignore flag")
>>                   sys.exit(0)
>>
>>           # Check CIDR if need be
>>           if args.allowfrom:
>> -            from netaddr import IPNetwork, IPAddress
>> -            c = IPNetwork(args.allowfrom[0])
>> +
>> +            c = netaddr.IPNetwork(args.allowfrom[0])
>>               good = False
>>               for line in msg.get_all('received') or []:
>>                   m = re.search(r"from .+\[(.+)\]", line)
>>                   if m:
>>                       try:
>> -                        ip = IPAddress(m.group(1))
>> +                        ip = netaddr.IPAddress(m.group(1))
>>                           if ip in c:
>>                               good = True
>>                               msg.add_header("ip-whitelisted", "yes")
>> @@ -681,19 +672,18 @@ if __name__ == '__main__':
>>           # Replace date header with $now?
>>           if args.makedate:
>>               msg.replace_header('date', email.utils.formatdate())
>> -
>> +        is_public = True
>>           if args.private:
>> -            ispublic = False
>> +            is_public = False
>>           if 'list-id' in msg:
>>               if not msg.get('archived-at'):
>>                   msg.add_header('archived-at', email.utils.formatdate())
>> -            list_data = namedtuple('importmsg', ['list_id', 'archive_public'])(list_id = msg.get('list-id'), archive_public=ispublic)
>> +            list_data = namedtuple('importmsg', ['list_id', 'archive_public'])(list_id=msg.get('list-id'),
>> +                                                                               archive_public=is_public)
>>
>>               try:
>> -                lid, mid = archie.archive_message(list_data, msg, msgstring)
>> +                lid, mid = archie.archive_message(args, list_data, msg, raw_message)
>>                   print("%s: Done archiving to %s as %s!" % (email.utils.formatdate(), lid, mid))
>> -                if aURL:
>> -                    print("Published as: %s/thread.html/%s" % (aURL, urllib.parse.quote(mid)))
>>               except Exception as err:
>>                   if args.verbose:
>>                       traceback.print_exc()
>> @@ -711,3 +701,6 @@ if __name__ == '__main__':
>>               print("Could not parse email: %s (@ %s)" % (err, line))
>>               sys.exit(-1)
>>
>> +
>> +if __name__ == '__main__':
>> +    main()
>>