You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ponymail.apache.org by hu...@apache.org on 2016/06/01 12:41:10 UTC

[08/11] incubator-ponymail git commit: merge sync-imap into import-mbox

merge sync-imap into import-mbox


Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/6127911a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/6127911a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/6127911a

Branch: refs/heads/master
Commit: 6127911a367a527423495c98f551d3939359eae9
Parents: 7135cd4
Author: Sam Ruby <ru...@intertwingly.net>
Authored: Fri May 6 12:46:31 2016 -0400
Committer: Sam Ruby <ru...@intertwingly.net>
Committed: Fri May 6 12:46:31 2016 -0400

----------------------------------------------------------------------
 tools/import-mbox.py | 124 +++++++++++++++++++++++++++--
 tools/sync-imap.py   | 197 ----------------------------------------------
 2 files changed, 118 insertions(+), 203 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/6127911a/tools/import-mbox.py
----------------------------------------------------------------------
diff --git a/tools/import-mbox.py b/tools/import-mbox.py
index 2220f72..0b59d1f 100644
--- a/tools/import-mbox.py
+++ b/tools/import-mbox.py
@@ -45,6 +45,10 @@ except:
     print("Sorry, you need to install the elasticsearch and formatflowed modules from pip first.")
     sys.exit(-1)
     
+# change working directory to location of this script
+
+os.chdir(os.path.dirname(os.path.abspath(__file__)))
+
 y = 0
 baddies = 0
 block = Lock()
@@ -57,6 +61,7 @@ appender = "apache.org"
 
 source = "./"
 maildir = False
+imap = False
 list_override = None
 project = ""
 filebased = False
@@ -174,7 +179,18 @@ class SlurpThread(Thread):
             EM = 1
             stime = time.time()
             dFile = False
-            if filebased:
+            if maildir:
+                messages = mailbox.Maildir(tmpname)
+            elif imap:
+                y -= 1 # TODO don't understand the increment above
+                imap4 = mla[2]
+                def mailgen(list):
+                    for uid in list:
+                        msgbytes = imap4.uid('fetch', uid, '(RFC822)')[1][0][1]
+                        yield email.message_from_bytes(msgbytes)
+                messages = mailgen(mla[0])
+                xlist_override = mla[1]
+            elif filebased:
                 
                 tmpname = mla[0]
                 filename = mla[0]
@@ -197,6 +213,8 @@ class SlurpThread(Thread):
                     except Exception as err:
                         print("This wasn't a gzip file: %s" % err )
                 print("Slurping %s" % filename)
+                messages = mailbox.mbox(tmpname)
+
             else:
                 ml = mla[0]
                 mboxfile = mla[1]
@@ -215,15 +233,11 @@ class SlurpThread(Thread):
                 with open(tmpname, "w") as f:
                     f.write(inp)
                     f.close()
+                messages = mailbox.mbox(tmpname)
 
             count = 0
             LEY = EY
 
-            if maildir:
-                messages = mailbox.Maildir(tmpname)
-            else:
-                messages = mailbox.mbox(tmpname)
-
             for message in messages:
                 if resendTo:
                     print("Delivering message %s via MTA" % message['message-id'] if 'message-id' in message else '??')
@@ -285,6 +299,8 @@ class SlurpThread(Thread):
                 print("Parsed %u records from %s" % (count, filename))
                 if dFile:
                     os.unlink(tmpname)
+            elif imap:
+                print("Parsed %u records from imap" % count)
             else:
                 print("Parsed %s/%s: %u records from %s" % (ml, mboxfile, count, tmpname))
                 os.unlink(tmpname)
@@ -460,6 +476,102 @@ elif source[0] == "h":
             if quickmode and qn >= 2:
                 break
                     
+# IMAP(S) based import?
+elif source[0] == "i":
+    imap = True
+    import urllib, getpass, imaplib
+    url = urllib.parse.urlparse(source)
+    
+    port = url.port or (143 if url.scheme == 'imap' else 993)
+    user = url.username or getpass.getuser()
+    password = url.password or getpass.getpass('IMAP Password: ')
+    folder = url.path.strip('/') or 'INBOX'
+    listname = list_override or "<%s/%s.%s>" % (user, folder, url.hostname)
+
+    # fetch message-id => _id pairs from elasticsearch
+
+    es = Elasticsearch()
+    result = es.search(scroll = '5m', 
+        body = {
+            'size': 1024, 
+            'fields': ['message-id'], 
+            'query': {'match': {'list': listname}}
+        }
+    )
+
+    db = {}
+    while len(result['hits']['hits']) > 0:
+        for hit in result['hits']['hits']:
+            db[hit['fields']['message-id'][0]] = hit['_id']
+        result = es.scroll(scroll='5m', scroll_id=result['_scroll_id'])
+
+    # fetch message-id => uid pairs from imap
+
+    if url.scheme == 'imaps':
+        imap4 = imaplib.IMAP4_SSL(url.hostname, port)
+    else:
+        imap4 = imaplib.IMAP4(url.hostname, port)
+    imap4.login(user, password)
+    imap4.select(folder, readonly=True)
+    results = imap4.uid('search', None, 'ALL')
+    uids = b','.join(results[1][0].split())
+    results = imap4.uid('fetch', uids, '(BODY[HEADER.FIELDS (MESSAGE-ID)])')
+
+    mail = {}
+    uid_re = re.compile(b'^\d+ \(UID (\d+) BODY\[')
+    mid_re = re.compile(b'^Message-ID:\s*(.*?)\s*$', re.I)
+    uid = None
+    for result in results[1]:
+        for line in result:
+            if isinstance(line, bytes):
+                match = uid_re.match(line)
+                if match:
+                    uid = match.group(1)
+                else:
+                     match = mid_re.match(line)
+                     if match:
+                         try:
+                             mail[match.group(1).decode('utf-8')] = uid
+                             uid = None
+                         except ValueError:
+                             pass
+
+    # delete items from elasticsearch that are not present in imap
+
+    queue1 = []
+    queue2 = []
+    for mid, _id in db.items():
+        if not mid in mail:
+            queue1.append({
+                '_op_type': 'delete',
+                '_index': dbname,
+                '_type': 'mbox',
+                '_id': _id
+            })
+            queue2.append({
+                '_op_type': 'delete',
+                '_index': dbname,
+                '_type': 'mbox_source',
+                '_id': _id
+            })
+            print("deleting: " + mid)
+
+    while len(queue1) > 0:
+        eshelper.bulk(es, queue1[0:1024])
+        del queue1[0:1024]
+
+    while len(queue2) > 0:
+        eshelper.bulk(es, queue2[0:1024])
+        del queue2[0:1024]
+
+    # add new items to elasticsearch from imap
+
+    uids = []
+    for mid, uid in mail.items():
+        if not mid in db:
+            uids.append(uid)
+    lists.append([uids, listname, imap4])
+
 threads = []
 cc = int( multiprocessing.cpu_count() / 2) + 1
 print("Starting up to %u threads to fetch the %u %s lists" % (cc, len(lists), project))

http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/6127911a/tools/sync-imap.py
----------------------------------------------------------------------
diff --git a/tools/sync-imap.py b/tools/sync-imap.py
deleted file mode 100755
index 14d2711..0000000
--- a/tools/sync-imap.py
+++ /dev/null
@@ -1,197 +0,0 @@
-#!/usr/bin/env python3.4
-# -*- coding: utf-8 -*-
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
- #the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-""" Syncronize ponymail with an imap server.
-
-Fetches message-ids from both ponymail and an imap server, and adds or
-deletes whatever is necessary from ponymail to make it match.
-
-See usage for instructions.
-
-"""
-
-import argparse
-import configparser
-import elasticsearch
-from elasticsearch import helpers as eshelper
-import imaplib
-import os
-import pwd
-import subprocess
-import sys
-import re
-
-# change working directory to location of this script
-
-os.chdir(os.path.dirname(os.path.abspath(__file__)))
-
-# global defaults
-
-es_list = None
-imap_host = 'localhost'
-imap_port = 993
-imap_user = pwd.getpwuid(os.getuid()).pw_name
-imap_password = None
-imap_folder = 'INBOX'
-html2text = False
-verbose = False
-
-# fetch config overrides
-
-config = configparser.RawConfigParser()
-config.read('ponymail.cfg')
-iname = config.get("elasticsearch", "dbname")
-if config.has_option('imap', 'host'):
-    imap_host = config.get('imap', 'host')
-if config.has_option('imap', 'port'):
-    imap_port = config.getint('imap', 'port')
-if config.has_option('imap', 'user'):
-    imap_user = config.getint('imap', 'user')
-if config.has_option('imap', 'password'):
-    imap_password = config.getint('imap', 'password')
-
-# fetch command line argument overrides
-
-parser = argparse.ArgumentParser(description='Command line options.')
-parser.add_argument('--list', dest='list', type=str, nargs=1,
-                   help='ElasticSearch list')
-parser.add_argument('--host', dest='host', type=str, nargs=1,
-                   help='IMAP host')
-parser.add_argument('--port', dest='port', type=int, nargs=1,
-                   help='IMAP port')
-parser.add_argument('--user', dest='user', type=str, nargs=1,
-                   help='IMAP user')
-parser.add_argument('--password', dest='password', type=str, nargs=1,
-                   help='IMAP password')
-parser.add_argument('--folder', dest='folder', type=str, nargs=1,
-                   help='IMAP folder')
-parser.add_argument('--html2text', dest='html2text', action='store_true',
-                   help='Try to convert HTML to text if no text/plain message is found')
-parser.add_argument('--verbose', dest='verbose', action='store_true', 
-                   help='Output additional log messages')
-
-args = parser.parse_args()
-
-if args.list:
-    es_list = args.list[0]
-if args.host:
-    imap_host = args.host[0]
-if args.port:
-    imap_port = args.port[0]
-if args.user:
-    imap_user = args.user[0]
-if args.password:
-    imap_password = args.password[0]
-if args.folder:
-    imap_folder = args.folder[0]
-if args.html2text:
-    html2text = True
-if args.verbose:
-    verbose = True
-
-if not es_list or not imap_password:
-    parser.print_help()
-    sys.exit(-1)
-
-es_list = "<%s>" % es_list.strip("<>") # We need <> around it!
-
-# fetch message-id => _id pairs from elasticsearch
-
-es = elasticsearch.Elasticsearch()
-result = es.search(scroll = '5m', 
-    body = {
-        'size': 1024, 
-        'fields': ['message-id'], 
-        'query': {'match': {'list': es_list}}
-    }
-)
-
-db = {}
-while len(result['hits']['hits']) > 0:
-    for hit in result['hits']['hits']:
-        db[hit['fields']['message-id'][0]] = hit['_id']
-    result = es.scroll(scroll='5m', scroll_id=result['_scroll_id'])
-
-# fetch message-id => uid pairs from imap
-
-imap = imaplib.IMAP4_SSL(imap_host, imap_port)
-imap.login(imap_user, imap_password)
-imap.select(imap_folder, readonly=True)
-results = imap.uid('search', None, 'ALL')
-uids = b','.join(results[1][0].split())
-results = imap.uid('fetch', uids, '(BODY[HEADER.FIELDS (MESSAGE-ID)])')
-
-mail = {}
-uid_re = re.compile(b'^\d+ \(UID (\d+) BODY\[')
-mid_re = re.compile(b'^Message-ID:\s*(.*?)\s*$', re.I)
-uid = None
-for result in results[1]:
-    for line in result:
-        if isinstance(line, bytes):
-            match = uid_re.match(line)
-            if match:
-                uid = match.group(1)
-            else:
-                 match = mid_re.match(line)
-                 if match:
-                     try:
-                         mail[match.group(1).decode('utf-8')] = uid
-                         uid = None
-                     except ValueError:
-                         pass
-
-# delete items from elasticsearch that are not present in imap
-
-queue1 = []
-queue2 = []
-for mid, _id in db.items():
-    if True: # not mid in mail:
-        queue1.append({
-            '_op_type': 'delete',
-            '_index': iname,
-            '_type': 'mbox',
-            '_id': _id
-        })
-        queue2.append({
-            '_op_type': 'delete',
-            '_index': iname,
-            '_type': 'mbox_source',
-            '_id': _id
-        })
-        print("deleting: " + mid)
-
-while len(queue1) > 0:
-    eshelper.bulk(es, queue1[0:1024])
-    del queue1[0:1024]
-
-while len(queue2) > 0:
-    eshelper.bulk(es, queue2[0:1024])
-    del queue2[0:1024]
-
-# add new items to elasticsearch from imap
-
-for mid, uid in mail.items():
-    if not mid in db:
-        print("indexing %s" % mid)
-        argv = [sys.executable, 'archiver.py', '--lid=%s' % es_list]
-        if verbose: argv.append('--verbose')
-        if html2text: argv.append('--html2text')
-        child = subprocess.Popen(argv, stdin=subprocess.PIPE)
-        child.stdin.write(imap.uid('fetch', uid, '(RFC822)')[1][0][1])
-        child.stdin.close()
-        rc = child.wait()
-        if rc != 0: print("rc %d" % rc)