You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ponymail.apache.org by se...@apache.org on 2018/05/23 19:22:59 UTC

[incubator-ponymail] branch master updated: Enh: remove duplicated code in tools scripts

This is an automated email from the ASF dual-hosted git repository.

sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-ponymail.git


The following commit(s) were added to refs/heads/master by this push:
     new 6f7125f  Enh: remove duplicated code in tools scripts
6f7125f is described below

commit 6f7125f2e1b881a90adea5942a1a7f29730e8f99
Author: Sebb <se...@apache.org>
AuthorDate: Wed May 23 20:22:57 2018 +0100

    Enh: remove duplicated code in tools scripts
    
     by using elastic.py module
    
     This fixes #456
---
 CHANGELOG.md           |  1 +
 tools/copy-list.py     | 50 +++++++++----------------------------
 tools/edit-list.py     | 38 +++++++---------------------
 tools/list-lists.py    | 67 ++++++++++++++++----------------------------------
 tools/push-failures.py | 36 +++------------------------
 5 files changed, 47 insertions(+), 145 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e02fb60..d43917f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,5 @@
 ## Changes in 0.11-SNAPSHOT
+- Enh: remove duplicated code in tools scripts by using elastic.py module (#456)
 - Enh: separate module to read config file 
 - Bug: push-failures.py expects to find non-existent 'id' key in json file (#454)
 - Bug: ES 5.x does not support word-cloud (stats.lua) (#345)
diff --git a/tools/copy-list.py b/tools/copy-list.py
index e727530..1907cf2 100755
--- a/tools/copy-list.py
+++ b/tools/copy-list.py
@@ -25,19 +25,9 @@ This utility can be used to:
 
 import sys
 import time
-import configparser
 import argparse
 
-try:
-    from elasticsearch import Elasticsearch, helpers, ElasticsearchException
-except ImportError:
-    print("Sorry, you need to install the elasticsearch and formatflowed modules from pip first.")
-    sys.exit(-1)
-    
-
-# Fetch config
-config = configparser.RawConfigParser()
-config.read('ponymail.cfg')
+from elastic import Elastic    
 
 sourceLID = None
 targetLID = None
@@ -46,23 +36,10 @@ debug = False
 notag = False
 newdb = None
 
-ssl = False
-dbname = config.get("elasticsearch", "dbname")
-if config.has_option("elasticsearch", "ssl") and config.get("elasticsearch", "ssl").lower() == 'true':
-    ssl = True
-uri = ""
-if config.has_option("elasticsearch", "uri") and config.get("elasticsearch", "uri") != "":
-    uri = config.get("elasticsearch", "uri")
-es = Elasticsearch([
-    {
-        'host': config.get("elasticsearch", "hostname"),
-        'port': int(config.get("elasticsearch", "port")),
-        'use_ssl': ssl,
-        'url_prefix': uri
-    }],
-    max_retries=5,
-    retry_on_timeout=True
-    )
+# get config and set up default database
+es = Elastic()
+# default database name
+dbname = es.getdbname()
 
 rootURL = ""
 
@@ -126,7 +103,6 @@ count = 0
 print("Updating docs...")
 then = time.time()
 page = es.search(
-    index=dbname,
     doc_type="mbox",
     scroll = '30m',
     search_type = 'scan',
@@ -154,12 +130,7 @@ while (scroll_size > 0):
     scroll_size = len(page['hits']['hits'])
     for hit in page['hits']['hits']:
         doc = hit['_id']
-        body = es.get(index = dbname, doc_type = 'mbox', id = doc)
-        source = None
-        try:
-            source = es.get(index = dbname, doc_type = 'mbox_source', id = doc)
-        except ElasticsearchException:
-            print("Source for %s not found, hmm..." % doc)
+        body = es.get(doc_type = 'mbox', id = doc)
         if targetLID != sourceLID:
             doc = hit['_id'].replace(sourceLID,targetLID)
             body['_source']['mid'] = doc
@@ -172,7 +143,8 @@ while (scroll_size > 0):
             '_id': doc,
             '_source': body['_source']
         })
-        if source:
+        source = es.get(doc_type = 'mbox_source', id = doc, ignore=404)
+        if source['found']:
             js_arr.append({
                 '_op_type': 'index',
                 '_index': newdb if newdb else dbname,
@@ -180,14 +152,16 @@ while (scroll_size > 0):
                 '_id': doc,
                 '_source': source['_source']
             })
+        else:
+            print("Source for %s not found, hmm..." % doc)
         
         count += 1
         if (count % 50 == 0):
             print("Processed %u emails..." % count)
-            helpers.bulk(es, js_arr)
+            es.bulk(js_arr)
             js_arr = []
 
 if len(js_arr) > 0:
-    helpers.bulk(es, js_arr)
+    es.bulk(js_arr)
             
 print("All done, processed %u docs in %u seconds" % (count, time.time() - then))
diff --git a/tools/edit-list.py b/tools/edit-list.py
index 61f1f4f..bdbba98 100755
--- a/tools/edit-list.py
+++ b/tools/edit-list.py
@@ -29,35 +29,17 @@ This utility can be used to:
 
 import sys
 import time
-import configparser
 import argparse
 import json
 
-try:
-    from elasticsearch import Elasticsearch, helpers
-except ImportError:
-    print("Sorry, you need to install the elasticsearch and formatflowed modules from pip first.")
-    sys.exit(-1)
-
-
-# Fetch config
-config = configparser.RawConfigParser()
-config.read('ponymail.cfg')
-
-dbname = config.get("elasticsearch", "dbname")
-ssl = config.get("elasticsearch", "ssl", fallback="false").lower() == 'true'
-uri = config.get("elasticsearch", "uri", fallback="")
+from elastic import Elastic
 
-es = Elasticsearch([
-    {
-        'host': config.get("elasticsearch", "hostname"),
-        'port': int(config.get("elasticsearch", "port")),
-        'use_ssl': ssl,
-        'url_prefix': uri
-    }],
-    max_retries=5,
-    retry_on_timeout=True
-    )
+es = Elastic()
+dbname = es.getdbname()
+# get config and set up default database
+es = Elastic()
+# default database name
+dbname = es.getdbname()
 
 parser = argparse.ArgumentParser(description='Command line options.')
 # Cannot have both source and mid as input
@@ -158,7 +140,6 @@ if desc:
         if targetLID:
             LID = targetLID
         es.index(
-            index=dbname,
             doc_type="mailinglists",
             id=LID,
             body = {
@@ -185,7 +166,6 @@ if targetLID or makePrivate or makePublic or deleteEmails or mid or obfuscate:
             }
         }
     page = es.search(
-        index=dbname,
         doc_type="mbox",
         scroll = '30m',
         search_type = 'scan',
@@ -239,11 +219,11 @@ if targetLID or makePrivate or makePublic or deleteEmails or mid or obfuscate:
             if (count % 500 == 0):
                 print("Processed %u emails..." % count)
                 if not dryrun:
-                    helpers.bulk(es, js_arr)
+                    es.bulk(js_arr)
                     js_arr = []
 
     if len(js_arr) > 0:
         if not dryrun:
-            helpers.bulk(es, js_arr)
+            es.bulk(js_arr)
 
     print("All done, processed %u docs in %u seconds" % (count, time.time() - then))
diff --git a/tools/list-lists.py b/tools/list-lists.py
index 6509efc..8fa9678 100755
--- a/tools/list-lists.py
+++ b/tools/list-lists.py
@@ -15,51 +15,36 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import time
-import configparser
 import argparse
 import json
 
-try:
-    from elasticsearch import Elasticsearch
-except ImportError:
-    print("Sorry, you need to install the elasticsearch module from pip first.")
-    sys.exit(-1)
-    
+from elastic import Elastic
 
-# Fetch config
-config = configparser.RawConfigParser()
-config.read('ponymail.cfg')
+dbname=None
 
-makePublic = None
-makePrivate = None
-sourceLID = None
-targetLID = None
-deleteEmails = None
-wildcard = None
+parser = argparse.ArgumentParser(description='Command line options.')
+parser.add_argument('--dbname', dest='dbname', type=str, nargs=1,
+                   help='Override index name')
+parser.add_argument('--pretty', dest='pretty', action='store_true', 
+                   help='Convert List IDs to email addresses')
+parser.add_argument('--debug', dest='debug', action='store_true', 
+                   help='Output the result JSON instead, very noisy!')
+parser.add_argument('--counts', dest='counts', action='store_true', 
+                   help='Show the count of messages for each list')
 
-ssl = False
-dbname = config.get("elasticsearch", "dbname")
-if config.has_option("elasticsearch", "ssl") and config.get("elasticsearch", "ssl").lower() == 'true':
-    ssl = True
-uri = ""
-if config.has_option("elasticsearch", "uri") and config.get("elasticsearch", "uri") != "":
-    uri = config.get("elasticsearch", "uri")
-es = Elasticsearch([
-    {
-        'host': config.get("elasticsearch", "hostname"),
-        'port': int(config.get("elasticsearch", "port")),
-        'use_ssl': ssl,
-        'url_prefix': uri
-    }],
-    max_retries=5,
-    retry_on_timeout=True
-    )
+args = parser.parse_args()
+
+if args.dbname:
+    dbname = args.dbname[0]
 
 then = time.time()
+
+# get config and set up default database
+# If dbname is None, the config setting will be used
+es = Elastic(dbname=dbname)
+
 page = es.search(
-    index=dbname,
     doc_type="mbox",
     size = 0,
     body = {
@@ -96,16 +81,6 @@ page = es.search(
     }
     )
 
-parser = argparse.ArgumentParser(description='Command line options.')
-parser.add_argument('--pretty', dest='pretty', action='store_true', 
-                   help='Convert List IDs to email addresses')
-parser.add_argument('--debug', dest='debug', action='store_true', 
-                   help='Output the result JSON instead, very noisy!')
-parser.add_argument('--counts', dest='counts', action='store_true', 
-                   help='Show the count of messages for each list')
-
-args = parser.parse_args()
-pretty = args.pretty
 plist = {}
 total_private = 0
 if args.debug:
@@ -116,7 +91,7 @@ else:
         msgcount = domain['doc_count']
         prvcount = domain['privacy']['doc_count']
         total_private += prvcount
-        if pretty:
+        if args.pretty:
             if listid.find(".") != -1:
                 l, d = listid.strip("<>").split(".", 1)
                 plist[d] = plist[d] if d in plist else {}
diff --git a/tools/push-failures.py b/tools/push-failures.py
index a4c57b5..2fc6a67 100755
--- a/tools/push-failures.py
+++ b/tools/push-failures.py
@@ -18,39 +18,12 @@
 """ Utility for retrying docs that we failed to index earlier.
 """
 
-import sys
-import configparser
 import argparse
 import json
 import os
-import certifi
+from elastic import Elastic
 
-try:
-    from elasticsearch import Elasticsearch
-except ImportError:
-    print("Sorry, you need to install the elasticsearch module from pip first.")
-    sys.exit(-1)
-
-
-# Fetch config
-config = configparser.RawConfigParser()
-config.read('ponymail.cfg')
-
-dbname = config.get("elasticsearch", "dbname")
-ssl = config.get("elasticsearch", "ssl", fallback="false").lower() == 'true'
-uri = config.get("elasticsearch", "uri", fallback="")
-
-es = Elasticsearch([
-    {
-        'host': config.get("elasticsearch", "hostname"),
-        'port': int(config.get("elasticsearch", "port")),
-        'use_ssl': ssl,
-        'url_prefix': uri,
-        'ca_certs': certifi.where()
-    }],
-    max_retries=5,
-    retry_on_timeout=True
-    )
+es = Elastic()
 
 parser = argparse.ArgumentParser(description='Command line options.')
 # Cannot have both source and mid as input
@@ -61,6 +34,8 @@ args = parser.parse_args()
 
 dumpDir = args.dumpdir if args.dumpdir else '.'
 
+print("Looking for *.json files in %s" % dumpDir)
+
 files = [f for f in os.listdir(dumpDir) if os.path.isfile(os.path.join(dumpDir, f)) and f.endswith('.json')]
 
 for f in files:
@@ -74,14 +49,12 @@ for f in files:
             except KeyError:
                 mid = ojson['mbox']['mid']
             es.index(
-                index=dbname,
                 doc_type="mbox",
                 id=mid,
                 body = ojson['mbox']
             )
             
             es.index(
-                index=dbname,
                 doc_type="mbox_source",
                 id=mid,
                 body = ojson['mbox_source']
@@ -90,7 +63,6 @@ for f in files:
             if 'attachments' in ojson and ojson['attachments']:
                 for k, v in ojson['attachments'].items():
                     es.index(
-                        index=dbname,
                         doc_type="attachment",
                         id=k,
                         body = {

-- 
To stop receiving notification emails like this one, please contact
sebb@apache.org.