You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ponymail.apache.org by se...@apache.org on 2018/05/23 19:22:59 UTC
[incubator-ponymail] branch master updated: Enh: remove duplicated
code in tools scripts
This is an automated email from the ASF dual-hosted git repository.
sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-ponymail.git
The following commit(s) were added to refs/heads/master by this push:
new 6f7125f Enh: remove duplicated code in tools scripts
6f7125f is described below
commit 6f7125f2e1b881a90adea5942a1a7f29730e8f99
Author: Sebb <se...@apache.org>
AuthorDate: Wed May 23 20:22:57 2018 +0100
Enh: remove duplicated code in tools scripts
by using elastic.py module
This fixes #456
---
CHANGELOG.md | 1 +
tools/copy-list.py | 50 +++++++++----------------------------
tools/edit-list.py | 38 +++++++---------------------
tools/list-lists.py | 67 ++++++++++++++++----------------------------------
tools/push-failures.py | 36 +++------------------------
5 files changed, 47 insertions(+), 145 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e02fb60..d43917f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,5 @@
## Changes in 0.11-SNAPSHOT
+- Enh: remove duplicated code in tools scripts by using elastic.py module (#456)
- Enh: separate module to read config file
- Bug: push-failures.py expects to find non-existent 'id' key in json file (#454)
- Bug: ES 5.x does not support word-cloud (stats.lua) (#345)
diff --git a/tools/copy-list.py b/tools/copy-list.py
index e727530..1907cf2 100755
--- a/tools/copy-list.py
+++ b/tools/copy-list.py
@@ -25,19 +25,9 @@ This utility can be used to:
import sys
import time
-import configparser
import argparse
-try:
- from elasticsearch import Elasticsearch, helpers, ElasticsearchException
-except ImportError:
- print("Sorry, you need to install the elasticsearch and formatflowed modules from pip first.")
- sys.exit(-1)
-
-
-# Fetch config
-config = configparser.RawConfigParser()
-config.read('ponymail.cfg')
+from elastic import Elastic
sourceLID = None
targetLID = None
@@ -46,23 +36,10 @@ debug = False
notag = False
newdb = None
-ssl = False
-dbname = config.get("elasticsearch", "dbname")
-if config.has_option("elasticsearch", "ssl") and config.get("elasticsearch", "ssl").lower() == 'true':
- ssl = True
-uri = ""
-if config.has_option("elasticsearch", "uri") and config.get("elasticsearch", "uri") != "":
- uri = config.get("elasticsearch", "uri")
-es = Elasticsearch([
- {
- 'host': config.get("elasticsearch", "hostname"),
- 'port': int(config.get("elasticsearch", "port")),
- 'use_ssl': ssl,
- 'url_prefix': uri
- }],
- max_retries=5,
- retry_on_timeout=True
- )
+# get config and set up default database
+es = Elastic()
+# default database name
+dbname = es.getdbname()
rootURL = ""
@@ -126,7 +103,6 @@ count = 0
print("Updating docs...")
then = time.time()
page = es.search(
- index=dbname,
doc_type="mbox",
scroll = '30m',
search_type = 'scan',
@@ -154,12 +130,7 @@ while (scroll_size > 0):
scroll_size = len(page['hits']['hits'])
for hit in page['hits']['hits']:
doc = hit['_id']
- body = es.get(index = dbname, doc_type = 'mbox', id = doc)
- source = None
- try:
- source = es.get(index = dbname, doc_type = 'mbox_source', id = doc)
- except ElasticsearchException:
- print("Source for %s not found, hmm..." % doc)
+ body = es.get(doc_type = 'mbox', id = doc)
if targetLID != sourceLID:
doc = hit['_id'].replace(sourceLID,targetLID)
body['_source']['mid'] = doc
@@ -172,7 +143,8 @@ while (scroll_size > 0):
'_id': doc,
'_source': body['_source']
})
- if source:
+ source = es.get(doc_type = 'mbox_source', id = doc, ignore=404)
+ if source['found']:
js_arr.append({
'_op_type': 'index',
'_index': newdb if newdb else dbname,
@@ -180,14 +152,16 @@ while (scroll_size > 0):
'_id': doc,
'_source': source['_source']
})
+ else:
+ print("Source for %s not found, hmm..." % doc)
count += 1
if (count % 50 == 0):
print("Processed %u emails..." % count)
- helpers.bulk(es, js_arr)
+ es.bulk(js_arr)
js_arr = []
if len(js_arr) > 0:
- helpers.bulk(es, js_arr)
+ es.bulk(js_arr)
print("All done, processed %u docs in %u seconds" % (count, time.time() - then))
diff --git a/tools/edit-list.py b/tools/edit-list.py
index 61f1f4f..bdbba98 100755
--- a/tools/edit-list.py
+++ b/tools/edit-list.py
@@ -29,35 +29,17 @@ This utility can be used to:
import sys
import time
-import configparser
import argparse
import json
-try:
- from elasticsearch import Elasticsearch, helpers
-except ImportError:
- print("Sorry, you need to install the elasticsearch and formatflowed modules from pip first.")
- sys.exit(-1)
-
-
-# Fetch config
-config = configparser.RawConfigParser()
-config.read('ponymail.cfg')
-
-dbname = config.get("elasticsearch", "dbname")
-ssl = config.get("elasticsearch", "ssl", fallback="false").lower() == 'true'
-uri = config.get("elasticsearch", "uri", fallback="")
+from elastic import Elastic
-es = Elasticsearch([
- {
- 'host': config.get("elasticsearch", "hostname"),
- 'port': int(config.get("elasticsearch", "port")),
- 'use_ssl': ssl,
- 'url_prefix': uri
- }],
- max_retries=5,
- retry_on_timeout=True
- )
+es = Elastic()
+dbname = es.getdbname()
+# get config and set up default database
+es = Elastic()
+# default database name
+dbname = es.getdbname()
parser = argparse.ArgumentParser(description='Command line options.')
# Cannot have both source and mid as input
@@ -158,7 +140,6 @@ if desc:
if targetLID:
LID = targetLID
es.index(
- index=dbname,
doc_type="mailinglists",
id=LID,
body = {
@@ -185,7 +166,6 @@ if targetLID or makePrivate or makePublic or deleteEmails or mid or obfuscate:
}
}
page = es.search(
- index=dbname,
doc_type="mbox",
scroll = '30m',
search_type = 'scan',
@@ -239,11 +219,11 @@ if targetLID or makePrivate or makePublic or deleteEmails or mid or obfuscate:
if (count % 500 == 0):
print("Processed %u emails..." % count)
if not dryrun:
- helpers.bulk(es, js_arr)
+ es.bulk(js_arr)
js_arr = []
if len(js_arr) > 0:
if not dryrun:
- helpers.bulk(es, js_arr)
+ es.bulk(js_arr)
print("All done, processed %u docs in %u seconds" % (count, time.time() - then))
diff --git a/tools/list-lists.py b/tools/list-lists.py
index 6509efc..8fa9678 100755
--- a/tools/list-lists.py
+++ b/tools/list-lists.py
@@ -15,51 +15,36 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import sys
import time
-import configparser
import argparse
import json
-try:
- from elasticsearch import Elasticsearch
-except ImportError:
- print("Sorry, you need to install the elasticsearch module from pip first.")
- sys.exit(-1)
-
+from elastic import Elastic
-# Fetch config
-config = configparser.RawConfigParser()
-config.read('ponymail.cfg')
+dbname=None
-makePublic = None
-makePrivate = None
-sourceLID = None
-targetLID = None
-deleteEmails = None
-wildcard = None
+parser = argparse.ArgumentParser(description='Command line options.')
+parser.add_argument('--dbname', dest='dbname', type=str, nargs=1,
+ help='Override index name')
+parser.add_argument('--pretty', dest='pretty', action='store_true',
+ help='Convert List IDs to email addresses')
+parser.add_argument('--debug', dest='debug', action='store_true',
+ help='Output the result JSON instead, very noisy!')
+parser.add_argument('--counts', dest='counts', action='store_true',
+ help='Show the count of messages for each list')
-ssl = False
-dbname = config.get("elasticsearch", "dbname")
-if config.has_option("elasticsearch", "ssl") and config.get("elasticsearch", "ssl").lower() == 'true':
- ssl = True
-uri = ""
-if config.has_option("elasticsearch", "uri") and config.get("elasticsearch", "uri") != "":
- uri = config.get("elasticsearch", "uri")
-es = Elasticsearch([
- {
- 'host': config.get("elasticsearch", "hostname"),
- 'port': int(config.get("elasticsearch", "port")),
- 'use_ssl': ssl,
- 'url_prefix': uri
- }],
- max_retries=5,
- retry_on_timeout=True
- )
+args = parser.parse_args()
+
+if args.dbname:
+ dbname = args.dbname[0]
then = time.time()
+
+# get config and set up default database
+# If dbname is None, the config setting will be used
+es = Elastic(dbname=dbname)
+
page = es.search(
- index=dbname,
doc_type="mbox",
size = 0,
body = {
@@ -96,16 +81,6 @@ page = es.search(
}
)
-parser = argparse.ArgumentParser(description='Command line options.')
-parser.add_argument('--pretty', dest='pretty', action='store_true',
- help='Convert List IDs to email addresses')
-parser.add_argument('--debug', dest='debug', action='store_true',
- help='Output the result JSON instead, very noisy!')
-parser.add_argument('--counts', dest='counts', action='store_true',
- help='Show the count of messages for each list')
-
-args = parser.parse_args()
-pretty = args.pretty
plist = {}
total_private = 0
if args.debug:
@@ -116,7 +91,7 @@ else:
msgcount = domain['doc_count']
prvcount = domain['privacy']['doc_count']
total_private += prvcount
- if pretty:
+ if args.pretty:
if listid.find(".") != -1:
l, d = listid.strip("<>").split(".", 1)
plist[d] = plist[d] if d in plist else {}
diff --git a/tools/push-failures.py b/tools/push-failures.py
index a4c57b5..2fc6a67 100755
--- a/tools/push-failures.py
+++ b/tools/push-failures.py
@@ -18,39 +18,12 @@
""" Utility for retrying docs that we failed to index earlier.
"""
-import sys
-import configparser
import argparse
import json
import os
-import certifi
+from elastic import Elastic
-try:
- from elasticsearch import Elasticsearch
-except ImportError:
- print("Sorry, you need to install the elasticsearch module from pip first.")
- sys.exit(-1)
-
-
-# Fetch config
-config = configparser.RawConfigParser()
-config.read('ponymail.cfg')
-
-dbname = config.get("elasticsearch", "dbname")
-ssl = config.get("elasticsearch", "ssl", fallback="false").lower() == 'true'
-uri = config.get("elasticsearch", "uri", fallback="")
-
-es = Elasticsearch([
- {
- 'host': config.get("elasticsearch", "hostname"),
- 'port': int(config.get("elasticsearch", "port")),
- 'use_ssl': ssl,
- 'url_prefix': uri,
- 'ca_certs': certifi.where()
- }],
- max_retries=5,
- retry_on_timeout=True
- )
+es = Elastic()
parser = argparse.ArgumentParser(description='Command line options.')
# Cannot have both source and mid as input
@@ -61,6 +34,8 @@ args = parser.parse_args()
dumpDir = args.dumpdir if args.dumpdir else '.'
+print("Looking for *.json files in %s" % dumpDir)
+
files = [f for f in os.listdir(dumpDir) if os.path.isfile(os.path.join(dumpDir, f)) and f.endswith('.json')]
for f in files:
@@ -74,14 +49,12 @@ for f in files:
except KeyError:
mid = ojson['mbox']['mid']
es.index(
- index=dbname,
doc_type="mbox",
id=mid,
body = ojson['mbox']
)
es.index(
- index=dbname,
doc_type="mbox_source",
id=mid,
body = ojson['mbox_source']
@@ -90,7 +63,6 @@ for f in files:
if 'attachments' in ojson and ojson['attachments']:
for k, v in ojson['attachments'].items():
es.index(
- index=dbname,
doc_type="attachment",
id=k,
body = {
--
To stop receiving notification emails like this one, please contact
sebb@apache.org.