You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ponymail.apache.org by hu...@apache.org on 2021/09/20 16:04:11 UTC
[incubator-ponymail-foal] branch master updated: threadify downloads to speed up imports
This is an automated email from the ASF dual-hosted git repository.
humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-ponymail-foal.git
The following commit(s) were added to refs/heads/master by this push:
new 42638f7 threadify downloads to speed up imports
42638f7 is described below
commit 42638f733361d05700eb97b8ecbc4c3f48c44e72
Author: Daniel Gruno <hu...@apache.org>
AuthorDate: Mon Sep 20 11:04:05 2021 -0500
threadify downloads to speed up imports
---
tools/import-mbox.py | 39 ++++++++++++++++++++++++++-------------
1 file changed, 26 insertions(+), 13 deletions(-)
diff --git a/tools/import-mbox.py b/tools/import-mbox.py
index 44e752e..887b29a 100755
--- a/tools/import-mbox.py
+++ b/tools/import-mbox.py
@@ -111,6 +111,24 @@ def bulk_insert(name, json, xes, dtype, wc="quorum"):
except Exception as err:
print("%s: Warning: Could not bulk insert: %s into %s" % (name, err, dtype))
+class DownloadThread(Thread):
+ def assign(self, url):
+ self.url = url
+ def run(self):
+ global lists
+ mldata = urlopen(self.url).read()
+ tmpfile = tempfile.NamedTemporaryFile(mode="w+b", buffering=1, delete=False)
+ try:
+ if ml.find(".gz") != -1:
+ mldata = gzip.decompress(mldata)
+ except Exception as err:
+ print("This wasn't a gzip file: %s" % err)
+ print(len(mldata))
+ tmpfile.write(mldata)
+ tmpfile.flush()
+ tmpfile.close()
+ lists.append([tmpfile.name, list_override])
+ print("Adding %s to slurp list as %s" % (self.url, tmpfile.name))
class SlurpThread(Thread):
def printid(self, message):
@@ -683,24 +701,19 @@ if re.match(r"https?://", source):
sys.exit(-1)
ns = r"href=\"(\d+(?:-[a-zA-Z]+)?\.txt(\.gz)?)\""
qn = 0
+
+ dl_threads = []
for mlist in re.finditer(ns, data):
ml = mlist.group(1)
- mldata = urlopen("%s%s" % (source, ml)).read()
- tmpfile = tempfile.NamedTemporaryFile(mode="w+b", buffering=1, delete=False)
- try:
- if ml.find(".gz") != -1:
- mldata = gzip.decompress(mldata)
- except Exception as err:
- print("This wasn't a gzip file: %s" % err)
- print(len(mldata))
- tmpfile.write(mldata)
- tmpfile.flush()
- tmpfile.close()
- lists.append([tmpfile.name, list_override])
- print("Adding %s/%s to slurp list as %s" % (source, ml, tmpfile.name))
+ dl_thread = DownloadThread()
+ dl_thread.assign("%s%s" % (source, ml))
+ dl_thread.start()
+ dl_threads.append(dl_thread)
qn += 1
if quickmode and qn >= 2:
break
+ for t in dl_threads:
+ t.join()
# IMAP(S) based import?
elif re.match(r"imaps?://", source):