You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ponymail.apache.org by hu...@apache.org on 2021/09/20 16:04:11 UTC
[incubator-ponymail-foal] branch master updated: threadify downloads to speed up imports

This is an automated email from the ASF dual-hosted git repository.

humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-ponymail-foal.git


The following commit(s) were added to refs/heads/master by this push:
     new 42638f7  threadify downloads to speed up imports
42638f7 is described below

commit 42638f733361d05700eb97b8ecbc4c3f48c44e72
Author: Daniel Gruno <hu...@apache.org>
AuthorDate: Mon Sep 20 11:04:05 2021 -0500

    threadify downloads to speed up imports
---
 tools/import-mbox.py | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/tools/import-mbox.py b/tools/import-mbox.py
index 44e752e..887b29a 100755
--- a/tools/import-mbox.py
+++ b/tools/import-mbox.py
@@ -111,6 +111,24 @@ def bulk_insert(name, json, xes, dtype, wc="quorum"):
     except Exception as err:
         print("%s: Warning: Could not bulk insert: %s into %s" % (name, err, dtype))
 
+class DownloadThread(Thread):
+    """Fetch one remote mbox archive into a temp file and queue it in ``lists``."""
+    def assign(self, url):
+        self.url = url  # full archive URL; run() reads it, so set it before start()
+    def run(self):
+        """Download self.url, gunzip it if named .gz, and queue the temp file in lists."""
+        mldata = urlopen(self.url).read()
+        # Test this thread's own URL: the loop global `ml` may already name another file.
+        if self.url.find(".gz") != -1:
+            try:
+                mldata = gzip.decompress(mldata)
+            except Exception as err:
+                print("This wasn't a gzip file: %s" % err)
+        print(len(mldata))
+        with tempfile.NamedTemporaryFile(mode="w+b", buffering=1, delete=False) as tmpfile:
+            tmpfile.write(mldata)  # leaving the with-block flushes and closes the file
+        lists.append([tmpfile.name, list_override])
+        print("Adding %s to slurp list as %s" % (self.url, tmpfile.name))
 
 class SlurpThread(Thread):
     def printid(self, message):
@@ -683,24 +701,19 @@ if re.match(r"https?://", source):
             sys.exit(-1)
         ns = r"href=\"(\d+(?:-[a-zA-Z]+)?\.txt(\.gz)?)\""
         qn = 0
+        
+        dl_threads = []
         for mlist in re.finditer(ns, data):
             ml = mlist.group(1)
-            mldata = urlopen("%s%s" % (source, ml)).read()
-            tmpfile = tempfile.NamedTemporaryFile(mode="w+b", buffering=1, delete=False)
-            try:
-                if ml.find(".gz") != -1:
-                    mldata = gzip.decompress(mldata)
-            except Exception as err:
-                print("This wasn't a gzip file: %s" % err)
-            print(len(mldata))
-            tmpfile.write(mldata)
-            tmpfile.flush()
-            tmpfile.close()
-            lists.append([tmpfile.name, list_override])
-            print("Adding %s/%s to slurp list as %s" % (source, ml, tmpfile.name))
+            dl_thread = DownloadThread()
+            dl_thread.assign("%s%s" % (source, ml))
+            dl_thread.start()
+            dl_threads.append(dl_thread)
             qn += 1
             if quickmode and qn >= 2:
                 break
+        for t in dl_threads:
+            t.join()
 
 # IMAP(S) based import?
 elif re.match(r"imaps?://", source):