You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ponymail.apache.org by hu...@apache.org on 2020/09/11 08:39:47 UTC
[incubator-ponymail-foal] 01/03: convert thread construction to a
class, add addl subject hash map for speedups
This is an automated email from the ASF dual-hosted git repository.
humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-ponymail-foal.git
commit b08d25a40ee6533bdaa772da00b4d0c6950f2f18
Author: Daniel Gruno <hu...@apache.org>
AuthorDate: Fri Sep 11 10:38:49 2020 +0200
convert thread construction to a class, add addl subject hash map for speedups
---
server/plugins/mbox.py | 104 +++++++++++++++++++++++++------------------------
1 file changed, 54 insertions(+), 50 deletions(-)
diff --git a/server/plugins/mbox.py b/server/plugins/mbox.py
index 6dcd562..329bb92 100644
--- a/server/plugins/mbox.py
+++ b/server/plugins/mbox.py
@@ -439,57 +439,61 @@ async def get_years(session, query_defuzzed):
return oldest, youngest
-def find_root_subject(threads, hashdict, root_email, osubject=None):
- """Finds the discussion origin of an email, if present"""
- irt = root_email.get("in-reply-to")
- subject = root_email.get("subject")
- subject = subject.replace("\n", "") # Crop multi-line subjects
-
- # First, the obvious - look for an in-reply-to in our existing dict with a matching subject
- if irt and irt in hashdict:
- if hashdict[irt].get("subject") == subject:
- return hashdict[irt]
-
- # If that failed, we break apart our subject
- if osubject:
- rsubject = osubject
- else:
- rsubject = PYPONY_RE_PREFIX.sub("", subject) + "_" + root_email.get("list_raw")
- for thread in threads:
- if thread.get("tsubject") == rsubject:
- return thread
-
- return None
-
-
-def construct_threads(emails: typing.List[typing.Dict]):
- """Turns a list of emails into a nested thread structure"""
- threads: typing.List[dict] = []
- authors = {}
- hashdict: typing.Dict[str, dict] = {}
- for cur_email in sorted(emails, key=lambda x: x["epoch"]):
- author = cur_email.get("from")
- if author not in authors:
- authors[author] = 0
- authors[author] += 1
- subject = cur_email.get("subject", "").replace("\n", "") # Crop multi-line subjects
- tsubject = PYPONY_RE_PREFIX.sub("", subject) + "_" + cur_email.get("list_raw", "<a.b.c.d>")
- parent = find_root_subject(threads, hashdict, cur_email, tsubject)
- xemail = {
- "children": [],
- "tid": cur_email.get("mid"),
- "subject": subject,
- "tsubject": tsubject,
- "epoch": cur_email.get("epoch"),
- "nest": 1,
- }
- if not parent:
- threads.append(xemail)
+class ThreadConstructor:
+ def __init__(self, emails: typing.List[typing.Dict]):
+ self.emails = emails
+ self.threads: typing.List[dict] = []
+ self.authors = {}
+ self.hashed_by_msg_id: typing.Dict[str, dict] = {}
+ self.hashed_by_subject: typing.Dict[str, dict] = {}
+
+ def construct(self):
+ """Turns a flat array of emails into a nested structure of threads"""
+ for cur_email in sorted(self.emails, key=lambda x: x["epoch"]):
+ author = cur_email.get("from")
+ if author not in self.authors:
+ self.authors[author] = 0
+ self.authors[author] += 1
+ subject = cur_email.get("subject", "").replace("\n", "") # Crop multi-line subjects
+ tsubject = PYPONY_RE_PREFIX.sub("", subject) + "_" + cur_email.get("list_raw", "<a.b.c.d>")
+ parent = self.find_root_subject(cur_email, tsubject)
+ xemail = {
+ "children": [],
+ "tid": cur_email.get("mid"),
+ "subject": subject,
+ "tsubject": tsubject,
+ "epoch": cur_email.get("epoch"),
+ "nest": 1,
+ }
+ if not parent:
+ self.threads.append(xemail)
+ else:
+ xemail["nest"] = parent["nest"] + 1
+ parent["children"].append(xemail)
+ self.hashed_by_msg_id[cur_email.get("message-id", "??")] = xemail
+ if tsubject not in self.hashed_by_subject:
+ self.hashed_by_subject[tsubject] = xemail
+ return self.threads, self.authors
+
+ def find_root_subject(self, root_email, osubject=None):
+ """Finds the discussion origin of an email, if present"""
+ irt = root_email.get("in-reply-to")
+ subject = root_email.get("subject")
+ subject = subject.replace("\n", "") # Crop multi-line subjects
+
+ # First, the obvious - look for an in-reply-to in our existing dict with a matching subject
+ if irt and irt in self.hashed_by_msg_id:
+ if self.hashed_by_msg_id[irt].get("subject") == subject:
+ return self.hashed_by_msg_id[irt]
+
+ # If that failed, we break apart our subject
+ if osubject:
+ rsubject = osubject
else:
- xemail["nest"] = parent["nest"] + 1
- parent["children"].append(xemail)
- hashdict[cur_email.get("message-id", "??")] = xemail
- return threads, authors
+ rsubject = PYPONY_RE_PREFIX.sub("", subject) + "_" + root_email.get("list_raw")
+ if rsubject and rsubject in self.hashed_by_subject:
+ return self.hashed_by_subject[rsubject]
+ return None
def gravatar(eml):