You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ponymail.apache.org by hu...@apache.org on 2021/09/12 13:55:38 UTC
[incubator-ponymail-foal] branch master updated: switch to using textlib.py for normalize_lid

This is an automated email from the ASF dual-hosted git repository.

humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-ponymail-foal.git


The following commit(s) were added to refs/heads/master by this push:
     new e123412  switch to using textlib.py for normalize_lid
e123412 is described below

commit e123412e3ef0cf9421703b8a1a8daf73632f05ce
Author: Daniel Gruno <hu...@apache.org>
AuthorDate: Sun Sep 12 08:55:33 2021 -0500

    switch to using textlib.py for normalize_lid
---
 tools/migrate.py | 26 +++-----------------------
 1 file changed, 3 insertions(+), 23 deletions(-)

diff --git a/tools/migrate.py b/tools/migrate.py
index 945af57..b24711d 100644
--- a/tools/migrate.py
+++ b/tools/migrate.py
@@ -23,9 +23,9 @@ from elasticsearch import AsyncElasticsearch, Elasticsearch, helpers
 from elasticsearch.helpers import async_scan
 
 if not __package__:
-    from plugins import generators
+    from plugins import generators, textlib
 else:
-    from .plugins import generators
+    from .plugins import generators, textlib
 
 import argparse
 import base64
@@ -44,26 +44,6 @@ MIGRATION_MAGIC_NUMBER = "2"
 cores = len(os.sched_getaffinity(0))
 MAX_PARALLEL_OPS = max(min(int((cores + 1) * 0.75), cores - 1), 1)
 
-def normalize_lid(lid: str) -> str:
-    """ Ensures that a List ID is in standard form, i.e. <a.b.c.d> """
-    # If of format "list name" <foo.bar.baz>
-    # we crop away the description (#511)
-    m = re.match(r'".*"\s+(.+)', lid)
-    if m:
-        lid = m.group(1)
-    # Drop <> and anything before/after, if found
-    m = re.search(r"<(.+)>", lid)
-    if m:
-        lid = m.group(1)
-    # Belt-and-braces: remove possible extraneous chars
-    lid = "<%s>" % lid.strip(" <>").replace("@", ".")
-    # Replace invalid characters with underscores so as to not invalidate doc IDs.
-    lid = re.sub(r"[^-+~_<>.a-zA-Z0-9@]", "_", lid)
-    # Finally, ensure we have a loosely valid list ID value
-    if not re.match(r"^<.+\..+>$", lid):
-        print("Warning: Invalid list-id %s" % lid)
-    return lid
-
 class MultiDocProcessor:
     """MultiProcess document processor"""
 
@@ -193,7 +173,7 @@ def bulk_push(json, es, graceful=False):
 
 def process_document(old_es, doc, old_dbname, dbname_source, dbname_mbox, do_dkim):
     now = time.time()
-    list_id = normalize_lid(doc["_source"]["list_raw"])
+    list_id = textlib.normalize_lid(doc["_source"]["list_raw"])
     try:
         source = old_es.get(index=old_dbname, doc_type="mbox_source", id=doc["_id"])
         # If we hit a 404 on a source, we have to fake an empty document, as we don't know the source.