You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2021/04/11 15:25:22 UTC

[lucene] branch main updated: LUCENE-9924: generate TLD list from IANA TLD db, rather than root zone db (#77)

This is an automated email from the ASF dual-hosted git repository.

rmuir pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new b0bd64c  LUCENE-9924: generate TLD list from IANA TLD db, rather than root zone db (#77)
b0bd64c is described below

commit b0bd64c62020383fe8c45be2035d346f7ce6174f
Author: Robert Muir <rm...@apache.org>
AuthorDate: Sun Apr 11 11:25:15 2021 -0400

    LUCENE-9924: generate TLD list from IANA TLD db, rather than root zone db (#77)
    
    This simplifies the generator: the input file is a plain list of domains
    rather than a DNS zone, so the regexes that parsed DNS records can be removed.
    
    Also, the file may change less often, as it contains just the list of
    TLDs and no additional DNS metadata.
---
 gradle/generation/jflex.gradle                     |  2 +-
 .../src/generated/checksums/generateTlds.json      |  6 ++--
 .../checksums/generateUAX29URLEmailTokenizer.json  |  4 +--
 .../apache/lucene/analysis/email/ASCIITLD.jflex    |  4 +--
 .../test/org/apache/lucene/analysis/email/TLDs.txt |  2 +-
 .../analysis/standard/GenerateJflexTLDMacros.java  | 41 ++++++++--------------
 6 files changed, 23 insertions(+), 36 deletions(-)

diff --git a/gradle/generation/jflex.gradle b/gradle/generation/jflex.gradle
index b43e63c..66b0039 100644
--- a/gradle/generation/jflex.gradle
+++ b/gradle/generation/jflex.gradle
@@ -54,7 +54,7 @@ configure(project(":lucene:core")) {
 
 configure(project(":lucene:analysis:common")) {
   task generateTlds() {
-    def tldZones = "https://www.internic.net/zones/root.zone"
+    def tldZones = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
     def jflexMacro = file("src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex")
     def tldList = file("src/test/org/apache/lucene/analysis/email/TLDs.txt")
 
diff --git a/lucene/analysis/common/src/generated/checksums/generateTlds.json b/lucene/analysis/common/src/generated/checksums/generateTlds.json
index a361e36..dfb6fdf 100644
--- a/lucene/analysis/common/src/generated/checksums/generateTlds.json
+++ b/lucene/analysis/common/src/generated/checksums/generateTlds.json
@@ -1,4 +1,4 @@
 {
-    "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "b1af8dc8532d853fcf1acde1f6c629750b296b40",
-    "lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "b346a80f511b64a59d556eb3ef58cf396e98c631"
-}
+    "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "41ecfd19595aaf19fe2ddffd7dadb26202e98fae",
+    "lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "1c5a201efff431be1c62150aa6bd3dac0f3a21e2"
+}
\ No newline at end of file
diff --git a/lucene/analysis/common/src/generated/checksums/generateUAX29URLEmailTokenizer.json b/lucene/analysis/common/src/generated/checksums/generateUAX29URLEmailTokenizer.json
index 269355c..191f06b 100644
--- a/lucene/analysis/common/src/generated/checksums/generateUAX29URLEmailTokenizer.json
+++ b/lucene/analysis/common/src/generated/checksums/generateUAX29URLEmailTokenizer.json
@@ -1,6 +1,6 @@
 {
     "gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "68263ff0a014904c6e89b040d868d8f399408908",
-    "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "b1af8dc8532d853fcf1acde1f6c629750b296b40",
+    "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "41ecfd19595aaf19fe2ddffd7dadb26202e98fae",
     "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "e437900d9570ca007f9c02c9ea286222b644c329",
     "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "56a751d27e481fb55388f91ebf34f5a0cb8cb1b2"
-}
+}
\ No newline at end of file
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex b/lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex
index 1d9b4b3..43e2a03 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex
@@ -14,8 +14,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-// Generated from IANA Root Zone Database <https://www.internic.net/zones/root.zone>
-// file version from 2021 Apr 10, Sat 17:37:00 Coordinated Universal Time
+// Generated from IANA TLD Database <https://data.iana.org/TLD/tlds-alpha-by-domain.txt>
+// file version from 2021 Apr 10, Sat 07:07:01 Coordinated Universal Time
 // generated by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
 
 // LUCENE-8278: None of the TLDs in {ASCIITLD} is a 1-character-shorter prefix of another TLD
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt
index d61f7f6..feff6de 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt
@@ -1,4 +1,4 @@
-# Generated from IANA Root Zone Database (gradlew generateTlds).aaa
+# Generated from IANA TLD Database (gradlew generateTlds).aaa
 aarp
 abarth
 abb
diff --git a/lucene/analysis/common/src/tools/java/org/apache/lucene/analysis/standard/GenerateJflexTLDMacros.java b/lucene/analysis/common/src/tools/java/org/apache/lucene/analysis/standard/GenerateJflexTLDMacros.java
index c8c2302..6e18daa 100644
--- a/lucene/analysis/common/src/tools/java/org/apache/lucene/analysis/standard/GenerateJflexTLDMacros.java
+++ b/lucene/analysis/common/src/tools/java/org/apache/lucene/analysis/standard/GenerateJflexTLDMacros.java
@@ -37,18 +37,15 @@ import java.util.SortedSet;
 import java.util.TimeZone;
 import java.util.TreeMap;
 import java.util.TreeSet;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 /**
  * Generates a file containing JFlex macros to accept valid ASCII TLDs (top level domains), for
  * inclusion in JFlex grammars that can accept domain names.
  *
- * <p>The IANA Root Zone Database is queried via HTTP from URL cmdline arg #0, the response is
- * parsed, and the results are written out to a file containing a JFlex macro that will accept all
- * valid ASCII-only TLDs, including punycode forms of internationalized TLDs (output file cmdline
- * arg #1).
+ * <p>The IANA TLD Database is queried via HTTP from URL cmdline arg #0, the response is parsed, and
+ * the results are written out to a file containing a JFlex macro that will accept all valid
+ * ASCII-only TLDs, including punycode forms of internationalized TLDs (output file cmdline arg #1).
  */
 public class GenerateJflexTLDMacros {
 
@@ -100,9 +97,6 @@ public class GenerateJflexTLDMacros {
           + " */"
           + NL;
 
-  private static final Pattern TLD_PATTERN_1 = Pattern.compile("([-A-Za-z0-9]+)\\.\\s+NS\\s+.*");
-  private static final Pattern TLD_PATTERN_2 =
-      Pattern.compile("([-A-Za-z0-9]+)\\.\\s+\\d+\\s+IN\\s+NS\\s+.*");
   private final URL tldFileURL;
   private long tldFileLastModified = -1L;
   private final Path tldListFile;
@@ -123,14 +117,14 @@ public class GenerateJflexTLDMacros {
   }
 
   /**
-   * Downloads the IANA Root Zone Database, extracts the ASCII TLDs, then writes a set of JFlex
-   * macros accepting any of them case-insensitively out to the specified output file.
+   * Downloads the IANA TLD Database, extracts the ASCII TLDs, then writes a set of JFlex macros
+   * accepting any of them case-insensitively out to the specified output file.
    *
    * @throws IOException if there is a problem either downloading the database or writing out the
    *     output file.
    */
   public void execute() throws IOException {
-    getIANARootZoneDatabase();
+    getIANATLDDatabase();
     partitionTLDprefixesBySuffixLength();
     writeOutput();
     System.out.println("Wrote TLD macros to '" + jflexMacroFile + "':");
@@ -145,11 +139,11 @@ public class GenerateJflexTLDMacros {
   }
 
   /**
-   * Downloads the IANA Root Zone Database.
+   * Downloads the IANA TLD Database.
    *
    * @throws java.io.IOException if there is a problem downloading the database
    */
-  private void getIANARootZoneDatabase() throws IOException {
+  private void getIANATLDDatabase() throws IOException {
     final URLConnection connection = tldFileURL.openConnection();
     connection.setUseCaches(false);
     connection.addRequestProperty("Cache-Control", "no-cache");
@@ -160,23 +154,16 @@ public class GenerateJflexTLDMacros {
             new InputStreamReader(connection.getInputStream(), StandardCharsets.US_ASCII))) {
       String line;
       while (null != (line = reader.readLine())) {
-        Matcher matcher = TLD_PATTERN_1.matcher(line);
-        if (matcher.matches()) {
-          // System.out.println("Found: " + matcher.group(1).toLowerCase(Locale.ROOT));
-          processedTLDsLongestFirst.put(matcher.group(1).toLowerCase(Locale.ROOT), Boolean.FALSE);
-        } else {
-          matcher = TLD_PATTERN_2.matcher(line);
-          if (matcher.matches()) {
-            // System.out.println("Found: " + matcher.group(1).toLowerCase(Locale.ROOT));
-            processedTLDsLongestFirst.put(matcher.group(1).toLowerCase(Locale.ROOT), Boolean.FALSE);
-          }
+        if (line.startsWith("#")) {
+          continue;
         }
+        processedTLDsLongestFirst.put(line.toLowerCase(Locale.ROOT), Boolean.FALSE);
       }
     }
     System.out.println(
         "Found "
             + processedTLDsLongestFirst.size()
-            + " TLDs in IANA Root Zone Database at "
+            + " TLDs in IANA TLD Database at "
             + tldFileURL);
   }
 
@@ -223,7 +210,7 @@ public class GenerateJflexTLDMacros {
   private void writeOutput() throws IOException {
     Files.writeString(
         tldListFile,
-        "# Generated from IANA Root Zone Database (gradlew generateTlds)."
+        "# Generated from IANA TLD Database (gradlew generateTlds)."
             + processedTLDsLongestFirst.keySet().stream()
                 .sorted()
                 .collect(Collectors.joining("\n")),
@@ -235,7 +222,7 @@ public class GenerateJflexTLDMacros {
     try (Writer writer =
         new OutputStreamWriter(Files.newOutputStream(jflexMacroFile), StandardCharsets.UTF_8)) {
       writer.write(APACHE_LICENSE);
-      writer.write("// Generated from IANA Root Zone Database <");
+      writer.write("// Generated from IANA TLD Database <");
       writer.write(tldFileURL.toString());
       writer.write(">");
       writer.write(NL);