You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2019/08/14 09:37:00 UTC

svn commit: r1865102 - in /spamassassin/branches/3.4: lib/Mail/SpamAssassin/PerMsgStatus.pm t/uri_text.t

Author: hege
Date: Wed Aug 14 09:37:00 2019
New Revision: 1865102

URL: http://svn.apache.org/viewvc?rev=1865102&view=rev
Log:
Commit all uri parser changes from trunk to 3.4

Modified:
    spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm
    spamassassin/branches/3.4/t/uri_text.t

Modified: spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=1865102&r1=1865101&r2=1865102&view=diff
==============================================================================
--- spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm Wed Aug 14 09:37:00 2019
@@ -268,6 +268,7 @@ sub new {
     'async'             => Mail::SpamAssassin::AsyncLoop->new($main),
     'master_deadline'   => $msg->{master_deadline},  # dflt inherited from msg
     'deadline_exceeded' => 0,  # time limit exceeded, skipping further tests
+    'uri_detail_list'   => { },
   };
 
   dbg("check: pms new, time limit in %.3f s",
@@ -912,14 +913,13 @@ sub get_content_preview {
   my ($self) = @_;
 
   my $str = '';
-  my $ary = $self->get_decoded_stripped_body_text_array();
-  shift @{$ary};                # drop the subject line
+  my @ary = @{$self->get_decoded_stripped_body_text_array()};
+  shift @ary;                # drop the subject line
 
   my $numlines = 3;
-  while (length ($str) < 200 && @{$ary} && $numlines-- > 0) {
-    $str .= shift @{$ary};
+  while (length ($str) < 200 && @ary && $numlines-- > 0) {
+    $str .= shift @ary;
   }
-  undef $ary;
 
   # in case the last line was huge, trim it back to around 200 chars
   local $1;
@@ -2132,12 +2132,12 @@ sub _tbirdurire {
   my ($self) = @_;
 
   # Cached?
-  return $self->{tbirdurire} if $self->{tbirdurire};
+  return $self->{tbirdurire} if exists $self->{tbirdurire};
 
   # a hybrid of tbird and oe's  version of uri parsing
-  my $tbirdstartdelim = '><"\'`,{[(|\s'  . "\x1b";  # The \x1b as per bug 4522
+  my $tbirdstartdelim = '><"\'`,{[(|\s'  . "\x1b\xa0";  # The \x1b as per bug 4522 # \xa0 (nbsp) added 7/2019
   my $iso2022shift = "\x1b" . '\(.';  # bug 4522
-  my $tbirdenddelim = '><"`}\]{[|\s' . "\x1b";  # The \x1b as per bug 4522
+  my $tbirdenddelim = '><"`}\]{[|\s' . "\x1b\xa0";  # The \x1b as per bug 4522 # \xa0 (nbsp) added 7/2019
   my $nonASCII    = '\x80-\xff';
 
   # bug 7100: we allow a comma to delimit the end of an email address because it will never appear in a domain name, and
@@ -2180,18 +2180,18 @@ sub get_uri_list {
   my ($self) = @_;
 
   # use cached answer if available
-  if (defined $self->{uri_list}) {
+  if (exists $self->{uri_list}) {
     return @{$self->{uri_list}};
   }
 
-  my @uris;
+  my %uris;
   # $self->{redirect_num} = 0;
 
-  # get URIs from HTML parsing
+  # get URIs from text/HTML parsing
   while(my($uri, $info) = each %{ $self->get_uri_detail_list() }) {
     if ($info->{cleaned}) {
       foreach (@{$info->{cleaned}}) {
-        push(@uris, $_);
+        $uris{$_} = 1;
 
         # count redirection attempts and log it
         # if (my @http = m{\b(https?:/{0,2})}gi) {
@@ -2201,10 +2201,10 @@ sub get_uri_list {
     }
   }
 
-  $self->{uri_list} = \@uris;
+  @{$self->{uri_list}} = keys %uris;
 # $self->set_tag('URILIST', @uris == 1 ? $uris[0] : \@uris)  if @uris;
 
-  return @uris;
+  return @{$self->{uri_list}};
 }
 
 =item $status->get_uri_detail_list ()
@@ -2214,24 +2214,30 @@ various data about where the URIs were f
 combination of the URIs found in the rendered (decoded and HTML stripped)
 body and the URIs found when parsing the HTML in the message.  Will also
 set $status->{uri_detail_list} (the hash reference as returned by this
-function).  This function will also set $status->{uri_domain_count} (count of
-unique domains).
+function).
 
 The hash format looks something like this:
 
   raw_uri => {
-    types => { a => 1, img => 1, parsed => 1 },
+    types => { a => 1, img => 1, parsed => 1, domainkeys => 1,
+               unlinked => 1, schemeless => 1 },
     cleaned => [ canonicalized_uri ],
     anchor_text => [ "click here", "no click here" ],
     domains => { domain1 => 1, domain2 => 1 },
+    hosts => { host1 => domain1, host2 => domain2 },
   }
 
 C<raw_uri> is whatever the URI was in the message itself
-(http://spamassassin.apache%2Eorg/).
+(http://spamassassin.apache%2Eorg/).  Uris parsed from text will be prefixed
+with scheme if missing (http://, mailto: etc).  HTML uris are as found.
 
-C<types> is a hash of the HTML tags (lowercase) which referenced
-the raw_uri.  I<parsed> is a faked type which specifies that the
-raw_uri was seen in the rendered text.
+C<types> is a hash of the HTML tags (lowercase) which referenced the
+raw_uri.  I<parsed> is a faked type which specifies that the raw_uri was
+seen in the rendered text.  I<domainkeys> is defined when raw_uri was found
+in a DK/DKIM d= field.  I<unlinked> is defined when it's assumed that the MUA
+will not linkify the uri (found in body without scheme or www. prefix).
+I<schemeless> is always added for uris without a scheme, regardless of
+linkifying (e.g. an email address found in body without mailto:).
 
 C<cleaned> is an array of the raw and canonicalized version of the raw_uri
 (http://spamassassin.apache%2Eorg/, https://spamassassin.apache.org/).
@@ -2249,267 +2255,255 @@ as hash keys, with their domain part sto
 sub get_uri_detail_list {
   my ($self) = @_;
 
-  # use cached answer if available
-  if (defined $self->{uri_detail_list}) {
+  # process only once, use unique uri_detail_list_run flag,
+  # in case add_uri_detail_list has already been called
+  if ($self->{uri_detail_list_run}) {
     return $self->{uri_detail_list};
   }
+  $self->{uri_detail_list_run} = 1;
 
   my $timer = $self->{main}->time_method("get_uri_detail_list");
 
-  $self->{uri_domain_count} = 0;
-
-  # do this so we're sure metadata->html is setup
-  my %parsed = map { $_ => 'parsed' } $self->_get_parsed_uri_list();
-
+  # process text parsed uris
+  $self->_process_text_uri_list();
+  # process html uris
+  $self->_process_html_uri_list();
+  # process dkim uris
+  $self->_process_dkim_uri_list();
+
+  return $self->{uri_detail_list};
+}
+
+sub _process_text_uri_list {
+  my ($self) = @_;
+
+  # Use decoded stripped body, which does not contain HTML
+  my $textary = $self->get_decoded_stripped_body_text_array();
+  my $tbirdurire = $self->_tbirdurire;
+  my %seen;
+  my $would_log_uri_all = would_log('dbg', 'uri-all') == 2; # cache
+
+  foreach my $text (@$textary) {
+    # a workaround for [perl #69973] bug:
+    # Invalid and tainted utf-8 char crashes perl 5.10.1 in regexp evaluation
+    # Bug 6225, regexp and string should both be utf8, or none of them;
+    # untainting string also seems to avoid the crash
+    #
+    # Bug 6225: untaint the string in an attempt to work around a perl crash
+    local $_ = untaint_var($text);
+
+    local($1,$2,$3);
+    while (/$tbirdurire/igo) {
+      my $rawuri = $1||$2||$3;
+      my $schost = $4;
+      my $rawtype = defined $1 ? 'scheme' : defined $2 ? 'mail' : 'schemeless';
+      $rawuri =~ s/(^[^(]*)\).*$/$1/;  # as per ThunderBird, ) is an end delimiter if there is no ( preceeding it
+      $rawuri =~ s/[-~!@#^&*()_+=:;\'?,.]*$//; # remove trailing string of punctuations that TBird ignores
+
+      next if exists $seen{$rawuri};
+      $seen{$rawuri} = 1;
+
+      dbg("uri: found rawuri from text ($rawtype): $rawuri") if $would_log_uri_all;
+
+      # Quick ignore if schemeless host not valid
+      next if defined $schost && !is_fqdn_valid($schost, 1);
+
+      # Ignore cid: mid: as they can be mistaken for emails,
+      # these should not be parsed from stripped body in any case.
+      # Example: [cid:image001.png@01D4986E.E3459640]
+      next if $rawuri =~ /^[cm]id:/i;
+
+      # Ignore empty uris
+      next if $rawuri =~ /^\w+:\/{0,2}$/i;
+
+      my $types = {parsed => 1};
+
+      # If it's a hostname that was just sitting out in the
+      # open, without a protocol, and not inside of an HTML tag,
+      # then we should add the proper protocol in front, rather
+      # than using the base URI.
+      my $uri = $rawuri;
+      if ($uri !~ /^(?:https?|ftp|mailto):/i) {
+        if ($uri =~ /^ftp\./i) {
+          $uri = "ftp://$uri";
+        }
+        elsif ($uri =~ /^www\d{0,2}\./i) {
+          $uri = "http://$uri";
+        }
+        elsif (index($uri, '@') != -1) {
+          # Ignore schemeless emails without valid tld, matches crap like
+          # Vi@gra. No urldecoding is done for tld test which is fine.
+          # This is not linkified by MUAs: foo@bar%2Ecom
+          # This IS linkified: foo@bar%2Ebar.com
+          # And this is linkified: foo@bar%2Ecom?foo.com&bar  (woot??)
+          # And this is linkified with Outlook: foo@bar%2Ecom&foo  (woot??)
+          # Don't test when ? or & exists, canonicalizing will handle later.
+          if ($uri !~ tr/?&// && $uri =~ /\@(.*)/) {
+            next unless $self->{main}->{registryboundaries}->is_domain_valid($1);
+          }
+          next if index($uri, '&nbsp;') != -1; # ignore garbled
+          $uri =~ s/^(?:skype|e?-?mail)?:+//i; # strip common misparses
+          # Urldecode now
+          $uri = Mail::SpamAssassin::Util::url_encode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
+          $uri = "mailto:$uri";
+        }
+        else {
+          # some spammers are using unschemed URIs to escape filters
+          # flag that this is a URI that MUAs don't linkify so only use for RBLs
+          # (TODO: why only use for RBLs?? why not uri rules? Use tflags to choose?)
+          $uri = "http://$uri";
+          $types->{unlinked} = 1;
+        }
+        # Mark any of those schemeless
+        $types->{schemeless} = 1;
+      }
+      elsif ($uri =~ /^mailto:/i) { # Schemed mailto: handled different from schemeless
+        # MUAs linkify and urldecode mailto:foo%40bar%2Fcom
+        $uri = Mail::SpamAssassin::Util::url_encode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
+        # Skip unless @ found after decoding, then check tld is valid
+        next unless $uri =~ /\@([^?&>]*)/;
+        next unless $self->{main}->{registryboundaries}->is_domain_valid($1);
+      }
 
-  # This parses of DKIM for URIs disagrees with documentation and bug 6700 votes to disable
-  # this functionality
-  # 2013-01-07
-  # This functionality is re-enabled as a configuration option disabled by
-  # default (bug 7087)
-  # 2014-10-06
+      dbg("uri: parsed uri from text ($rawtype): $uri") if $would_log_uri_all;
 
-  # Look for the domain in DK/DKIM headers
-  if ( $self->{conf}->{parse_dkim_uris} ) {
-    my $dk = join(" ", grep {defined} ( $self->get('DomainKey-Signature',undef),
-                                        $self->get('DKIM-Signature',undef) ));
-    while ($dk =~ /\bd\s*=\s*([^;]+)/g) {
-      my $dom = $1;
-      $dom =~ s/\s+//g;
-      next if !is_fqdn_valid($dom);
-      next if !$self->{main}->{registryboundaries}->is_domain_valid($dom);
-      $parsed{$dom} = 'domainkeys';
+      $self->add_uri_detail_list($uri, $types, 'parsed', 1);
     }
   }
+}
+
+sub _process_html_uri_list {
+  my ($self) = @_;
 
   # get URIs from HTML parsing
   # use the metadata version since $self->{html} may not be setup
   my $detail = $self->{msg}->{metadata}->{html}->{uri_detail} || { };
   $self->{'uri_truncated'} = 1 if $self->{msg}->{metadata}->{html}->{uri_truncated};
 
-  # don't keep dereferencing ...
-  my $redirector_patterns = $self->{conf}->{redirector_patterns};
-
   # canonicalize the HTML parsed URIs
   while(my($uri, $info) = each %{ $detail }) {
-    my @tmp = uri_list_canonicalize($redirector_patterns, $uri);
-    $info->{cleaned} = \@tmp;
-
-    foreach (@tmp) {
-      my($domain,$host) = $self->{main}->{registryboundaries}->uri_to_domain($_);
-      if (defined $host && $host ne '' && !$info->{hosts}->{$host}) {
-        # unstripped full host name as a key, and its domain part as a value
-        $info->{hosts}->{$host} = $domain;
-        if (defined $domain && $domain ne '' && !$info->{domains}->{$domain}) {
-          $info->{domains}->{$domain} = 1;  # stripped to domain boundary
-          $self->{uri_domain_count}++;
-        }
-      }
-    }
-
-    if (would_log('dbg', 'uri') == 2) {
-      dbg("uri: html uri found, $uri");
-      foreach my $nuri (@tmp) {
-        dbg("uri: cleaned html uri, $nuri");
-      }
-      if ($info->{hosts} && $info->{domains}) {
-        for my $host (keys %{$info->{hosts}}) {
-          dbg("uri: html host %s, domain %s", $host, $info->{hosts}->{$host});
+    if ($self->add_uri_detail_list($uri, $info->{types}, 'html', 0)) {
+      # Need also to copy and uniq anchor text
+      if (exists $info->{anchor_text}) {
+        my %seen;
+        foreach (grep { !$seen{$_}++ } @{$info->{anchor_text}}) {
+          push @{$self->{uri_detail_list}->{$uri}->{anchor_text}}, $_;
         }
       }
     }
   }
+}
 
-  # canonicalize the text parsed URIs
-  while (my($uri, $type) = each %parsed) {
-    $detail->{$uri}->{types}->{$type} = 1;
-    my $info = $detail->{$uri};
-
-    my @uris;
-
-    if (!exists $info->{cleaned}) {
-      if ($type eq 'parsed') {
-        @uris = uri_list_canonicalize($redirector_patterns, $uri);
-      }
-      else {
-        @uris = ( $uri );
-      }
-      $info->{cleaned} = \@uris;
+sub _process_dkim_uri_list {
+  my ($self) = @_;
 
-      foreach (@uris) {
-        my($domain,$host) = $self->{main}->{registryboundaries}->uri_to_domain($_);
-        if (defined $host && $host ne '' && !$info->{hosts}->{$host}) {
-          # unstripped full host name as a key, and its domain part as a value
-          $info->{hosts}->{$host} = $domain;
-          if (defined $domain && $domain ne '' && !$info->{domains}->{$domain}){
-            $info->{domains}->{$domain} = 1;
-            $self->{uri_domain_count}++;
-          }
-        }
-      }
-    }
+  # This parsing of DKIM for URIs disagrees with documentation and bug 6700 votes to disable
+  # this functionality
+  # 2013-01-07
+  # This functionality is re-enabled as a configuration option disabled by
+  # default (bug 7087)
+  # 2014-10-06
 
-    if (would_log('dbg', 'uri') == 2) {
-      dbg("uri: parsed uri found of type $type, $uri");
-      foreach my $nuri (@uris) {
-        dbg("uri: cleaned parsed uri, $nuri");
-      }
-      if ($info->{hosts} && $info->{domains}) {
-        for my $host (keys %{$info->{hosts}}) {
-          dbg("uri: parsed host %s, domain %s", $host, $info->{hosts}->{$host});
-        }
-      }
+  # Look for the domain in DK/DKIM headers
+  if ($self->{conf}->{parse_dkim_uris}) {
+    my $dk = join(" ", grep {defined} ( $self->get('DomainKey-Signature',undef ),
+                                        $self->get('DKIM-Signature',undef) ));
+    while ($dk =~ /\bd\s*=\s*([^;]+)/g) {
+      my $d = $1;
+      $d =~ s/\s+//g;
+      # prefix with domainkeys: so it doesn't merge with identical keys
+      $self->add_uri_detail_list("domainkeys:$d",
+        {'domainkeys'=>1, 'nocanon'=>1, 'noclean'=>1},
+        'domainkeys', 1);
     }
   }
+}
 
-  # setup the cache
-  $self->{uri_detail_list} = $detail;
+=item $status->add_uri_detail_list ($raw_uri, $types, $source, $valid_domain)
 
-  return $detail;
-}
+Adds values to the internal uri_detail_list.  When used from Plugins, it is
+best called from parsed_metadata (along with register_method_priority, -10)
+so that other Plugins calling get_uri_detail_list() will see it.
 
-sub _get_parsed_uri_list {
-  my ($self) = @_;
+C<raw_uri> is the URI to be added. The only required parameter.
 
-  # use cached answer if available
-  unless (defined $self->{parsed_uri_list}) {
-    # TVD: we used to use decoded_body which is fine, except then we'll
-    # try parsing URLs out of HTML, which is what the HTML code is going
-    # to do (note: we know the HTML parsing occurs, because we call for the
-    # rendered text which does HTML parsing...)  trying to get URLs out of
-    # HTML w/out parsing causes issues, so let's not do it.
-    # also, if we allow $textary to be passed in, we need to invalidate
-    # the cache first. fyi.
-    my $textary = $self->get_decoded_stripped_body_text_array();
-    my $redirector_patterns = $self->{conf}->{redirector_patterns};
-
-    my ($rulename, $pat, @uris);
-    my $text;
-    my $tbirdurire = $self->_tbirdurire;
-    my %seen;
-    my $would_log_uri_all = would_log('dbg', 'uri-all') == 2; # cache
-
-    foreach my $entry (@$textary) {
-
-      # a workaround for [perl #69973] bug:
-      # Invalid and tainted utf-8 char crashes perl 5.10.1 in regexp evaluation
-      # Bug 6225, regexp and string should both be utf8, or none of them;
-      # untainting string also seems to avoid the crash
-      #
-      # Bug 6225: untaint the string in an attempt to work around a perl crash
-      local $_ = untaint_var($entry);
+C<types> is an optional hash reference, contents are added to
+uri_detail_list->{types} (see get_uri_detail_list for known keys). 
+I<parsed> is the default if no hash is given.  I<nocanon> does not run
+uri_list_canonicalize (no redirector, uri fixing).  I<noclean> skips adding
+uri_detail_list->{cleaned}, so it would not be used in "uri" rule checks,
+but domain/hosts would still be used for URIBL/RBL purposes.
 
-      local($1,$2,$3);
-      while (/$tbirdurire/igo) {
-        my $rawuri = $1||$2||$3;
-        my $schost = $4;
-        my $rawtype = defined $1 ? 'scheme' : defined $2 ? 'mail' : 'schemeless';
-        $rawuri =~ s/(^[^(]*)\).*$/$1/;  # as per ThunderBird, ) is an end delimiter if there is no ( preceeding it
-        $rawuri =~ s/[-~!@#^&*()_+=:;\'?,.]*$//; # remove trailing string of punctuations that TBird ignores
-
-        next if exists $seen{$rawuri};
-        $seen{$rawuri} = 1;
-
-        dbg("uri: found rawuri from text ($rawtype): $rawuri") if $would_log_uri_all;
-
-        # Quick ignore if schemeless host not valid
-        next if defined $schost && !is_fqdn_valid($schost);
-
-        # Ignore cid: mid: as they can be mistaken for emails,
-        # these should not be parsed from stripped body in any case.
-        # Example: [cid:image001.png@01D4986E.E3459640]
-        next if $rawuri =~ /^[cm]id:/i;
-
-        # Ignore empty uris
-        next if $rawuri =~ /^\w+:\/{0,2}$/i;
-
-        # skip if there is '..' in the hostname portion of the URI, something we can't catch in the general URI regexp
-        next if $rawuri =~ m{^(?:(?:https?|ftp|mailto):(?://)?)?(?:[^\@/?#]*\@)?[^/?#:]*\.\.}i;
-
-        # If it's a hostname that was just sitting out in the
-        # open, without a protocol, and not inside of an HTML tag,
-        # the we should add the proper protocol in front, rather
-        # than using the base URI.
-        my $uri = $rawuri;
-        my $rblonly;
-        if ($uri !~ /^(?:https?|ftp|mailto|javascript|file):/i) {
-          if ($uri =~ /^ftp\./i) {
-            $uri = "ftp://$uri";
-          }
-          elsif ($uri =~ /^www\d{0,2}\./i) {
-            $uri = "http://$uri";
-          }
-          elsif (index($uri, '@') != -1) {
-            # Ignore schemeless emails without valid tld, matches crap like
-            # Vi@gra. No urldecoding is done for tld test which is fine.
-            # This is not linkified by MUAs: foo@bar%2Ecom
-            # This IS linkified: foo@bar%2Ebar.com
-            # And this is linkified: foo@bar%2Ecom?foo.com&bar  (woot??)
-            # And this is linkified with Outlook: foo@bar%2Ecom&foo  (woot??)
-            # Don't test when ? or & exists, canonicalizing will handle later.
-            if ($uri !~ tr/?&// && $uri =~ /\@(.*)/) {
-              next unless $self->{main}->{registryboundaries}->is_domain_valid($1);
-            }
-            next if index($uri, '&nbsp;') != -1; # ignore garbled
-            $uri =~ s/^(?:skype|e?-?mail)?:+//i; # strip common misparses
-            $uri = "mailto:$uri";
-          }
-          else {
-            # some spammers are using unschemed URIs to escape filters
-            $rblonly = 1;    # flag that this is a URI that MUAs don't linkify so only use for RBLs
-            $uri = "http://$uri";
-          }
-        }
+C<source> is an optional simple string, only used for debug logging purposes
+to identify where uri originates from (default: "parsed").
 
-        if ($uri =~ /^mailto:/i) { # Schemed mailto: handled different from schemeless
-          # MUAs linkify and urldecode mailto:foo%40bar%2Fcom
-          $uri = Mail::SpamAssassin::Util::url_encode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
-          # Skip unless @ found after decoding, then check tld is valid
-          next unless $uri =~ /\@([^?&>]*)/;
-          next unless $self->{main}->{registryboundaries}->is_domain_valid($1);
-          # SA 3.4 legacy code continues
-          my $domuri = $self->{main}->{registryboundaries}->uri_to_domain($uri);
-          next unless $domuri;
-          push (@uris, $rawuri);
-          push (@uris, $uri) unless ($rawuri eq $uri);
-        }
+C<valid_domain> is an optional boolean (0/1).  If true, uri will not be
+added unless hostname/domain is in valid format and contains a valid TLD. 
+(default: 0)
 
-        next unless ($uri =~/^(?:https?|ftp):/i);  # at this point only valid if one or the other of these
+=cut
 
-        my @tmp = uri_list_canonicalize($redirector_patterns, $uri);
-        my $goodurifound = 0;
-        foreach my $cleanuri (@tmp) {
-          my $domain = $self->{main}->{registryboundaries}->uri_to_domain($cleanuri);
-          if ($domain) {
-            # bug 5780: Stop after domain to avoid FP, but do that after all deobfuscation of urlencoding and redirection
-            if ($rblonly) {
-              local $1;
-              $cleanuri =~ s/^(https?:\/\/[^:\/]+).*$/$1/i;
-            }
-            push (@uris, $cleanuri);
-            $goodurifound = 1;
-          }
-        }
-        next unless $goodurifound;
-        push @uris, $rawuri unless $rblonly;
-      }
-    }
+sub add_uri_detail_list {
+  my ($self, $uri, $types, $source, $valid_domain) = @_;
+
+  $types = {'parsed' => 1} unless defined $types;
+  $source ||= 'parsed';
 
+  my (%domains, %hosts, %cleaned);
+  my $udl = $self->{uri_detail_list};
+
+  dbg("uri: canonicalizing $source uri: $uri");
+
+  my @uris;
+  if ($types->{nocanon}) {
+    push @uris, $uri;
+  } else {
+    @uris = uri_list_canonicalize($self->{conf}->{redirector_patterns}, $uri);
+  }
+  foreach my $cleanuri (@uris) {
     # Make sure all the URIs are nice and short
-    foreach my $uri ( @uris ) {
-      if (length $uri > MAX_URI_LENGTH) {
-        $self->{'uri_truncated'} = 1;
-        $uri = substr $uri, 0, MAX_URI_LENGTH;
-      }
+    if (length($cleanuri) > MAX_URI_LENGTH) {
+      $self->{'uri_truncated'} = 1;
+      $cleanuri = substr($cleanuri, 0, MAX_URI_LENGTH);
+    }
+    dbg("uri: cleaned uri: $cleanuri");
+    $cleaned{$cleanuri} = 1;
+    my ($domain, $host) = $self->{main}->{registryboundaries}->uri_to_domain($cleanuri);
+    if (defined $domain) {
+      dbg("uri: added host: $host domain: $domain");
+      $domains{$domain} = 1;
+      $hosts{$host} = $domain;
     }
+  }
+
+  # Bail out if no good uri found
+  return unless %cleaned;
+
+  # Bail out if no domains/hosts found?
+  return if $valid_domain && !%domains;
 
-    # setup the cache and return
-    $self->{parsed_uri_list} = \@uris;
+  # Merge cleaned
+  if (!$types->{noclean}) {
+    if ($udl->{$uri}->{cleaned}) {
+      $cleaned{$_} = 1 foreach (@{$udl->{$uri}->{cleaned}});
+    }
+    @{$udl->{$uri}->{cleaned}} = keys %cleaned;
   }
 
-  return @{$self->{parsed_uri_list}};
+  # Domains/hosts (there might not be any)
+  $udl->{$uri}->{domains}->{$_} = 1 foreach keys %domains;
+  $udl->{$uri}->{hosts}->{$_} = $hosts{$_} foreach keys %hosts;
+
+  # Types
+  $udl->{$uri}->{types}->{$_} = 1 foreach keys %$types;
+
+  # Invalidate uri_list cache
+  delete $self->{uri_list};
+
+  return 1;
 }
 
+
 ###########################################################################
 
 sub ensure_rules_are_complete {
@@ -2526,13 +2520,15 @@ sub ensure_rules_are_complete {
 
     my $start = time;
     $self->harvest_until_rule_completes($r);
-    my $elapsed = time - $start;
+    my $elapsed = sprintf "%.2f", time - $start;
 
     if (!$self->is_rule_complete($r)) {
       dbg("rules: rule $r is still not complete; exited early?");
     }
     elsif ($elapsed > 0) {
-      info("rules: $r took $elapsed seconds to complete, for $metarule");
+      my $txt = "rules: $r took $elapsed seconds to complete, for $metarule";
+      # Info only if something took over 1 sec to wait, prevent log flood
+      if ($elapsed >= 1) { info($txt); } else { dbg($txt); }
     }
   }
 }

Modified: spamassassin/branches/3.4/t/uri_text.t
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.4/t/uri_text.t?rev=1865102&r1=1865101&r2=1865102&view=diff
==============================================================================
--- spamassassin/branches/3.4/t/uri_text.t (original)
+++ spamassassin/branches/3.4/t/uri_text.t Wed Aug 14 09:37:00 2019
@@ -20,7 +20,7 @@ if (-e 'test_dir') {            # runnin
 use strict;
 use lib '.'; use lib 't';
 use SATest; sa_t_init("uri_text");
-use Test::More tests => 685;
+use Test::More tests => 168;
 use Mail::SpamAssassin;
 use vars qw(%patterns %anti_patterns);
 
@@ -140,7 +140,7 @@ b@www.vohWais0.com	mailto:b\@www\.vohWai
 c.www.moSaoga8.com	www\.moSaoga8\.com
 
 xyz..geifoza0.com	!geifoza0
-xyz.geifoza1.com/..xyz	xyz\.geifoza1\.com	!xyz\.geifoza1\.com/\.\.xyz
+xyz.geifoza1.com/..xyz	xyz\.geifoza1\.com/\.\.xyz
 xyz.geifoza2.CoM	xyz\.geifoza2\.CoM
 http://xyz..geifoza3.com	!geifoza3
 http://xyz.geifoza4.com/..xyz	xyz\.geifoza4\.com/\.\.xyz
@@ -155,12 +155,12 @@ http://www.example.com?xa1kaLuo			\?xa1k
 http://www.example.com#xa1kaLup			\#xa1kaLup
 http://www.lap7thob.com/			^http://www\.lap7thob\.com/$
 
-www.phoh1Koh.com/			^www\.phoh1Koh\.com/$
-www.Tar4caeg.com:80			http://www\.Tar4caeg\.com:80
-www.Coo4mowe.com:80/foo/foo.html	^www\.Coo4mowe\.com:80/foo/foo\.html
-www.Nee2quae.com:80/			^www\.Nee2quae\.com:80/$
-www.foo@Qii3mafs.com:80			http://www\.foo\@Qii3mafs\.com:80$
-www.foo:bar@Qii3maft.com:80		http://www\.foo:bar\@Qii3maft\.com:80$
+www.phoh1Koh.com/			^http://www\.phoh1Koh\.com/$
+www.Tar4caeg.com:80			^http://www\.Tar4caeg\.com:80
+www.Coo4mowe.com:80/foo/foo.html	^http://www\.Coo4mowe\.com:80/foo/foo\.html
+www.Nee2quae.com:80/			^http://www\.Nee2quae\.com:80/$
+www.foo@Qii3mafs.com:80			^http://www\.foo\@Qii3mafs\.com:80$
+www.foo:bar@Qii3maft.com:80		^http://www\.foo:bar\@Qii3maft\.com:80$
 
 HAETEI3D.com	HAETEI3D
 CUK3VEIZ.us	CUK3VEIZ
@@ -181,24 +181,24 @@ ba5keinu.NZ	ba5keinu
 chae2shi.CN	chae2shi
 roo7kiey.TW	roo7kiey
 
-www.Chiew0ch.COM	www\.Chiew0ch\.COM
-www.thohY2qu.US		www\.thohY2qu\.US
-www.teiP7gei.BIZ	www\.teiP7gei\.BIZ
-www.xohThai8.INFO	www\.xohThai8\.INFO
-www.haik7Ram.NET	www\.haik7Ram\.NET
-www.Quaes3se.ORG	www\.Quaes3se\.ORG
-www.Chai6tah.WS		www\.Chai6tah\.WS
-www.Thuoth1y.NAME	www\.Thuoth1y\.NAME
-www.Chieb8ge.TV		www\.Chieb8ge\.TV
-WWW.quus4Rok.cc		WWW\.quus4Rok\.cc
-WWW.maic6Hei.de		WWW\.maic6Hei\.de
-WWW.he4Hiize.jp		WWW\.he4Hiize\.jp
-WWW.Soh1toob.be		WWW\.Soh1toob\.be
-WWW.chahMee5.at		WWW\.chahMee5\.at
-WWW.peepooN0.uk		WWW\.peepooN0\.uk
-WWW.Kiox3phi.nz		WWW\.Kiox3phi\.nz
-WWW.jong3Xou.cn		WWW\.jong3Xou\.cn
-WWW.waeShoe0.tw		WWW\.waeShoe0\.tw
+www.Chiew0ch.COM	^http://www\.Chiew0ch\.COM
+www.thohY2qu.US		^http://www\.thohY2qu\.US
+www.teiP7gei.BIZ	^http://www\.teiP7gei\.BIZ
+www.xohThai8.INFO	^http://www\.xohThai8\.INFO
+www.haik7Ram.NET	^http://www\.haik7Ram\.NET
+www.Quaes3se.ORG	^http://www\.Quaes3se\.ORG
+www.Chai6tah.WS		^http://www\.Chai6tah\.WS
+www.Thuoth1y.NAME	^http://www\.Thuoth1y\.NAME
+www.Chieb8ge.TV		^http://www\.Chieb8ge\.TV
+WWW.quus4Rok.cc		^http://WWW\.quus4Rok\.cc
+WWW.maic6Hei.de		^http://WWW\.maic6Hei\.de
+WWW.he4Hiize.jp		^http://WWW\.he4Hiize\.jp
+WWW.Soh1toob.be		^http://WWW\.Soh1toob\.be
+WWW.chahMee5.at		^http://WWW\.chahMee5\.at
+WWW.peepooN0.uk		^http://WWW\.peepooN0\.uk
+WWW.Kiox3phi.nz		^http://WWW\.Kiox3phi\.nz
+WWW.jong3Xou.cn		^http://WWW\.jong3Xou\.cn
+WWW.waeShoe0.tw		^http://WWW\.waeShoe0\.tw
 
 invalid_ltd.notword	!invalid_tld
 invalid_ltd.invalid	!invalid_tld
@@ -210,6 +210,20 @@ www.invalid_ltd.invalid	!invalid_tld
 www.invalid_ltd.xyzzy	!invalid_tld
 www.invalid_ltd.co.zz	!invalid_tld
 
+# underscores allowed, but not at 1st-2nd level
+uctest.zyb2n2ef.c_om	!zyb2n2ef
+uctest.zyb2_n2ef.com	!zyb2_n2ef
+uc_test.u8uwe8qu.com	^http://uc_test\.u8uwe8qu\.com
+
+# invalid hostnames with -
+http://-sdfisiz2e.com	!sdfisiz2e
+ESRYnSeM7s-.com		!ESRYnSeM7s
+foo-.CgPcASgHNa.com	!CgPcASgHNa
+
+# valid hostnames with -
+www.eZxdy-TWA4z.com	^http://www\.eZxdy-TWA4z\.com
+www-3.WV7jujA10G.com	^http://www-3\.WV7jujA10G\.com
+
 command.com		command\.com
 cmd.exe			!cmd\.exe
 
@@ -219,70 +233,69 @@ com.foo.web		!com\.foo\.web
 
 # IPs for www.yahoo.com
 66.94.230.32		!66\.94\.230\.32
-http://66.94.230.33	http://66\.94\.230\.33
-http://1113515555	http://66\.94\.230\.35
+http://66.94.230.33	^http://66\.94\.230\.33
+http://1113515555	^http://66\.94\.230\.35
 
-gooboo4k@xieyohy0.com		mailto:gooboo4k\@xieyohy0\.com
-mailto:baeb1fai@quo6puyo.com	mailto:baeb1fai\@quo6puyo\.com
+gooboo4k@xieyohy0.com		^mailto:gooboo4k\@xieyohy0\.com
+mailto:baeb1fai@quo6puyo.com	^mailto:baeb1fai\@quo6puyo\.com
 
-http://www.luzoop5k.com		http://www\.luzoop5k\.com
-https://www.luzoop5k.com	https://www\.luzoop5k\.com
-ftp://www.luzoop5k.com		ftp://www\.luzoop5k\.com
-
-Mailto:aaeb1fai@quo6puyo.com	Mailto:aaeb1fai\@quo6puyo\.com
-Http://www.auzoop5k.com		Http://www\.auzoop5k\.com
-Https://www.auzoop5k.com	Https://www\.auzoop5k\.com
-Ftp://www.auzoop5k.com		Ftp://www\.auzoop5k\.com
+http://www.luzoop5k.com		^http://www\.luzoop5k\.com
+https://www.luzoop5k.com	^https://www\.luzoop5k\.com
+ftp://www.luzoop5k.com		^ftp://www\.luzoop5k\.com
+
+Mailto:aaeb1fai@quo6puyo.com	^Mailto:aaeb1fai\@quo6puyo\.com
+Http://www.auzoop5k.com		^Http://www\.auzoop5k\.com
+Https://www.auzoop5k.com	^Https://www\.auzoop5k\.com
+Ftp://www.auzoop5k.com		^Ftp://www\.auzoop5k\.com
 
-mailto:www.luzoop5k.com		!mailto:www\.luzoop5k\.com
+mailto:www.luzoop5k.com		!^mailto:www\.luzoop5k\.com
 # no longer accept file: scheme
-file://www.luzoop5k.com		!file://www\.luzoop5k\.com
+file://www.luzoop5k.com		!^file://www\.luzoop5k\.com
 
 # //<user>:<password>@<host>:<port>/<url-path>
-http://user:pass@jiefeet4.com:80/x/y	http://user:pass\@jiefeet4\.com:80/x/y
+http://user:pass@jiefeet4.com:80/x/y	^http://user:pass\@jiefeet4\.com:80/x/y
 
 www.liy8quei:80				www\.liy8quei\.com
 www.veibi6cu:443			!veibi6cu
-puahi8si.com:80				!puahi8si\.com:80
-chop8tan.com:443			!chop8tan\.com:443
-www.puahi9si.com:80		puahi9si\.com:80
-www.chop9tan.com:443	chop9tan\.com:443
-
-ftp://name@su5queib.ca//etc/motd	ftp://name\@su5queib\.ca//etc/motd
-ftp://name@faikaj4t.dom/%2Fetc/motd	!ftp://name\@faikaj4t\.dom//etc/motd
-ftp://name@faikaj4t.com/%2Fetc/motd	ftp://name\@faikaj4t\.com//etc/motd
+www.puahi9si.com:80			puahi9si\.com:80
+www.puahi9si2.com:80			puahi9si2\.com$
+www.chop9tan.com:443			chop9tan\.com:443
+
+ftp://name@su5queib.ca//etc/motd	^ftp://name\@su5queib\.ca//etc/motd
+ftp://name@faikaj4t.dom/%2Fetc/motd	!^ftp://name\@faikaj4t\.dom//etc/motd
+ftp://name@faikaj4t.com/%2Fetc/motd	^ftp://name\@faikaj4t\.com//etc/motd
 
-keyword:sportscar		!sportscar
+keyword:sportscar			!sportscar
 
 # questionable tests
-mailto://cah3neun@thaihe4d.com		mailto://cah3neun\@thaihe4d\.com
+mailto://cah3neun@thaihe4d.com		^mailto://cah3neun\@thaihe4d\.com
 
 mailto://jicu8vah@another@jicu8vah	!jicu8vah\@another\@jicu8vah
 baeb1fai@@example.com			!baeb1fai\@\@example\.com
-mailto://yie6xuna		!yie6xuna
+mailto://yie6xuna			!yie6xuna
 mailto://yie6xuna@nottld		!yie6xuna\@nottld
 
 <se...@verper.com>	!^http://.*addr\.com\@verper\.com
-<se...@verper.com>	mailto:sentto-4934-foo=addr\.com\@verper\.com
+<se...@verper.com>	^mailto:sentto-4934-foo=addr\.com\@verper\.com
 
 http://foo23498.com/{ESC}(B	^http://foo23498\.com/$
 {ESC}(Bhttp://foo23499.com/	^http://foo23499\.com/$
 http://foo23500.com{ESC}(B/	^http://foo23500\.com(?:/?)$
 
 M0"-AE/9Y.KN:_0D2F:95^H*:I,8	!9Y\.KN
->delimtest1.com	http://delimtest1\.com
-<delimtest2.com	http://delimtest2\.com
-"delimtest3.com	http://delimtest3\.com
-\delimtest4.com	http://delimtest4\.com
-'delimtest5.com	http://delimtest5\.com
-`delimtest6.com	http://delimtest6\.com
-,delimtest7.com	http://delimtest7\.com
-{delimtest8.com	http://delimtest8\.com
-[delimtest9.com	http://delimtest9\.com
-(delimtest10.com	http://delimtest10\.com
-|delimtest11.com	http://delimtest11\.com
- delimtest12.com	http://delimtest12\.com
-ignorethishttp://delimtest13.org	http://delimtest13\.org
+>delimtest1.com	^http://delimtest1\.com
+<delimtest2.com	^http://delimtest2\.com
+"delimtest3.com	^http://delimtest3\.com
+\delimtest4.com	^http://delimtest4\.com
+'delimtest5.com	^http://delimtest5\.com
+`delimtest6.com	^http://delimtest6\.com
+,delimtest7.com	^http://delimtest7\.com
+{delimtest8.com	^http://delimtest8\.com
+[delimtest9.com	^http://delimtest9\.com
+(delimtest10.com	^http://delimtest10\.com
+|delimtest11.com	^http://delimtest11\.com
+ delimtest12.com	^http://delimtest12\.com
+ignorethishttp://delimtest13.org	^http://delimtest13\.org
 donotignorethiswww.delimtest14.com	donotignorethiswww\.delimtest14\.com
 <www.delimtest15.com/foo-~!@#^&*()_+=:;'?,.xyz-~!@#^&*()_+=:;'?,.>	^http://www\.delimtest15\.com/foo-~!\@#\^&\*\(\)_\+=:;'\?,\.xyz$
 .....www.delimtest16.com..........	^http://www\.delimtest16\.com$
@@ -293,7 +306,7 @@ donotignorethiswww.delimtest14.com	donot
 # emails with a comma at the end
 test@delimtest20.com,stuff stuff		delimtest20\.com
 
-# check all the TLDs (might as well be thorough)
+# check some TLDs, no point testing all here
 # the inactive TLDs have negative checks
 
 # first confirm that it will not match on not a TLD
@@ -301,275 +314,12 @@ example.invalid	!^http://example\.invali
 example.zzf	!^http://example\.zzf$
 
 example.ac	^http://example\.ac$
-example.ad	^http://example\.ad$
-example.ae	^http://example\.ae$
-example.aero	^http://example\.aero$
-example.af	^http://example\.af$
-example.ag	^http://example\.ag$
-example.ai	^http://example\.ai$
-example.al	^http://example\.al$
-example.am	^http://example\.am$
-example.an	!^http://example\.an$
-example.ao	^http://example\.ao$
-example.aq	^http://example\.aq$
-example.ar	^http://example\.ar$
-example.arpa	^http://example\.arpa$
-example.as	^http://example\.as$
-example.asia	^http://example\.asia$
-example.at	^http://example\.at$
-example.au	^http://example\.au$
-example.aw	^http://example\.aw$
-example.ax	^http://example\.ax$
-example.az	^http://example\.az$
-example.ba	^http://example\.ba$
-example.bb	^http://example\.bb$
-example.bd	^http://example\.bd$
-example.be	^http://example\.be$
-example.bf	^http://example\.bf$
-example.bg	^http://example\.bg$
-example.bh	^http://example\.bh$
-example.bi	^http://example\.bi$
-example.biz	^http://example\.biz$
-example.bj	^http://example\.bj$
-example.bm	^http://example\.bm$
-example.bn	^http://example\.bn$
-example.bo	^http://example\.bo$
-example.br	^http://example\.br$
-example.bs	^http://example\.bs$
-example.bt	^http://example\.bt$
-example.bv	^http://example\.bv$
-example.bw	^http://example\.bw$
-example.by	^http://example\.by$
-example.bz	^http://example\.bz$
-example.ca	^http://example\.ca$
-example.cat	^http://example\.cat$
-example.cc	^http://example\.cc$
-example.cd	^http://example\.cd$
-example.cf	^http://example\.cf$
-example.cg	^http://example\.cg$
-example.ch	^http://example\.ch$
-example.ci	^http://example\.ci$
-example.ck	^http://example\.ck$
-example.cl	^http://example\.cl$
-example.cm	^http://example\.cm$
-example.cn	^http://example\.cn$
-example.co	^http://example\.co$
-example.com	^http://example\.com$
-example.coop	^http://example\.coop$
-example.cr	^http://example\.cr$
-example.cu	^http://example\.cu$
-example.cv	^http://example\.cv$
-example.cx	^http://example\.cx$
-example.cy	^http://example\.cy$
-example.cz	^http://example\.cz$
-example.de	^http://example\.de$
-example.dj	^http://example\.dj$
-example.dk	^http://example\.dk$
-example.dm	^http://example\.dm$
-example.do	^http://example\.do$
-example.dz	^http://example\.dz$
-example.ec	^http://example\.ec$
-example.edu	^http://example\.edu$
-example.ee	^http://example\.ee$
-example.eg	^http://example\.eg$
-example.er	^http://example\.er$
-example.es	^http://example\.es$
-example.et	^http://example\.et$
 example.eu	^http://example\.eu$
 example.fi	^http://example\.fi$
-example.fj	^http://example\.fj$
-example.fk	^http://example\.fk$
-example.fm	^http://example\.fm$
-example.fo	^http://example\.fo$
-example.fr	^http://example\.fr$
-example.ga	^http://example\.ga$
-example.gb	^http://example\.gb$
-example.gd	^http://example\.gd$
-example.ge	^http://example\.ge$
-example.gf	^http://example\.gf$
-example.gg	^http://example\.gg$
-example.gh	^http://example\.gh$
-example.gi	^http://example\.gi$
-example.gl	^http://example\.gl$
-example.gm	^http://example\.gm$
-example.gn	^http://example\.gn$
-example.gov	^http://example\.gov$
-example.gp	^http://example\.gp$
-example.gq	^http://example\.gq$
-example.gr	^http://example\.gr$
-example.gs	^http://example\.gs$
-example.gt	^http://example\.gt$
-example.gu	^http://example\.gu$
-example.gw	^http://example\.gw$
-example.gy	^http://example\.gy$
-example.hk	^http://example\.hk$
-example.hm	^http://example\.hm$
-example.hn	^http://example\.hn$
-example.hr	^http://example\.hr$
-example.ht	^http://example\.ht$
-example.hu	^http://example\.hu$
-example.id	^http://example\.id$
-example.ie	^http://example\.ie$
-example.il	^http://example\.il$
-example.im	^http://example\.im$
-example.in	^http://example\.in$
-example.info	^http://example\.info$
-example.int	^http://example\.int$
-example.io	^http://example\.io$
-example.iq	^http://example\.iq$
-example.ir	^http://example\.ir$
-example.is	^http://example\.is$
-example.it	^http://example\.it$
-example.je	^http://example\.je$
-example.jm	^http://example\.jm$
-example.jo	^http://example\.jo$
-example.jobs	^http://example\.jobs$
-example.jp	^http://example\.jp$
-example.ke	^http://example\.ke$
-example.kg	^http://example\.kg$
-example.kh	^http://example\.kh$
-example.ki	^http://example\.ki$
-example.km	^http://example\.km$
-example.kn	^http://example\.kn$
-example.kp	^http://example\.kp$
-example.kr	^http://example\.kr$
-example.kw	^http://example\.kw$
-example.ky	^http://example\.ky$
-example.kz	^http://example\.kz$
-example.la	^http://example\.la$
-example.lb	^http://example\.lb$
-example.lc	^http://example\.lc$
-example.li	^http://example\.li$
-example.lk	^http://example\.lk$
-example.lr	^http://example\.lr$
-example.ls	^http://example\.ls$
-example.lt	^http://example\.lt$
-example.lu	^http://example\.lu$
-example.lv	^http://example\.lv$
-example.ly	^http://example\.ly$
-example.ma	^http://example\.ma$
-example.mc	^http://example\.mc$
-example.md	^http://example\.md$
-example.me	^http://example\.me$
-example.mg	^http://example\.mg$
-example.mh	^http://example\.mh$
-example.mil	^http://example\.mil$
-example.mk	^http://example\.mk$
-example.ml	^http://example\.ml$
-example.mm	^http://example\.mm$
-example.mn	^http://example\.mn$
-example.mo	^http://example\.mo$
-example.mobi	^http://example\.mobi$
-example.mp	^http://example\.mp$
-example.mq	^http://example\.mq$
-example.mr	^http://example\.mr$
-example.ms	^http://example\.ms$
-example.mt	^http://example\.mt$
-example.mu	^http://example\.mu$
-example.museum	^http://example\.museum$
-example.mv	^http://example\.mv$
-example.mw	^http://example\.mw$
-example.mx	^http://example\.mx$
-example.my	^http://example\.my$
-example.mz	^http://example\.mz$
-example.na	^http://example\.na$
-example.name	^http://example\.name$
-example.nc	^http://example\.nc$
-example.ne	^http://example\.ne$
-example.net	^http://example\.net$
-example.nf	^http://example\.nf$
-example.ng	^http://example\.ng$
-example.ni	^http://example\.ni$
-example.nl	^http://example\.nl$
-example.no	^http://example\.no$
-example.np	^http://example\.np$
-example.nr	^http://example\.nr$
-example.nu	^http://example\.nu$
-example.nz	^http://example\.nz$
-example.om	^http://example\.om$
-example.org	^http://example\.org$
-example.pa	^http://example\.pa$
-example.pe	^http://example\.pe$
-example.pf	^http://example\.pf$
-example.pg	^http://example\.pg$
-example.ph	^http://example\.ph$
-example.pk	^http://example\.pk$
-example.pl	^http://example\.pl$
-example.pm	^http://example\.pm$
-example.pn	^http://example\.pn$
-example.pr	^http://example\.pr$
-example.pro	^http://example\.pro$
-example.ps	^http://example\.ps$
-example.pt	^http://example\.pt$
-example.pw	^http://example\.pw$
-example.py	^http://example\.py$
-example.qa	^http://example\.qa$
-example.re	^http://example\.re$
-example.ro	^http://example\.ro$
-example.rs	^http://example\.rs$
-example.ru	^http://example\.ru$
-example.rw	^http://example\.rw$
-example.sa	^http://example\.sa$
-example.sb	^http://example\.sb$
-example.sc	^http://example\.sc$
-example.sd	^http://example\.sd$
-example.se	^http://example\.se$
-example.sg	^http://example\.sg$
-example.sh	^http://example\.sh$
-example.si	^http://example\.si$
-example.sj	^http://example\.sj$
-example.sk	^http://example\.sk$
-example.sl	^http://example\.sl$
-example.sm	^http://example\.sm$
-example.sn	^http://example\.sn$
-example.so	^http://example\.so$
-example.sr	^http://example\.sr$
-example.st	^http://example\.st$
-example.su	^http://example\.su$
-example.sv	^http://example\.sv$
-example.sy	^http://example\.sy$
-example.sz	^http://example\.sz$
-example.tc	^http://example\.tc$
-example.td	^http://example\.td$
-example.tel	^http://example\.tel$
-example.tf	^http://example\.tf$
-example.tg	^http://example\.tg$
-example.th	^http://example\.th$
-example.tj	^http://example\.tj$
-example.tk	^http://example\.tk$
-example.tl	^http://example\.tl$
-example.tm	^http://example\.tm$
-example.tn	^http://example\.tn$
-example.to	^http://example\.to$
 example.tp	!^http://example\.tp$
-example.tr	^http://example\.tr$
 example.travel	^http://example\.travel$
-example.tt	^http://example\.tt$
-example.tv	^http://example\.tv$
-example.tw	^http://example\.tw$
-example.tz	^http://example\.tz$
-example.ua	^http://example\.ua$
-example.ug	^http://example\.ug$
-example.uk	^http://example\.uk$
 example.um	!^http://example\.um$
 example.us	^http://example\.us$
-example.uy	^http://example\.uy$
-example.uz	^http://example\.uz$
-example.va	^http://example\.va$
-example.vc	^http://example\.vc$
-example.ve	^http://example\.ve$
-example.vg	^http://example\.vg$
-example.vi	^http://example\.vi$
-example.vn	^http://example\.vn$
-example.vu	^http://example\.vu$
-example.wf	^http://example\.wf$
-example.ws	^http://example\.ws$
-example.ye	^http://example\.ye$
-example.yt	^http://example\.yt$
-example.yu	!^http://example\.yu$
-example.za	^http://example\.za$
-example.zm	^http://example\.zm$
-example.zw	^http://example\.zw$
 
 # with www. prefix tests a different table of TLDs
 
@@ -577,272 +327,12 @@ www.example.foo	^http://www\.example\.fo
 www.example.zzf	!^http://www\.example\.zzf$
 
 www.example.ac	^http://www\.example\.ac$
-www.example.ad	^http://www\.example\.ad$
-www.example.ae	^http://www\.example\.ae$
-www.example.aero	^http://www\.example\.aero$
-www.example.af	^http://www\.example\.af$
-www.example.ag	^http://www\.example\.ag$
-www.example.ai	^http://www\.example\.ai$
-www.example.al	^http://www\.example\.al$
-www.example.am	^http://www\.example\.am$
 www.example.an	!^http://www\.example\.an$
 www.example.ao	^http://www\.example\.ao$
-www.example.aq	^http://www\.example\.aq$
-www.example.ar	^http://www\.example\.ar$
 www.example.arpa	^http://www\.example\.arpa$
-www.example.as	^http://www\.example\.as$
-www.example.asia	^http://www\.example\.asia$
-www.example.at	^http://www\.example\.at$
-www.example.au	^http://www\.example\.au$
-www.example.aw	^http://www\.example\.aw$
-www.example.ax	^http://www\.example\.ax$
-www.example.az	^http://www\.example\.az$
-www.example.ba	^http://www\.example\.ba$
-www.example.bb	^http://www\.example\.bb$
-www.example.bd	^http://www\.example\.bd$
-www.example.be	^http://www\.example\.be$
-www.example.bf	^http://www\.example\.bf$
-www.example.bg	^http://www\.example\.bg$
-www.example.bh	^http://www\.example\.bh$
-www.example.bi	^http://www\.example\.bi$
-www.example.biz	^http://www\.example\.biz$
-www.example.bj	^http://www\.example\.bj$
-www.example.bm	^http://www\.example\.bm$
-www.example.bn	^http://www\.example\.bn$
-www.example.bo	^http://www\.example\.bo$
-www.example.br	^http://www\.example\.br$
-www.example.bs	^http://www\.example\.bs$
-www.example.bt	^http://www\.example\.bt$
-www.example.bv	^http://www\.example\.bv$
-www.example.bw	^http://www\.example\.bw$
-www.example.by	^http://www\.example\.by$
-www.example.bz	^http://www\.example\.bz$
-www.example.ca	^http://www\.example\.ca$
-www.example.cat	^http://www\.example\.cat$
-www.example.cc	^http://www\.example\.cc$
-www.example.cd	^http://www\.example\.cd$
-www.example.cf	^http://www\.example\.cf$
-www.example.cg	^http://www\.example\.cg$
-www.example.ch	^http://www\.example\.ch$
 www.example.ci	^http://www\.example\.ci$
-www.example.ck	^http://www\.example\.ck$
-www.example.cl	^http://www\.example\.cl$
-www.example.cm	^http://www\.example\.cm$
-www.example.cn	^http://www\.example\.cn$
-www.example.co	^http://www\.example\.co$
-www.example.com	^http://www\.example\.com$
-www.example.coop	^http://www\.example\.coop$
-www.example.cr	^http://www\.example\.cr$
-www.example.cu	^http://www\.example\.cu$
-www.example.cv	^http://www\.example\.cv$
-www.example.cx	^http://www\.example\.cx$
-www.example.cy	^http://www\.example\.cy$
-www.example.cz	^http://www\.example\.cz$
-www.example.de	^http://www\.example\.de$
-www.example.dj	^http://www\.example\.dj$
-www.example.dk	^http://www\.example\.dk$
-www.example.dm	^http://www\.example\.dm$
-www.example.do	^http://www\.example\.do$
-www.example.dz	^http://www\.example\.dz$
-www.example.ec	^http://www\.example\.ec$
 www.example.edu	^http://www\.example\.edu$
-www.example.ee	^http://www\.example\.ee$
-www.example.eg	^http://www\.example\.eg$
-www.example.er	^http://www\.example\.er$
-www.example.es	^http://www\.example\.es$
-www.example.et	^http://www\.example\.et$
-www.example.eu	^http://www\.example\.eu$
-www.example.fi	^http://www\.example\.fi$
-www.example.fj	^http://www\.example\.fj$
-www.example.fk	^http://www\.example\.fk$
-www.example.fm	^http://www\.example\.fm$
-www.example.fo	^http://www\.example\.fo$
-www.example.fr	^http://www\.example\.fr$
-www.example.ga	^http://www\.example\.ga$
-www.example.gb	^http://www\.example\.gb$
-www.example.gd	^http://www\.example\.gd$
-www.example.ge	^http://www\.example\.ge$
-www.example.gf	^http://www\.example\.gf$
-www.example.gg	^http://www\.example\.gg$
-www.example.gh	^http://www\.example\.gh$
-www.example.gi	^http://www\.example\.gi$
-www.example.gl	^http://www\.example\.gl$
-www.example.gm	^http://www\.example\.gm$
-www.example.gn	^http://www\.example\.gn$
-www.example.gov	^http://www\.example\.gov$
-www.example.gp	^http://www\.example\.gp$
-www.example.gq	^http://www\.example\.gq$
-www.example.gr	^http://www\.example\.gr$
-www.example.gs	^http://www\.example\.gs$
-www.example.gt	^http://www\.example\.gt$
-www.example.gu	^http://www\.example\.gu$
-www.example.gw	^http://www\.example\.gw$
-www.example.gy	^http://www\.example\.gy$
-www.example.hk	^http://www\.example\.hk$
-www.example.hm	^http://www\.example\.hm$
-www.example.hn	^http://www\.example\.hn$
-www.example.hr	^http://www\.example\.hr$
-www.example.ht	^http://www\.example\.ht$
-www.example.hu	^http://www\.example\.hu$
-www.example.id	^http://www\.example\.id$
-www.example.ie	^http://www\.example\.ie$
-www.example.il	^http://www\.example\.il$
-www.example.im	^http://www\.example\.im$
-www.example.in	^http://www\.example\.in$
-www.example.info	^http://www\.example\.info$
-www.example.int	^http://www\.example\.int$
-www.example.io	^http://www\.example\.io$
-www.example.iq	^http://www\.example\.iq$
-www.example.ir	^http://www\.example\.ir$
-www.example.is	^http://www\.example\.is$
-www.example.it	^http://www\.example\.it$
-www.example.je	^http://www\.example\.je$
-www.example.jm	^http://www\.example\.jm$
-www.example.jo	^http://www\.example\.jo$
-www.example.jobs	^http://www\.example\.jobs$
-www.example.jp	^http://www\.example\.jp$
-www.example.ke	^http://www\.example\.ke$
-www.example.kg	^http://www\.example\.kg$
-www.example.kh	^http://www\.example\.kh$
-www.example.ki	^http://www\.example\.ki$
-www.example.km	^http://www\.example\.km$
-www.example.kn	^http://www\.example\.kn$
-www.example.kp	^http://www\.example\.kp$
-www.example.kr	^http://www\.example\.kr$
-www.example.kw	^http://www\.example\.kw$
-www.example.ky	^http://www\.example\.ky$
-www.example.kz	^http://www\.example\.kz$
-www.example.la	^http://www\.example\.la$
-www.example.lb	^http://www\.example\.lb$
-www.example.lc	^http://www\.example\.lc$
-www.example.li	^http://www\.example\.li$
-www.example.lk	^http://www\.example\.lk$
-www.example.lr	^http://www\.example\.lr$
-www.example.ls	^http://www\.example\.ls$
-www.example.lt	^http://www\.example\.lt$
-www.example.lu	^http://www\.example\.lu$
-www.example.lv	^http://www\.example\.lv$
-www.example.ly	^http://www\.example\.ly$
-www.example.ma	^http://www\.example\.ma$
-www.example.mc	^http://www\.example\.mc$
-www.example.md	^http://www\.example\.md$
-www.example.me	^http://www\.example\.me$
-www.example.mg	^http://www\.example\.mg$
-www.example.mh	^http://www\.example\.mh$
-www.example.mil	^http://www\.example\.mil$
-www.example.mk	^http://www\.example\.mk$
-www.example.ml	^http://www\.example\.ml$
-www.example.mm	^http://www\.example\.mm$
-www.example.mn	^http://www\.example\.mn$
-www.example.mo	^http://www\.example\.mo$
-www.example.mobi	^http://www\.example\.mobi$
-www.example.mp	^http://www\.example\.mp$
-www.example.mq	^http://www\.example\.mq$
-www.example.mr	^http://www\.example\.mr$
-www.example.ms	^http://www\.example\.ms$
-www.example.mt	^http://www\.example\.mt$
-www.example.mu	^http://www\.example\.mu$
-www.example.museum	^http://www\.example\.museum$
-www.example.mv	^http://www\.example\.mv$
-www.example.mw	^http://www\.example\.mw$
-www.example.mx	^http://www\.example\.mx$
-www.example.my	^http://www\.example\.my$
-www.example.mz	^http://www\.example\.mz$
-www.example.na	^http://www\.example\.na$
-www.example.name	^http://www\.example\.name$
-www.example.nc	^http://www\.example\.nc$
-www.example.ne	^http://www\.example\.ne$
-www.example.net	^http://www\.example\.net$
-www.example.nf	^http://www\.example\.nf$
-www.example.ng	^http://www\.example\.ng$
-www.example.ni	^http://www\.example\.ni$
-www.example.nl	^http://www\.example\.nl$
-www.example.no	^http://www\.example\.no$
-www.example.np	^http://www\.example\.np$
-www.example.nr	^http://www\.example\.nr$
-www.example.nu	^http://www\.example\.nu$
-www.example.nz	^http://www\.example\.nz$
-www.example.om	^http://www\.example\.om$
-www.example.org	^http://www\.example\.org$
-www.example.pa	^http://www\.example\.pa$
-www.example.pe	^http://www\.example\.pe$
-www.example.pf	^http://www\.example\.pf$
-www.example.pg	^http://www\.example\.pg$
-www.example.ph	^http://www\.example\.ph$
-www.example.pk	^http://www\.example\.pk$
-www.example.pl	^http://www\.example\.pl$
-www.example.pm	^http://www\.example\.pm$
-www.example.pn	^http://www\.example\.pn$
-www.example.pr	^http://www\.example\.pr$
-www.example.pro	^http://www\.example\.pro$
-www.example.ps	^http://www\.example\.ps$
-www.example.pt	^http://www\.example\.pt$
-www.example.pw	^http://www\.example\.pw$
-www.example.py	^http://www\.example\.py$
-www.example.qa	^http://www\.example\.qa$
-www.example.re	^http://www\.example\.re$
-www.example.ro	^http://www\.example\.ro$
-www.example.rs	^http://www\.example\.rs$
-www.example.ru	^http://www\.example\.ru$
-www.example.rw	^http://www\.example\.rw$
-www.example.sa	^http://www\.example\.sa$
-www.example.sb	^http://www\.example\.sb$
-www.example.sc	^http://www\.example\.sc$
-www.example.sd	^http://www\.example\.sd$
-www.example.se	^http://www\.example\.se$
-www.example.sg	^http://www\.example\.sg$
-www.example.sh	^http://www\.example\.sh$
-www.example.si	^http://www\.example\.si$
-www.example.sj	^http://www\.example\.sj$
-www.example.sk	^http://www\.example\.sk$
-www.example.sl	^http://www\.example\.sl$
-www.example.sm	^http://www\.example\.sm$
-www.example.sn	^http://www\.example\.sn$
-www.example.so	^http://www\.example\.so$
-www.example.sr	^http://www\.example\.sr$
-www.example.st	^http://www\.example\.st$
-www.example.su	^http://www\.example\.su$
-www.example.sv	^http://www\.example\.sv$
-www.example.sy	^http://www\.example\.sy$
-www.example.sz	^http://www\.example\.sz$
-www.example.tc	^http://www\.example\.tc$
-www.example.td	^http://www\.example\.td$
-www.example.tel	^http://www\.example\.tel$
-www.example.tf	^http://www\.example\.tf$
-www.example.tg	^http://www\.example\.tg$
-www.example.th	^http://www\.example\.th$
-www.example.tj	^http://www\.example\.tj$
-www.example.tk	^http://www\.example\.tk$
-www.example.tl	^http://www\.example\.tl$
-www.example.tm	^http://www\.example\.tm$
-www.example.tn	^http://www\.example\.tn$
-www.example.to	^http://www\.example\.to$
 www.example.tp	!^http://www\.example\.tp$
-www.example.tr	^http://www\.example\.tr$
-www.example.travel	^http://www\.example\.travel$
-www.example.tt	^http://www\.example\.tt$
-www.example.tv	^http://www\.example\.tv$
-www.example.tw	^http://www\.example\.tw$
-www.example.tz	^http://www\.example\.tz$
-www.example.ua	^http://www\.example\.ua$
-www.example.ug	^http://www\.example\.ug$
-www.example.uk	^http://www\.example\.uk$
-www.example.um	!^http://www\.example\.um$
-www.example.us	^http://www\.example\.us$
-www.example.uy	^http://www\.example\.uy$
-www.example.uz	^http://www\.example\.uz$
-www.example.va	^http://www\.example\.va$
-www.example.vc	^http://www\.example\.vc$
-www.example.ve	^http://www\.example\.ve$
-www.example.vg	^http://www\.example\.vg$
-www.example.vi	^http://www\.example\.vi$
-www.example.vn	^http://www\.example\.vn$
-www.example.vu	^http://www\.example\.vu$
-www.example.wf	^http://www\.example\.wf$
 www.example.ws	^http://www\.example\.ws$
-www.example.ye	^http://www\.example\.ye$
-www.example.yt	^http://www\.example\.yt$
 www.example.yu	!^http://www\.example\.yu$
 www.example.za	^http://www\.example\.za$
-www.example.zm	^http://www\.example\.zm$
-www.example.zw	^http://www\.example\.zw$