You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2019/08/14 09:37:00 UTC
svn commit: r1865102 - in /spamassassin/branches/3.4:
lib/Mail/SpamAssassin/PerMsgStatus.pm t/uri_text.t
Author: hege
Date: Wed Aug 14 09:37:00 2019
New Revision: 1865102
URL: http://svn.apache.org/viewvc?rev=1865102&view=rev
Log:
Commit all uri parser changes from trunk to 3.4
Modified:
spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm
spamassassin/branches/3.4/t/uri_text.t
Modified: spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=1865102&r1=1865101&r2=1865102&view=diff
==============================================================================
--- spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm Wed Aug 14 09:37:00 2019
@@ -268,6 +268,7 @@ sub new {
'async' => Mail::SpamAssassin::AsyncLoop->new($main),
'master_deadline' => $msg->{master_deadline}, # dflt inherited from msg
'deadline_exceeded' => 0, # time limit exceeded, skipping further tests
+ 'uri_detail_list' => { },
};
dbg("check: pms new, time limit in %.3f s",
@@ -912,14 +913,13 @@ sub get_content_preview {
my ($self) = @_;
my $str = '';
- my $ary = $self->get_decoded_stripped_body_text_array();
- shift @{$ary}; # drop the subject line
+ my @ary = @{$self->get_decoded_stripped_body_text_array()};
+ shift @ary; # drop the subject line
my $numlines = 3;
- while (length ($str) < 200 && @{$ary} && $numlines-- > 0) {
- $str .= shift @{$ary};
+ while (length ($str) < 200 && @ary && $numlines-- > 0) {
+ $str .= shift @ary;
}
- undef $ary;
# in case the last line was huge, trim it back to around 200 chars
local $1;
@@ -2132,12 +2132,12 @@ sub _tbirdurire {
my ($self) = @_;
# Cached?
- return $self->{tbirdurire} if $self->{tbirdurire};
+ return $self->{tbirdurire} if exists $self->{tbirdurire};
# a hybrid of tbird and oe's version of uri parsing
- my $tbirdstartdelim = '><"\'`,{[(|\s' . "\x1b"; # The \x1b as per bug 4522
+ my $tbirdstartdelim = '><"\'`,{[(|\s' . "\x1b\xa0"; # The \x1b as per bug 4522 # \xa0 (nbsp) added 7/2019
my $iso2022shift = "\x1b" . '\(.'; # bug 4522
- my $tbirdenddelim = '><"`}\]{[|\s' . "\x1b"; # The \x1b as per bug 4522
+ my $tbirdenddelim = '><"`}\]{[|\s' . "\x1b\xa0"; # The \x1b as per bug 4522 # \xa0 (nbsp) added 7/2019
my $nonASCII = '\x80-\xff';
# bug 7100: we allow a comma to delimit the end of an email address because it will never appear in a domain name, and
@@ -2180,18 +2180,18 @@ sub get_uri_list {
my ($self) = @_;
# use cached answer if available
- if (defined $self->{uri_list}) {
+ if (exists $self->{uri_list}) {
return @{$self->{uri_list}};
}
- my @uris;
+ my %uris;
# $self->{redirect_num} = 0;
- # get URIs from HTML parsing
+ # get URIs from text/HTML parsing
while(my($uri, $info) = each %{ $self->get_uri_detail_list() }) {
if ($info->{cleaned}) {
foreach (@{$info->{cleaned}}) {
- push(@uris, $_);
+ $uris{$_} = 1;
# count redirection attempts and log it
# if (my @http = m{\b(https?:/{0,2})}gi) {
@@ -2201,10 +2201,10 @@ sub get_uri_list {
}
}
- $self->{uri_list} = \@uris;
+ @{$self->{uri_list}} = keys %uris;
# $self->set_tag('URILIST', @uris == 1 ? $uris[0] : \@uris) if @uris;
- return @uris;
+ return @{$self->{uri_list}};
}
=item $status->get_uri_detail_list ()
@@ -2214,24 +2214,30 @@ various data about where the URIs were f
combination of the URIs found in the rendered (decoded and HTML stripped)
body and the URIs found when parsing the HTML in the message. Will also
set $status->{uri_detail_list} (the hash reference as returned by this
-function). This function will also set $status->{uri_domain_count} (count of
-unique domains).
+function).
The hash format looks something like this:
raw_uri => {
- types => { a => 1, img => 1, parsed => 1 },
+ types => { a => 1, img => 1, parsed => 1, domainkeys => 1,
+ unlinked => 1, schemeless => 1 },
cleaned => [ canonicalized_uri ],
anchor_text => [ "click here", "no click here" ],
domains => { domain1 => 1, domain2 => 1 },
+ hosts => { host1 => domain1, host2 => domain2 },
}
C<raw_uri> is whatever the URI was in the message itself
-(http://spamassassin.apache%2Eorg/).
+(http://spamassassin.apache%2Eorg/). Uris parsed from text will be prefixed
+with scheme if missing (http://, mailto: etc). HTML uris are as found.
-C<types> is a hash of the HTML tags (lowercase) which referenced
-the raw_uri. I<parsed> is a faked type which specifies that the
-raw_uri was seen in the rendered text.
+C<types> is a hash of the HTML tags (lowercase) which referenced the
+raw_uri. I<parsed> is a faked type which specifies that the raw_uri was
+seen in the rendered text. I<domainkeys> is defined when raw_uri was found
+from DK/DKIM d= field. I<unlinked> is defined when it's assumed that MUA
+will not linkify uri (found in body without scheme or www. prefix).
+I<schemeless> is always added for uris without scheme, regardless of
+linkifying (i.e. email address found in body without mailto:).
C<cleaned> is an array of the raw and canonicalized version of the raw_uri
(http://spamassassin.apache%2Eorg/, https://spamassassin.apache.org/).
@@ -2249,267 +2255,255 @@ as hash keys, with their domain part sto
sub get_uri_detail_list {
my ($self) = @_;
- # use cached answer if available
- if (defined $self->{uri_detail_list}) {
+ # process only once, use unique uri_detail_list_run flag,
+ # in case add_uri_detail_list has already been called
+ if ($self->{uri_detail_list_run}) {
return $self->{uri_detail_list};
}
+ $self->{uri_detail_list_run} = 1;
my $timer = $self->{main}->time_method("get_uri_detail_list");
- $self->{uri_domain_count} = 0;
-
- # do this so we're sure metadata->html is setup
- my %parsed = map { $_ => 'parsed' } $self->_get_parsed_uri_list();
-
+ # process text parsed uris
+ $self->_process_text_uri_list();
+ # process html uris
+ $self->_process_html_uri_list();
+ # process dkim uris
+ $self->_process_dkim_uri_list();
+
+ return $self->{uri_detail_list};
+}
+
+sub _process_text_uri_list {
+ my ($self) = @_;
+
+ # Use decoded stripped body, which does not contain HTML
+ my $textary = $self->get_decoded_stripped_body_text_array();
+ my $tbirdurire = $self->_tbirdurire;
+ my %seen;
+ my $would_log_uri_all = would_log('dbg', 'uri-all') == 2; # cache
+
+ foreach my $text (@$textary) {
+ # a workaround for [perl #69973] bug:
+ # Invalid and tainted utf-8 char crashes perl 5.10.1 in regexp evaluation
+ # Bug 6225, regexp and string should both be utf8, or none of them;
+ # untainting string also seems to avoid the crash
+ #
+ # Bug 6225: untaint the string in an attempt to work around a perl crash
+ local $_ = untaint_var($text);
+
+ local($1,$2,$3);
+ while (/$tbirdurire/igo) {
+ my $rawuri = $1||$2||$3;
+ my $schost = $4;
+ my $rawtype = defined $1 ? 'scheme' : defined $2 ? 'mail' : 'schemeless';
+ $rawuri =~ s/(^[^(]*)\).*$/$1/; # as per ThunderBird, ) is an end delimiter if there is no ( preceeding it
+ $rawuri =~ s/[-~!@#^&*()_+=:;\'?,.]*$//; # remove trailing string of punctuations that TBird ignores
+
+ next if exists $seen{$rawuri};
+ $seen{$rawuri} = 1;
+
+ dbg("uri: found rawuri from text ($rawtype): $rawuri") if $would_log_uri_all;
+
+ # Quick ignore if schemeless host not valid
+ next if defined $schost && !is_fqdn_valid($schost, 1);
+
+ # Ignore cid: mid: as they can be mistaken for emails,
+ # these should not be parsed from stripped body in any case.
+ # Example: [cid:image001.png@01D4986E.E3459640]
+ next if $rawuri =~ /^[cm]id:/i;
+
+ # Ignore empty uris
+ next if $rawuri =~ /^\w+:\/{0,2}$/i;
+
+ my $types = {parsed => 1};
+
+ # If it's a hostname that was just sitting out in the
+ # open, without a protocol, and not inside of an HTML tag,
+ # then we should add the proper protocol in front, rather
+ # than using the base URI.
+ my $uri = $rawuri;
+ if ($uri !~ /^(?:https?|ftp|mailto):/i) {
+ if ($uri =~ /^ftp\./i) {
+ $uri = "ftp://$uri";
+ }
+ elsif ($uri =~ /^www\d{0,2}\./i) {
+ $uri = "http://$uri";
+ }
+ elsif (index($uri, '@') != -1) {
+ # Ignore schemeless emails without valid tld, matches crap like
+ # Vi@gra. No urldecoding is done for tld test which is fine.
+ # This is not linkified by MUAs: foo@bar%2Ecom
+ # This IS linkified: foo@bar%2Ebar.com
+ # And this is linkified: foo@bar%2Ecom?foo.com&bar (woot??)
+ # And this is linkified with Outlook: foo@bar%2Ecom&foo (woot??)
+ # Don't test when ? or & exists, canonicalizing will handle later.
+ if ($uri !~ tr/?&// && $uri =~ /\@(.*)/) {
+ next unless $self->{main}->{registryboundaries}->is_domain_valid($1);
+ }
+ next if index($uri, ' ') != -1; # ignore garbled
+ $uri =~ s/^(?:skype|e?-?mail)?:+//i; # strip common misparses
+ # Urldecode now
+ $uri = Mail::SpamAssassin::Util::url_encode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
+ $uri = "mailto:$uri";
+ }
+ else {
+ # some spammers are using unschemed URIs to escape filters
+ # flag that this is a URI that MUAs don't linkify so only use for RBLs
+ # (TODO: why only use for RBLs?? why not uri rules? Use tflags to choose?)
+ $uri = "http://$uri";
+ $types->{unlinked} = 1;
+ }
+ # Mark any of those schemeless
+ $types->{schemeless} = 1;
+ }
+ elsif ($uri =~ /^mailto:/i) { # Schemed mailto: handled different from schemeless
+ # MUAs linkify and urldecode mailto:foo%40bar%2Fcom
+ $uri = Mail::SpamAssassin::Util::url_encode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
+ # Skip unless @ found after decoding, then check tld is valid
+ next unless $uri =~ /\@([^?&>]*)/;
+ next unless $self->{main}->{registryboundaries}->is_domain_valid($1);
+ }
- # This parses of DKIM for URIs disagrees with documentation and bug 6700 votes to disable
- # this functionality
- # 2013-01-07
- # This functionality is re-enabled as a configuration option disabled by
- # default (bug 7087)
- # 2014-10-06
+ dbg("uri: parsed uri from text ($rawtype): $uri") if $would_log_uri_all;
- # Look for the domain in DK/DKIM headers
- if ( $self->{conf}->{parse_dkim_uris} ) {
- my $dk = join(" ", grep {defined} ( $self->get('DomainKey-Signature',undef),
- $self->get('DKIM-Signature',undef) ));
- while ($dk =~ /\bd\s*=\s*([^;]+)/g) {
- my $dom = $1;
- $dom =~ s/\s+//g;
- next if !is_fqdn_valid($dom);
- next if !$self->{main}->{registryboundaries}->is_domain_valid($dom);
- $parsed{$dom} = 'domainkeys';
+ $self->add_uri_detail_list($uri, $types, 'parsed', 1);
}
}
+}
+
+sub _process_html_uri_list {
+ my ($self) = @_;
# get URIs from HTML parsing
# use the metadata version since $self->{html} may not be setup
my $detail = $self->{msg}->{metadata}->{html}->{uri_detail} || { };
$self->{'uri_truncated'} = 1 if $self->{msg}->{metadata}->{html}->{uri_truncated};
- # don't keep dereferencing ...
- my $redirector_patterns = $self->{conf}->{redirector_patterns};
-
# canonicalize the HTML parsed URIs
while(my($uri, $info) = each %{ $detail }) {
- my @tmp = uri_list_canonicalize($redirector_patterns, $uri);
- $info->{cleaned} = \@tmp;
-
- foreach (@tmp) {
- my($domain,$host) = $self->{main}->{registryboundaries}->uri_to_domain($_);
- if (defined $host && $host ne '' && !$info->{hosts}->{$host}) {
- # unstripped full host name as a key, and its domain part as a value
- $info->{hosts}->{$host} = $domain;
- if (defined $domain && $domain ne '' && !$info->{domains}->{$domain}) {
- $info->{domains}->{$domain} = 1; # stripped to domain boundary
- $self->{uri_domain_count}++;
- }
- }
- }
-
- if (would_log('dbg', 'uri') == 2) {
- dbg("uri: html uri found, $uri");
- foreach my $nuri (@tmp) {
- dbg("uri: cleaned html uri, $nuri");
- }
- if ($info->{hosts} && $info->{domains}) {
- for my $host (keys %{$info->{hosts}}) {
- dbg("uri: html host %s, domain %s", $host, $info->{hosts}->{$host});
+ if ($self->add_uri_detail_list($uri, $info->{types}, 'html', 0)) {
+ # Need also to copy and uniq anchor text
+ if (exists $info->{anchor_text}) {
+ my %seen;
+ foreach (grep { !$seen{$_}++ } @{$info->{anchor_text}}) {
+ push @{$self->{uri_detail_list}->{$uri}->{anchor_text}}, $_;
}
}
}
}
+}
- # canonicalize the text parsed URIs
- while (my($uri, $type) = each %parsed) {
- $detail->{$uri}->{types}->{$type} = 1;
- my $info = $detail->{$uri};
-
- my @uris;
-
- if (!exists $info->{cleaned}) {
- if ($type eq 'parsed') {
- @uris = uri_list_canonicalize($redirector_patterns, $uri);
- }
- else {
- @uris = ( $uri );
- }
- $info->{cleaned} = \@uris;
+sub _process_dkim_uri_list {
+ my ($self) = @_;
- foreach (@uris) {
- my($domain,$host) = $self->{main}->{registryboundaries}->uri_to_domain($_);
- if (defined $host && $host ne '' && !$info->{hosts}->{$host}) {
- # unstripped full host name as a key, and its domain part as a value
- $info->{hosts}->{$host} = $domain;
- if (defined $domain && $domain ne '' && !$info->{domains}->{$domain}){
- $info->{domains}->{$domain} = 1;
- $self->{uri_domain_count}++;
- }
- }
- }
- }
+ # This parsing of DKIM for URIs disagrees with documentation and bug 6700 votes to disable
+ # this functionality
+ # 2013-01-07
+ # This functionality is re-enabled as a configuration option disabled by
+ # default (bug 7087)
+ # 2014-10-06
- if (would_log('dbg', 'uri') == 2) {
- dbg("uri: parsed uri found of type $type, $uri");
- foreach my $nuri (@uris) {
- dbg("uri: cleaned parsed uri, $nuri");
- }
- if ($info->{hosts} && $info->{domains}) {
- for my $host (keys %{$info->{hosts}}) {
- dbg("uri: parsed host %s, domain %s", $host, $info->{hosts}->{$host});
- }
- }
+ # Look for the domain in DK/DKIM headers
+ if ($self->{conf}->{parse_dkim_uris}) {
+ my $dk = join(" ", grep {defined} ( $self->get('DomainKey-Signature',undef ),
+ $self->get('DKIM-Signature',undef) ));
+ while ($dk =~ /\bd\s*=\s*([^;]+)/g) {
+ my $d = $1;
+ $d =~ s/\s+//g;
+ # prefix with domainkeys: so it doesn't merge with identical keys
+ $self->add_uri_detail_list("domainkeys:$d",
+ {'domainkeys'=>1, 'nocanon'=>1, 'noclean'=>1},
+ 'domainkeys', 1);
}
}
+}
- # setup the cache
- $self->{uri_detail_list} = $detail;
+=item $status->add_uri_detail_list ($raw_uri, $types, $source, $valid_domain)
- return $detail;
-}
+Adds values to internal uri_detail_list. When used from Plugins, recommended
+to call from parsed_metadata (along with register_method_priority, -10) so
+other Plugins calling get_uri_detail_list() will see it.
-sub _get_parsed_uri_list {
- my ($self) = @_;
+C<raw_uri> is the URI to be added. The only required parameter.
- # use cached answer if available
- unless (defined $self->{parsed_uri_list}) {
- # TVD: we used to use decoded_body which is fine, except then we'll
- # try parsing URLs out of HTML, which is what the HTML code is going
- # to do (note: we know the HTML parsing occurs, because we call for the
- # rendered text which does HTML parsing...) trying to get URLs out of
- # HTML w/out parsing causes issues, so let's not do it.
- # also, if we allow $textary to be passed in, we need to invalidate
- # the cache first. fyi.
- my $textary = $self->get_decoded_stripped_body_text_array();
- my $redirector_patterns = $self->{conf}->{redirector_patterns};
-
- my ($rulename, $pat, @uris);
- my $text;
- my $tbirdurire = $self->_tbirdurire;
- my %seen;
- my $would_log_uri_all = would_log('dbg', 'uri-all') == 2; # cache
-
- foreach my $entry (@$textary) {
-
- # a workaround for [perl #69973] bug:
- # Invalid and tainted utf-8 char crashes perl 5.10.1 in regexp evaluation
- # Bug 6225, regexp and string should both be utf8, or none of them;
- # untainting string also seems to avoid the crash
- #
- # Bug 6225: untaint the string in an attempt to work around a perl crash
- local $_ = untaint_var($entry);
+C<types> is an optional hash reference, contents are added to
+uri_detail_list->{types} (see get_uri_detail_list for known keys).
+I<parsed> is the default if no hash is given. I<nocanon> does not run
+uri_list_canonicalize (no redirector, uri fixing). I<noclean> skips adding
+uri_detail_list->{cleaned}, so it would not be used in "uri" rule checks,
+but domain/hosts would still be used for URIBL/RBL purposes.
- local($1,$2,$3);
- while (/$tbirdurire/igo) {
- my $rawuri = $1||$2||$3;
- my $schost = $4;
- my $rawtype = defined $1 ? 'scheme' : defined $2 ? 'mail' : 'schemeless';
- $rawuri =~ s/(^[^(]*)\).*$/$1/; # as per ThunderBird, ) is an end delimiter if there is no ( preceeding it
- $rawuri =~ s/[-~!@#^&*()_+=:;\'?,.]*$//; # remove trailing string of punctuations that TBird ignores
-
- next if exists $seen{$rawuri};
- $seen{$rawuri} = 1;
-
- dbg("uri: found rawuri from text ($rawtype): $rawuri") if $would_log_uri_all;
-
- # Quick ignore if schemeless host not valid
- next if defined $schost && !is_fqdn_valid($schost);
-
- # Ignore cid: mid: as they can be mistaken for emails,
- # these should not be parsed from stripped body in any case.
- # Example: [cid:image001.png@01D4986E.E3459640]
- next if $rawuri =~ /^[cm]id:/i;
-
- # Ignore empty uris
- next if $rawuri =~ /^\w+:\/{0,2}$/i;
-
- # skip if there is '..' in the hostname portion of the URI, something we can't catch in the general URI regexp
- next if $rawuri =~ m{^(?:(?:https?|ftp|mailto):(?://)?)?(?:[^\@/?#]*\@)?[^/?#:]*\.\.}i;
-
- # If it's a hostname that was just sitting out in the
- # open, without a protocol, and not inside of an HTML tag,
- # the we should add the proper protocol in front, rather
- # than using the base URI.
- my $uri = $rawuri;
- my $rblonly;
- if ($uri !~ /^(?:https?|ftp|mailto|javascript|file):/i) {
- if ($uri =~ /^ftp\./i) {
- $uri = "ftp://$uri";
- }
- elsif ($uri =~ /^www\d{0,2}\./i) {
- $uri = "http://$uri";
- }
- elsif (index($uri, '@') != -1) {
- # Ignore schemeless emails without valid tld, matches crap like
- # Vi@gra. No urldecoding is done for tld test which is fine.
- # This is not linkified by MUAs: foo@bar%2Ecom
- # This IS linkified: foo@bar%2Ebar.com
- # And this is linkified: foo@bar%2Ecom?foo.com&bar (woot??)
- # And this is linkified with Outlook: foo@bar%2Ecom&foo (woot??)
- # Don't test when ? or & exists, canonicalizing will handle later.
- if ($uri !~ tr/?&// && $uri =~ /\@(.*)/) {
- next unless $self->{main}->{registryboundaries}->is_domain_valid($1);
- }
- next if index($uri, ' ') != -1; # ignore garbled
- $uri =~ s/^(?:skype|e?-?mail)?:+//i; # strip common misparses
- $uri = "mailto:$uri";
- }
- else {
- # some spammers are using unschemed URIs to escape filters
- $rblonly = 1; # flag that this is a URI that MUAs don't linkify so only use for RBLs
- $uri = "http://$uri";
- }
- }
+C<source> is an optional simple string, only used for debug logging purposes
+to identify where uri originates from (default: "parsed").
- if ($uri =~ /^mailto:/i) { # Schemed mailto: handled different from schemeless
- # MUAs linkify and urldecode mailto:foo%40bar%2Fcom
- $uri = Mail::SpamAssassin::Util::url_encode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
- # Skip unless @ found after decoding, then check tld is valid
- next unless $uri =~ /\@([^?&>]*)/;
- next unless $self->{main}->{registryboundaries}->is_domain_valid($1);
- # SA 3.4 legacy code continues
- my $domuri = $self->{main}->{registryboundaries}->uri_to_domain($uri);
- next unless $domuri;
- push (@uris, $rawuri);
- push (@uris, $uri) unless ($rawuri eq $uri);
- }
+C<valid_domain> is an optional boolean (0/1). If true, uri will not be
+added unless hostname/domain is in valid format and contains a valid TLD.
+(default: 0)
- next unless ($uri =~/^(?:https?|ftp):/i); # at this point only valid if one or the other of these
+=cut
- my @tmp = uri_list_canonicalize($redirector_patterns, $uri);
- my $goodurifound = 0;
- foreach my $cleanuri (@tmp) {
- my $domain = $self->{main}->{registryboundaries}->uri_to_domain($cleanuri);
- if ($domain) {
- # bug 5780: Stop after domain to avoid FP, but do that after all deobfuscation of urlencoding and redirection
- if ($rblonly) {
- local $1;
- $cleanuri =~ s/^(https?:\/\/[^:\/]+).*$/$1/i;
- }
- push (@uris, $cleanuri);
- $goodurifound = 1;
- }
- }
- next unless $goodurifound;
- push @uris, $rawuri unless $rblonly;
- }
- }
+sub add_uri_detail_list {
+ my ($self, $uri, $types, $source, $valid_domain) = @_;
+
+ $types = {'parsed' => 1} unless defined $types;
+ $source ||= 'parsed';
+ my (%domains, %hosts, %cleaned);
+ my $udl = $self->{uri_detail_list};
+
+ dbg("uri: canonicalizing $source uri: $uri");
+
+ my @uris;
+ if ($types->{nocanon}) {
+ push @uris, $uri;
+ } else {
+ @uris = uri_list_canonicalize($self->{conf}->{redirector_patterns}, $uri);
+ }
+ foreach my $cleanuri (@uris) {
# Make sure all the URIs are nice and short
- foreach my $uri ( @uris ) {
- if (length $uri > MAX_URI_LENGTH) {
- $self->{'uri_truncated'} = 1;
- $uri = substr $uri, 0, MAX_URI_LENGTH;
- }
+ if (length($cleanuri) > MAX_URI_LENGTH) {
+ $self->{'uri_truncated'} = 1;
+ $cleanuri = substr($cleanuri, 0, MAX_URI_LENGTH);
+ }
+ dbg("uri: cleaned uri: $cleanuri");
+ $cleaned{$cleanuri} = 1;
+ my ($domain, $host) = $self->{main}->{registryboundaries}->uri_to_domain($cleanuri);
+ if (defined $domain) {
+ dbg("uri: added host: $host domain: $domain");
+ $domains{$domain} = 1;
+ $hosts{$host} = $domain;
}
+ }
+
+ # Bail out if no good uri found
+ return unless %cleaned;
+
+ # Bail out if no domains/hosts found?
+ return if $valid_domain && !%domains;
- # setup the cache and return
- $self->{parsed_uri_list} = \@uris;
+ # Merge cleaned
+ if (!$types->{noclean}) {
+ if ($udl->{$uri}->{cleaned}) {
+ $cleaned{$_} = 1 foreach (@{$udl->{$uri}->{cleaned}});
+ }
+ @{$udl->{$uri}->{cleaned}} = keys %cleaned;
}
- return @{$self->{parsed_uri_list}};
+ # Domains/hosts (there might not be any)
+ $udl->{$uri}->{domains}->{$_} = 1 foreach keys %domains;
+ $udl->{$uri}->{hosts}->{$_} = $hosts{$_} foreach keys %hosts;
+
+ # Types
+ $udl->{$uri}->{types}->{$_} = 1 foreach keys %$types;
+
+ # Invalidate uri_list cache
+ delete $self->{uri_list};
+
+ return 1;
}
+
###########################################################################
sub ensure_rules_are_complete {
@@ -2526,13 +2520,15 @@ sub ensure_rules_are_complete {
my $start = time;
$self->harvest_until_rule_completes($r);
- my $elapsed = time - $start;
+ my $elapsed = sprintf "%.2f", time - $start;
if (!$self->is_rule_complete($r)) {
dbg("rules: rule $r is still not complete; exited early?");
}
elsif ($elapsed > 0) {
- info("rules: $r took $elapsed seconds to complete, for $metarule");
+ my $txt = "rules: $r took $elapsed seconds to complete, for $metarule";
+ # Info only if something took over 1 sec to wait, prevent log flood
+ if ($elapsed >= 1) { info($txt); } else { dbg($txt); }
}
}
}
Modified: spamassassin/branches/3.4/t/uri_text.t
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.4/t/uri_text.t?rev=1865102&r1=1865101&r2=1865102&view=diff
==============================================================================
--- spamassassin/branches/3.4/t/uri_text.t (original)
+++ spamassassin/branches/3.4/t/uri_text.t Wed Aug 14 09:37:00 2019
@@ -20,7 +20,7 @@ if (-e 'test_dir') { # runnin
use strict;
use lib '.'; use lib 't';
use SATest; sa_t_init("uri_text");
-use Test::More tests => 685;
+use Test::More tests => 168;
use Mail::SpamAssassin;
use vars qw(%patterns %anti_patterns);
@@ -140,7 +140,7 @@ b@www.vohWais0.com mailto:b\@www\.vohWai
c.www.moSaoga8.com www\.moSaoga8\.com
xyz..geifoza0.com !geifoza0
-xyz.geifoza1.com/..xyz xyz\.geifoza1\.com !xyz\.geifoza1\.com/\.\.xyz
+xyz.geifoza1.com/..xyz xyz\.geifoza1\.com/\.\.xyz
xyz.geifoza2.CoM xyz\.geifoza2\.CoM
http://xyz..geifoza3.com !geifoza3
http://xyz.geifoza4.com/..xyz xyz\.geifoza4\.com/\.\.xyz
@@ -155,12 +155,12 @@ http://www.example.com?xa1kaLuo \?xa1k
http://www.example.com#xa1kaLup \#xa1kaLup
http://www.lap7thob.com/ ^http://www\.lap7thob\.com/$
-www.phoh1Koh.com/ ^www\.phoh1Koh\.com/$
-www.Tar4caeg.com:80 http://www\.Tar4caeg\.com:80
-www.Coo4mowe.com:80/foo/foo.html ^www\.Coo4mowe\.com:80/foo/foo\.html
-www.Nee2quae.com:80/ ^www\.Nee2quae\.com:80/$
-www.foo@Qii3mafs.com:80 http://www\.foo\@Qii3mafs\.com:80$
-www.foo:bar@Qii3maft.com:80 http://www\.foo:bar\@Qii3maft\.com:80$
+www.phoh1Koh.com/ ^http://www\.phoh1Koh\.com/$
+www.Tar4caeg.com:80 ^http://www\.Tar4caeg\.com:80
+www.Coo4mowe.com:80/foo/foo.html ^http://www\.Coo4mowe\.com:80/foo/foo\.html
+www.Nee2quae.com:80/ ^http://www\.Nee2quae\.com:80/$
+www.foo@Qii3mafs.com:80 ^http://www\.foo\@Qii3mafs\.com:80$
+www.foo:bar@Qii3maft.com:80 ^http://www\.foo:bar\@Qii3maft\.com:80$
HAETEI3D.com HAETEI3D
CUK3VEIZ.us CUK3VEIZ
@@ -181,24 +181,24 @@ ba5keinu.NZ ba5keinu
chae2shi.CN chae2shi
roo7kiey.TW roo7kiey
-www.Chiew0ch.COM www\.Chiew0ch\.COM
-www.thohY2qu.US www\.thohY2qu\.US
-www.teiP7gei.BIZ www\.teiP7gei\.BIZ
-www.xohThai8.INFO www\.xohThai8\.INFO
-www.haik7Ram.NET www\.haik7Ram\.NET
-www.Quaes3se.ORG www\.Quaes3se\.ORG
-www.Chai6tah.WS www\.Chai6tah\.WS
-www.Thuoth1y.NAME www\.Thuoth1y\.NAME
-www.Chieb8ge.TV www\.Chieb8ge\.TV
-WWW.quus4Rok.cc WWW\.quus4Rok\.cc
-WWW.maic6Hei.de WWW\.maic6Hei\.de
-WWW.he4Hiize.jp WWW\.he4Hiize\.jp
-WWW.Soh1toob.be WWW\.Soh1toob\.be
-WWW.chahMee5.at WWW\.chahMee5\.at
-WWW.peepooN0.uk WWW\.peepooN0\.uk
-WWW.Kiox3phi.nz WWW\.Kiox3phi\.nz
-WWW.jong3Xou.cn WWW\.jong3Xou\.cn
-WWW.waeShoe0.tw WWW\.waeShoe0\.tw
+www.Chiew0ch.COM ^http://www\.Chiew0ch\.COM
+www.thohY2qu.US ^http://www\.thohY2qu\.US
+www.teiP7gei.BIZ ^http://www\.teiP7gei\.BIZ
+www.xohThai8.INFO ^http://www\.xohThai8\.INFO
+www.haik7Ram.NET ^http://www\.haik7Ram\.NET
+www.Quaes3se.ORG ^http://www\.Quaes3se\.ORG
+www.Chai6tah.WS ^http://www\.Chai6tah\.WS
+www.Thuoth1y.NAME ^http://www\.Thuoth1y\.NAME
+www.Chieb8ge.TV ^http://www\.Chieb8ge\.TV
+WWW.quus4Rok.cc ^http://WWW\.quus4Rok\.cc
+WWW.maic6Hei.de ^http://WWW\.maic6Hei\.de
+WWW.he4Hiize.jp ^http://WWW\.he4Hiize\.jp
+WWW.Soh1toob.be ^http://WWW\.Soh1toob\.be
+WWW.chahMee5.at ^http://WWW\.chahMee5\.at
+WWW.peepooN0.uk ^http://WWW\.peepooN0\.uk
+WWW.Kiox3phi.nz ^http://WWW\.Kiox3phi\.nz
+WWW.jong3Xou.cn ^http://WWW\.jong3Xou\.cn
+WWW.waeShoe0.tw ^http://WWW\.waeShoe0\.tw
invalid_ltd.notword !invalid_tld
invalid_ltd.invalid !invalid_tld
@@ -210,6 +210,20 @@ www.invalid_ltd.invalid !invalid_tld
www.invalid_ltd.xyzzy !invalid_tld
www.invalid_ltd.co.zz !invalid_tld
+# underscores allowed, but not at 1st-2nd level
+uctest.zyb2n2ef.c_om !zyb2n2ef
+uctest.zyb2_n2ef.com !zyb2_n2ef
+uc_test.u8uwe8qu.com ^http://uc_test\.u8uwe8qu\.com
+
+# invalid hostnames with -
+http://-sdfisiz2e.com !sdfisiz2e
+ESRYnSeM7s-.com !ESRYnSeM7s
+foo-.CgPcASgHNa.com !CgPcASgHNa
+
+# valid hostnames with -
+www.eZxdy-TWA4z.com ^http://www\.eZxdy-TWA4z\.com
+www-3.WV7jujA10G.com ^http://www-3\.WV7jujA10G\.com
+
command.com command\.com
cmd.exe !cmd\.exe
@@ -219,70 +233,69 @@ com.foo.web !com\.foo\.web
# IPs for www.yahoo.com
66.94.230.32 !66\.94\.230\.32
-http://66.94.230.33 http://66\.94\.230\.33
-http://1113515555 http://66\.94\.230\.35
+http://66.94.230.33 ^http://66\.94\.230\.33
+http://1113515555 ^http://66\.94\.230\.35
-gooboo4k@xieyohy0.com mailto:gooboo4k\@xieyohy0\.com
-mailto:baeb1fai@quo6puyo.com mailto:baeb1fai\@quo6puyo\.com
+gooboo4k@xieyohy0.com ^mailto:gooboo4k\@xieyohy0\.com
+mailto:baeb1fai@quo6puyo.com ^mailto:baeb1fai\@quo6puyo\.com
-http://www.luzoop5k.com http://www\.luzoop5k\.com
-https://www.luzoop5k.com https://www\.luzoop5k\.com
-ftp://www.luzoop5k.com ftp://www\.luzoop5k\.com
-
-Mailto:aaeb1fai@quo6puyo.com Mailto:aaeb1fai\@quo6puyo\.com
-Http://www.auzoop5k.com Http://www\.auzoop5k\.com
-Https://www.auzoop5k.com Https://www\.auzoop5k\.com
-Ftp://www.auzoop5k.com Ftp://www\.auzoop5k\.com
+http://www.luzoop5k.com ^http://www\.luzoop5k\.com
+https://www.luzoop5k.com ^https://www\.luzoop5k\.com
+ftp://www.luzoop5k.com ^ftp://www\.luzoop5k\.com
+
+Mailto:aaeb1fai@quo6puyo.com ^Mailto:aaeb1fai\@quo6puyo\.com
+Http://www.auzoop5k.com ^Http://www\.auzoop5k\.com
+Https://www.auzoop5k.com ^Https://www\.auzoop5k\.com
+Ftp://www.auzoop5k.com ^Ftp://www\.auzoop5k\.com
-mailto:www.luzoop5k.com !mailto:www\.luzoop5k\.com
+mailto:www.luzoop5k.com !^mailto:www\.luzoop5k\.com
# no longer accept file: scheme
-file://www.luzoop5k.com !file://www\.luzoop5k\.com
+file://www.luzoop5k.com !^file://www\.luzoop5k\.com
# //<user>:<password>@<host>:<port>/<url-path>
-http://user:pass@jiefeet4.com:80/x/y http://user:pass\@jiefeet4\.com:80/x/y
+http://user:pass@jiefeet4.com:80/x/y ^http://user:pass\@jiefeet4\.com:80/x/y
www.liy8quei:80 www\.liy8quei\.com
www.veibi6cu:443 !veibi6cu
-puahi8si.com:80 !puahi8si\.com:80
-chop8tan.com:443 !chop8tan\.com:443
-www.puahi9si.com:80 puahi9si\.com:80
-www.chop9tan.com:443 chop9tan\.com:443
-
-ftp://name@su5queib.ca//etc/motd ftp://name\@su5queib\.ca//etc/motd
-ftp://name@faikaj4t.dom/%2Fetc/motd !ftp://name\@faikaj4t\.dom//etc/motd
-ftp://name@faikaj4t.com/%2Fetc/motd ftp://name\@faikaj4t\.com//etc/motd
+www.puahi9si.com:80 puahi9si\.com:80
+www.puahi9si2.com:80 puahi9si2\.com$
+www.chop9tan.com:443 chop9tan\.com:443
+
+ftp://name@su5queib.ca//etc/motd ^ftp://name\@su5queib\.ca//etc/motd
+ftp://name@faikaj4t.dom/%2Fetc/motd !^ftp://name\@faikaj4t\.dom//etc/motd
+ftp://name@faikaj4t.com/%2Fetc/motd ^ftp://name\@faikaj4t\.com//etc/motd
-keyword:sportscar !sportscar
+keyword:sportscar !sportscar
# questionable tests
-mailto://cah3neun@thaihe4d.com mailto://cah3neun\@thaihe4d\.com
+mailto://cah3neun@thaihe4d.com ^mailto://cah3neun\@thaihe4d\.com
mailto://jicu8vah@another@jicu8vah !jicu8vah\@another\@jicu8vah
baeb1fai@@example.com !baeb1fai\@\@example\.com
-mailto://yie6xuna !yie6xuna
+mailto://yie6xuna !yie6xuna
mailto://yie6xuna@nottld !yie6xuna\@nottld
<se...@verper.com> !^http://.*addr\.com\@verper\.com
-<se...@verper.com> mailto:sentto-4934-foo=addr\.com\@verper\.com
+<se...@verper.com> ^mailto:sentto-4934-foo=addr\.com\@verper\.com
http://foo23498.com/{ESC}(B ^http://foo23498\.com/$
{ESC}(Bhttp://foo23499.com/ ^http://foo23499\.com/$
http://foo23500.com{ESC}(B/ ^http://foo23500\.com(?:/?)$
M0"-AE/9Y.KN:_0D2F:95^H*:I,8 !9Y\.KN
->delimtest1.com http://delimtest1\.com
-<delimtest2.com http://delimtest2\.com
-"delimtest3.com http://delimtest3\.com
-\delimtest4.com http://delimtest4\.com
-'delimtest5.com http://delimtest5\.com
-`delimtest6.com http://delimtest6\.com
-,delimtest7.com http://delimtest7\.com
-{delimtest8.com http://delimtest8\.com
-[delimtest9.com http://delimtest9\.com
-(delimtest10.com http://delimtest10\.com
-|delimtest11.com http://delimtest11\.com
- delimtest12.com http://delimtest12\.com
-ignorethishttp://delimtest13.org http://delimtest13\.org
+>delimtest1.com ^http://delimtest1\.com
+<delimtest2.com ^http://delimtest2\.com
+"delimtest3.com ^http://delimtest3\.com
+\delimtest4.com ^http://delimtest4\.com
+'delimtest5.com ^http://delimtest5\.com
+`delimtest6.com ^http://delimtest6\.com
+,delimtest7.com ^http://delimtest7\.com
+{delimtest8.com ^http://delimtest8\.com
+[delimtest9.com ^http://delimtest9\.com
+(delimtest10.com ^http://delimtest10\.com
+|delimtest11.com ^http://delimtest11\.com
+ delimtest12.com ^http://delimtest12\.com
+ignorethishttp://delimtest13.org ^http://delimtest13\.org
donotignorethiswww.delimtest14.com donotignorethiswww\.delimtest14\.com
<www.delimtest15.com/foo-~!@#^&*()_+=:;'?,.xyz-~!@#^&*()_+=:;'?,.> ^http://www\.delimtest15\.com/foo-~!\@#\^&\*\(\)_\+=:;'\?,\.xyz$
.....www.delimtest16.com.......... ^http://www\.delimtest16\.com$
@@ -293,7 +306,7 @@ donotignorethiswww.delimtest14.com donot
# emails with a comma at the end
test@delimtest20.com,stuff stuff delimtest20\.com
-# check all the TLDs (might as well be thorough)
+# check some TLDs, no point testing all here
# the inactive TLDs have negative checks
# first confirm that it will not match on not a TLD
@@ -301,275 +314,12 @@ example.invalid !^http://example\.invali
example.zzf !^http://example\.zzf$
example.ac ^http://example\.ac$
-example.ad ^http://example\.ad$
-example.ae ^http://example\.ae$
-example.aero ^http://example\.aero$
-example.af ^http://example\.af$
-example.ag ^http://example\.ag$
-example.ai ^http://example\.ai$
-example.al ^http://example\.al$
-example.am ^http://example\.am$
-example.an !^http://example\.an$
-example.ao ^http://example\.ao$
-example.aq ^http://example\.aq$
-example.ar ^http://example\.ar$
-example.arpa ^http://example\.arpa$
-example.as ^http://example\.as$
-example.asia ^http://example\.asia$
-example.at ^http://example\.at$
-example.au ^http://example\.au$
-example.aw ^http://example\.aw$
-example.ax ^http://example\.ax$
-example.az ^http://example\.az$
-example.ba ^http://example\.ba$
-example.bb ^http://example\.bb$
-example.bd ^http://example\.bd$
-example.be ^http://example\.be$
-example.bf ^http://example\.bf$
-example.bg ^http://example\.bg$
-example.bh ^http://example\.bh$
-example.bi ^http://example\.bi$
-example.biz ^http://example\.biz$
-example.bj ^http://example\.bj$
-example.bm ^http://example\.bm$
-example.bn ^http://example\.bn$
-example.bo ^http://example\.bo$
-example.br ^http://example\.br$
-example.bs ^http://example\.bs$
-example.bt ^http://example\.bt$
-example.bv ^http://example\.bv$
-example.bw ^http://example\.bw$
-example.by ^http://example\.by$
-example.bz ^http://example\.bz$
-example.ca ^http://example\.ca$
-example.cat ^http://example\.cat$
-example.cc ^http://example\.cc$
-example.cd ^http://example\.cd$
-example.cf ^http://example\.cf$
-example.cg ^http://example\.cg$
-example.ch ^http://example\.ch$
-example.ci ^http://example\.ci$
-example.ck ^http://example\.ck$
-example.cl ^http://example\.cl$
-example.cm ^http://example\.cm$
-example.cn ^http://example\.cn$
-example.co ^http://example\.co$
-example.com ^http://example\.com$
-example.coop ^http://example\.coop$
-example.cr ^http://example\.cr$
-example.cu ^http://example\.cu$
-example.cv ^http://example\.cv$
-example.cx ^http://example\.cx$
-example.cy ^http://example\.cy$
-example.cz ^http://example\.cz$
-example.de ^http://example\.de$
-example.dj ^http://example\.dj$
-example.dk ^http://example\.dk$
-example.dm ^http://example\.dm$
-example.do ^http://example\.do$
-example.dz ^http://example\.dz$
-example.ec ^http://example\.ec$
-example.edu ^http://example\.edu$
-example.ee ^http://example\.ee$
-example.eg ^http://example\.eg$
-example.er ^http://example\.er$
-example.es ^http://example\.es$
-example.et ^http://example\.et$
example.eu ^http://example\.eu$
example.fi ^http://example\.fi$
-example.fj ^http://example\.fj$
-example.fk ^http://example\.fk$
-example.fm ^http://example\.fm$
-example.fo ^http://example\.fo$
-example.fr ^http://example\.fr$
-example.ga ^http://example\.ga$
-example.gb ^http://example\.gb$
-example.gd ^http://example\.gd$
-example.ge ^http://example\.ge$
-example.gf ^http://example\.gf$
-example.gg ^http://example\.gg$
-example.gh ^http://example\.gh$
-example.gi ^http://example\.gi$
-example.gl ^http://example\.gl$
-example.gm ^http://example\.gm$
-example.gn ^http://example\.gn$
-example.gov ^http://example\.gov$
-example.gp ^http://example\.gp$
-example.gq ^http://example\.gq$
-example.gr ^http://example\.gr$
-example.gs ^http://example\.gs$
-example.gt ^http://example\.gt$
-example.gu ^http://example\.gu$
-example.gw ^http://example\.gw$
-example.gy ^http://example\.gy$
-example.hk ^http://example\.hk$
-example.hm ^http://example\.hm$
-example.hn ^http://example\.hn$
-example.hr ^http://example\.hr$
-example.ht ^http://example\.ht$
-example.hu ^http://example\.hu$
-example.id ^http://example\.id$
-example.ie ^http://example\.ie$
-example.il ^http://example\.il$
-example.im ^http://example\.im$
-example.in ^http://example\.in$
-example.info ^http://example\.info$
-example.int ^http://example\.int$
-example.io ^http://example\.io$
-example.iq ^http://example\.iq$
-example.ir ^http://example\.ir$
-example.is ^http://example\.is$
-example.it ^http://example\.it$
-example.je ^http://example\.je$
-example.jm ^http://example\.jm$
-example.jo ^http://example\.jo$
-example.jobs ^http://example\.jobs$
-example.jp ^http://example\.jp$
-example.ke ^http://example\.ke$
-example.kg ^http://example\.kg$
-example.kh ^http://example\.kh$
-example.ki ^http://example\.ki$
-example.km ^http://example\.km$
-example.kn ^http://example\.kn$
-example.kp ^http://example\.kp$
-example.kr ^http://example\.kr$
-example.kw ^http://example\.kw$
-example.ky ^http://example\.ky$
-example.kz ^http://example\.kz$
-example.la ^http://example\.la$
-example.lb ^http://example\.lb$
-example.lc ^http://example\.lc$
-example.li ^http://example\.li$
-example.lk ^http://example\.lk$
-example.lr ^http://example\.lr$
-example.ls ^http://example\.ls$
-example.lt ^http://example\.lt$
-example.lu ^http://example\.lu$
-example.lv ^http://example\.lv$
-example.ly ^http://example\.ly$
-example.ma ^http://example\.ma$
-example.mc ^http://example\.mc$
-example.md ^http://example\.md$
-example.me ^http://example\.me$
-example.mg ^http://example\.mg$
-example.mh ^http://example\.mh$
-example.mil ^http://example\.mil$
-example.mk ^http://example\.mk$
-example.ml ^http://example\.ml$
-example.mm ^http://example\.mm$
-example.mn ^http://example\.mn$
-example.mo ^http://example\.mo$
-example.mobi ^http://example\.mobi$
-example.mp ^http://example\.mp$
-example.mq ^http://example\.mq$
-example.mr ^http://example\.mr$
-example.ms ^http://example\.ms$
-example.mt ^http://example\.mt$
-example.mu ^http://example\.mu$
-example.museum ^http://example\.museum$
-example.mv ^http://example\.mv$
-example.mw ^http://example\.mw$
-example.mx ^http://example\.mx$
-example.my ^http://example\.my$
-example.mz ^http://example\.mz$
-example.na ^http://example\.na$
-example.name ^http://example\.name$
-example.nc ^http://example\.nc$
-example.ne ^http://example\.ne$
-example.net ^http://example\.net$
-example.nf ^http://example\.nf$
-example.ng ^http://example\.ng$
-example.ni ^http://example\.ni$
-example.nl ^http://example\.nl$
-example.no ^http://example\.no$
-example.np ^http://example\.np$
-example.nr ^http://example\.nr$
-example.nu ^http://example\.nu$
-example.nz ^http://example\.nz$
-example.om ^http://example\.om$
-example.org ^http://example\.org$
-example.pa ^http://example\.pa$
-example.pe ^http://example\.pe$
-example.pf ^http://example\.pf$
-example.pg ^http://example\.pg$
-example.ph ^http://example\.ph$
-example.pk ^http://example\.pk$
-example.pl ^http://example\.pl$
-example.pm ^http://example\.pm$
-example.pn ^http://example\.pn$
-example.pr ^http://example\.pr$
-example.pro ^http://example\.pro$
-example.ps ^http://example\.ps$
-example.pt ^http://example\.pt$
-example.pw ^http://example\.pw$
-example.py ^http://example\.py$
-example.qa ^http://example\.qa$
-example.re ^http://example\.re$
-example.ro ^http://example\.ro$
-example.rs ^http://example\.rs$
-example.ru ^http://example\.ru$
-example.rw ^http://example\.rw$
-example.sa ^http://example\.sa$
-example.sb ^http://example\.sb$
-example.sc ^http://example\.sc$
-example.sd ^http://example\.sd$
-example.se ^http://example\.se$
-example.sg ^http://example\.sg$
-example.sh ^http://example\.sh$
-example.si ^http://example\.si$
-example.sj ^http://example\.sj$
-example.sk ^http://example\.sk$
-example.sl ^http://example\.sl$
-example.sm ^http://example\.sm$
-example.sn ^http://example\.sn$
-example.so ^http://example\.so$
-example.sr ^http://example\.sr$
-example.st ^http://example\.st$
-example.su ^http://example\.su$
-example.sv ^http://example\.sv$
-example.sy ^http://example\.sy$
-example.sz ^http://example\.sz$
-example.tc ^http://example\.tc$
-example.td ^http://example\.td$
-example.tel ^http://example\.tel$
-example.tf ^http://example\.tf$
-example.tg ^http://example\.tg$
-example.th ^http://example\.th$
-example.tj ^http://example\.tj$
-example.tk ^http://example\.tk$
-example.tl ^http://example\.tl$
-example.tm ^http://example\.tm$
-example.tn ^http://example\.tn$
-example.to ^http://example\.to$
example.tp !^http://example\.tp$
-example.tr ^http://example\.tr$
example.travel ^http://example\.travel$
-example.tt ^http://example\.tt$
-example.tv ^http://example\.tv$
-example.tw ^http://example\.tw$
-example.tz ^http://example\.tz$
-example.ua ^http://example\.ua$
-example.ug ^http://example\.ug$
-example.uk ^http://example\.uk$
example.um !^http://example\.um$
example.us ^http://example\.us$
-example.uy ^http://example\.uy$
-example.uz ^http://example\.uz$
-example.va ^http://example\.va$
-example.vc ^http://example\.vc$
-example.ve ^http://example\.ve$
-example.vg ^http://example\.vg$
-example.vi ^http://example\.vi$
-example.vn ^http://example\.vn$
-example.vu ^http://example\.vu$
-example.wf ^http://example\.wf$
-example.ws ^http://example\.ws$
-example.ye ^http://example\.ye$
-example.yt ^http://example\.yt$
-example.yu !^http://example\.yu$
-example.za ^http://example\.za$
-example.zm ^http://example\.zm$
-example.zw ^http://example\.zw$
# with www. prefix tests a different table of TLDs
@@ -577,272 +327,12 @@ www.example.foo ^http://www\.example\.fo
www.example.zzf !^http://www\.example\.zzf$
www.example.ac ^http://www\.example\.ac$
-www.example.ad ^http://www\.example\.ad$
-www.example.ae ^http://www\.example\.ae$
-www.example.aero ^http://www\.example\.aero$
-www.example.af ^http://www\.example\.af$
-www.example.ag ^http://www\.example\.ag$
-www.example.ai ^http://www\.example\.ai$
-www.example.al ^http://www\.example\.al$
-www.example.am ^http://www\.example\.am$
www.example.an !^http://www\.example\.an$
www.example.ao ^http://www\.example\.ao$
-www.example.aq ^http://www\.example\.aq$
-www.example.ar ^http://www\.example\.ar$
www.example.arpa ^http://www\.example\.arpa$
-www.example.as ^http://www\.example\.as$
-www.example.asia ^http://www\.example\.asia$
-www.example.at ^http://www\.example\.at$
-www.example.au ^http://www\.example\.au$
-www.example.aw ^http://www\.example\.aw$
-www.example.ax ^http://www\.example\.ax$
-www.example.az ^http://www\.example\.az$
-www.example.ba ^http://www\.example\.ba$
-www.example.bb ^http://www\.example\.bb$
-www.example.bd ^http://www\.example\.bd$
-www.example.be ^http://www\.example\.be$
-www.example.bf ^http://www\.example\.bf$
-www.example.bg ^http://www\.example\.bg$
-www.example.bh ^http://www\.example\.bh$
-www.example.bi ^http://www\.example\.bi$
-www.example.biz ^http://www\.example\.biz$
-www.example.bj ^http://www\.example\.bj$
-www.example.bm ^http://www\.example\.bm$
-www.example.bn ^http://www\.example\.bn$
-www.example.bo ^http://www\.example\.bo$
-www.example.br ^http://www\.example\.br$
-www.example.bs ^http://www\.example\.bs$
-www.example.bt ^http://www\.example\.bt$
-www.example.bv ^http://www\.example\.bv$
-www.example.bw ^http://www\.example\.bw$
-www.example.by ^http://www\.example\.by$
-www.example.bz ^http://www\.example\.bz$
-www.example.ca ^http://www\.example\.ca$
-www.example.cat ^http://www\.example\.cat$
-www.example.cc ^http://www\.example\.cc$
-www.example.cd ^http://www\.example\.cd$
-www.example.cf ^http://www\.example\.cf$
-www.example.cg ^http://www\.example\.cg$
-www.example.ch ^http://www\.example\.ch$
www.example.ci ^http://www\.example\.ci$
-www.example.ck ^http://www\.example\.ck$
-www.example.cl ^http://www\.example\.cl$
-www.example.cm ^http://www\.example\.cm$
-www.example.cn ^http://www\.example\.cn$
-www.example.co ^http://www\.example\.co$
-www.example.com ^http://www\.example\.com$
-www.example.coop ^http://www\.example\.coop$
-www.example.cr ^http://www\.example\.cr$
-www.example.cu ^http://www\.example\.cu$
-www.example.cv ^http://www\.example\.cv$
-www.example.cx ^http://www\.example\.cx$
-www.example.cy ^http://www\.example\.cy$
-www.example.cz ^http://www\.example\.cz$
-www.example.de ^http://www\.example\.de$
-www.example.dj ^http://www\.example\.dj$
-www.example.dk ^http://www\.example\.dk$
-www.example.dm ^http://www\.example\.dm$
-www.example.do ^http://www\.example\.do$
-www.example.dz ^http://www\.example\.dz$
-www.example.ec ^http://www\.example\.ec$
www.example.edu ^http://www\.example\.edu$
-www.example.ee ^http://www\.example\.ee$
-www.example.eg ^http://www\.example\.eg$
-www.example.er ^http://www\.example\.er$
-www.example.es ^http://www\.example\.es$
-www.example.et ^http://www\.example\.et$
-www.example.eu ^http://www\.example\.eu$
-www.example.fi ^http://www\.example\.fi$
-www.example.fj ^http://www\.example\.fj$
-www.example.fk ^http://www\.example\.fk$
-www.example.fm ^http://www\.example\.fm$
-www.example.fo ^http://www\.example\.fo$
-www.example.fr ^http://www\.example\.fr$
-www.example.ga ^http://www\.example\.ga$
-www.example.gb ^http://www\.example\.gb$
-www.example.gd ^http://www\.example\.gd$
-www.example.ge ^http://www\.example\.ge$
-www.example.gf ^http://www\.example\.gf$
-www.example.gg ^http://www\.example\.gg$
-www.example.gh ^http://www\.example\.gh$
-www.example.gi ^http://www\.example\.gi$
-www.example.gl ^http://www\.example\.gl$
-www.example.gm ^http://www\.example\.gm$
-www.example.gn ^http://www\.example\.gn$
-www.example.gov ^http://www\.example\.gov$
-www.example.gp ^http://www\.example\.gp$
-www.example.gq ^http://www\.example\.gq$
-www.example.gr ^http://www\.example\.gr$
-www.example.gs ^http://www\.example\.gs$
-www.example.gt ^http://www\.example\.gt$
-www.example.gu ^http://www\.example\.gu$
-www.example.gw ^http://www\.example\.gw$
-www.example.gy ^http://www\.example\.gy$
-www.example.hk ^http://www\.example\.hk$
-www.example.hm ^http://www\.example\.hm$
-www.example.hn ^http://www\.example\.hn$
-www.example.hr ^http://www\.example\.hr$
-www.example.ht ^http://www\.example\.ht$
-www.example.hu ^http://www\.example\.hu$
-www.example.id ^http://www\.example\.id$
-www.example.ie ^http://www\.example\.ie$
-www.example.il ^http://www\.example\.il$
-www.example.im ^http://www\.example\.im$
-www.example.in ^http://www\.example\.in$
-www.example.info ^http://www\.example\.info$
-www.example.int ^http://www\.example\.int$
-www.example.io ^http://www\.example\.io$
-www.example.iq ^http://www\.example\.iq$
-www.example.ir ^http://www\.example\.ir$
-www.example.is ^http://www\.example\.is$
-www.example.it ^http://www\.example\.it$
-www.example.je ^http://www\.example\.je$
-www.example.jm ^http://www\.example\.jm$
-www.example.jo ^http://www\.example\.jo$
-www.example.jobs ^http://www\.example\.jobs$
-www.example.jp ^http://www\.example\.jp$
-www.example.ke ^http://www\.example\.ke$
-www.example.kg ^http://www\.example\.kg$
-www.example.kh ^http://www\.example\.kh$
-www.example.ki ^http://www\.example\.ki$
-www.example.km ^http://www\.example\.km$
-www.example.kn ^http://www\.example\.kn$
-www.example.kp ^http://www\.example\.kp$
-www.example.kr ^http://www\.example\.kr$
-www.example.kw ^http://www\.example\.kw$
-www.example.ky ^http://www\.example\.ky$
-www.example.kz ^http://www\.example\.kz$
-www.example.la ^http://www\.example\.la$
-www.example.lb ^http://www\.example\.lb$
-www.example.lc ^http://www\.example\.lc$
-www.example.li ^http://www\.example\.li$
-www.example.lk ^http://www\.example\.lk$
-www.example.lr ^http://www\.example\.lr$
-www.example.ls ^http://www\.example\.ls$
-www.example.lt ^http://www\.example\.lt$
-www.example.lu ^http://www\.example\.lu$
-www.example.lv ^http://www\.example\.lv$
-www.example.ly ^http://www\.example\.ly$
-www.example.ma ^http://www\.example\.ma$
-www.example.mc ^http://www\.example\.mc$
-www.example.md ^http://www\.example\.md$
-www.example.me ^http://www\.example\.me$
-www.example.mg ^http://www\.example\.mg$
-www.example.mh ^http://www\.example\.mh$
-www.example.mil ^http://www\.example\.mil$
-www.example.mk ^http://www\.example\.mk$
-www.example.ml ^http://www\.example\.ml$
-www.example.mm ^http://www\.example\.mm$
-www.example.mn ^http://www\.example\.mn$
-www.example.mo ^http://www\.example\.mo$
-www.example.mobi ^http://www\.example\.mobi$
-www.example.mp ^http://www\.example\.mp$
-www.example.mq ^http://www\.example\.mq$
-www.example.mr ^http://www\.example\.mr$
-www.example.ms ^http://www\.example\.ms$
-www.example.mt ^http://www\.example\.mt$
-www.example.mu ^http://www\.example\.mu$
-www.example.museum ^http://www\.example\.museum$
-www.example.mv ^http://www\.example\.mv$
-www.example.mw ^http://www\.example\.mw$
-www.example.mx ^http://www\.example\.mx$
-www.example.my ^http://www\.example\.my$
-www.example.mz ^http://www\.example\.mz$
-www.example.na ^http://www\.example\.na$
-www.example.name ^http://www\.example\.name$
-www.example.nc ^http://www\.example\.nc$
-www.example.ne ^http://www\.example\.ne$
-www.example.net ^http://www\.example\.net$
-www.example.nf ^http://www\.example\.nf$
-www.example.ng ^http://www\.example\.ng$
-www.example.ni ^http://www\.example\.ni$
-www.example.nl ^http://www\.example\.nl$
-www.example.no ^http://www\.example\.no$
-www.example.np ^http://www\.example\.np$
-www.example.nr ^http://www\.example\.nr$
-www.example.nu ^http://www\.example\.nu$
-www.example.nz ^http://www\.example\.nz$
-www.example.om ^http://www\.example\.om$
-www.example.org ^http://www\.example\.org$
-www.example.pa ^http://www\.example\.pa$
-www.example.pe ^http://www\.example\.pe$
-www.example.pf ^http://www\.example\.pf$
-www.example.pg ^http://www\.example\.pg$
-www.example.ph ^http://www\.example\.ph$
-www.example.pk ^http://www\.example\.pk$
-www.example.pl ^http://www\.example\.pl$
-www.example.pm ^http://www\.example\.pm$
-www.example.pn ^http://www\.example\.pn$
-www.example.pr ^http://www\.example\.pr$
-www.example.pro ^http://www\.example\.pro$
-www.example.ps ^http://www\.example\.ps$
-www.example.pt ^http://www\.example\.pt$
-www.example.pw ^http://www\.example\.pw$
-www.example.py ^http://www\.example\.py$
-www.example.qa ^http://www\.example\.qa$
-www.example.re ^http://www\.example\.re$
-www.example.ro ^http://www\.example\.ro$
-www.example.rs ^http://www\.example\.rs$
-www.example.ru ^http://www\.example\.ru$
-www.example.rw ^http://www\.example\.rw$
-www.example.sa ^http://www\.example\.sa$
-www.example.sb ^http://www\.example\.sb$
-www.example.sc ^http://www\.example\.sc$
-www.example.sd ^http://www\.example\.sd$
-www.example.se ^http://www\.example\.se$
-www.example.sg ^http://www\.example\.sg$
-www.example.sh ^http://www\.example\.sh$
-www.example.si ^http://www\.example\.si$
-www.example.sj ^http://www\.example\.sj$
-www.example.sk ^http://www\.example\.sk$
-www.example.sl ^http://www\.example\.sl$
-www.example.sm ^http://www\.example\.sm$
-www.example.sn ^http://www\.example\.sn$
-www.example.so ^http://www\.example\.so$
-www.example.sr ^http://www\.example\.sr$
-www.example.st ^http://www\.example\.st$
-www.example.su ^http://www\.example\.su$
-www.example.sv ^http://www\.example\.sv$
-www.example.sy ^http://www\.example\.sy$
-www.example.sz ^http://www\.example\.sz$
-www.example.tc ^http://www\.example\.tc$
-www.example.td ^http://www\.example\.td$
-www.example.tel ^http://www\.example\.tel$
-www.example.tf ^http://www\.example\.tf$
-www.example.tg ^http://www\.example\.tg$
-www.example.th ^http://www\.example\.th$
-www.example.tj ^http://www\.example\.tj$
-www.example.tk ^http://www\.example\.tk$
-www.example.tl ^http://www\.example\.tl$
-www.example.tm ^http://www\.example\.tm$
-www.example.tn ^http://www\.example\.tn$
-www.example.to ^http://www\.example\.to$
www.example.tp !^http://www\.example\.tp$
-www.example.tr ^http://www\.example\.tr$
-www.example.travel ^http://www\.example\.travel$
-www.example.tt ^http://www\.example\.tt$
-www.example.tv ^http://www\.example\.tv$
-www.example.tw ^http://www\.example\.tw$
-www.example.tz ^http://www\.example\.tz$
-www.example.ua ^http://www\.example\.ua$
-www.example.ug ^http://www\.example\.ug$
-www.example.uk ^http://www\.example\.uk$
-www.example.um !^http://www\.example\.um$
-www.example.us ^http://www\.example\.us$
-www.example.uy ^http://www\.example\.uy$
-www.example.uz ^http://www\.example\.uz$
-www.example.va ^http://www\.example\.va$
-www.example.vc ^http://www\.example\.vc$
-www.example.ve ^http://www\.example\.ve$
-www.example.vg ^http://www\.example\.vg$
-www.example.vi ^http://www\.example\.vi$
-www.example.vn ^http://www\.example\.vn$
-www.example.vu ^http://www\.example\.vu$
-www.example.wf ^http://www\.example\.wf$
www.example.ws ^http://www\.example\.ws$
-www.example.ye ^http://www\.example\.ye$
-www.example.yt ^http://www\.example\.yt$
www.example.yu !^http://www\.example\.yu$
www.example.za ^http://www\.example\.za$
-www.example.zm ^http://www\.example\.zm$
-www.example.zw ^http://www\.example\.zw$