You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2005/05/04 06:14:23 UTC

svn commit: r168069 - in /spamassassin/trunk: lib/Mail/SpamAssassin/PerMsgStatus.pm t/uri_text.t

Author: quinlan
Date: Tue May  3 21:14:21 2005
New Revision: 168069

URL: http://svn.apache.org/viewcvs?rev=168069&view=rev
Log:
more URI extraction tweaks for a new corner case

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
    spamassassin/trunk/t/uri_text.t

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=168069&r1=168068&r2=168069&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Tue May  3 21:14:21 2005
@@ -1789,9 +1789,14 @@
 #   $label = q/[A-Za-z\d](?:[A-Za-z\d-]{0,61}[A-Za-z\d])?/;
 #   $domain = qq<$label(?:\.$label)*>;
 #   length($host) <= 255 && $host =~ /^($domain)$/
-# massively simplified from grammar, only matches known TLDs, a single
-# dot at end of TLD works, skip ones that will match as email addresses
-my $schemelessRE = qr/(?<!.\@)\b[a-z\d]
+# changes:
+#   massively simplified from grammar, only matches known TLDs, a single
+#   dot at end of TLD works
+# negative look-behinds:
+#   (?<![a-z\d][.-]) = don't let there be more hostname behind, but
+#                      don't miss ".....www.bar.com" or "-----www.foo.com"
+#   (?<!.\@) = this will be caught by the email address regular expression
+my $schemelessRE = qr/(?<![a-z\d][.-])(?<!.\@)\b[a-z\d]
                       [a-z\d.-]{0,251}
                       \.${tldsRE}\.?\b
                       (?![a-z\d.-])

Modified: spamassassin/trunk/t/uri_text.t
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/t/uri_text.t?rev=168069&r1=168068&r2=168069&view=diff
==============================================================================
--- spamassassin/trunk/t/uri_text.t (original)
+++ spamassassin/trunk/t/uri_text.t Tue May  3 21:14:21 2005
@@ -57,13 +57,13 @@
 # run patterns and anti-patterns
 my $failures = 0;
 for my $pattern (keys %patterns) {
-  if ($error !~ /\Q${pattern}\E/) {
+  if ($error !~ /${pattern}/) {
     print "did not find $pattern\n";
     $failures++;
   }
 }
 for my $anti_pattern (keys %anti_patterns) {
-  if ($error =~ /\Q${anti_pattern}\E/) {
+  if ($error =~ /${anti_pattern}/) {
     print "did find $anti_pattern\n";
     $failures++;
   }
@@ -85,6 +85,8 @@
 
 EOF
     while (<DATA>) {
+      chomp;
+      next if /^#/;
       if (/^(.*?)\t+(.*?)\s*$/) {
 	my $string = $1;
 	my @patterns = split(' ', $2);
@@ -145,3 +147,5 @@
 xyz..geifoza0.com	!geifoza0
 
 joe@koja3fui.koja3fui	!koja3fui
+
+<xuq@dsj.x.thriyi.com>	mailto:xuq@dsj.x.thriyi.com	!http\S*thriyi