You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by si...@apache.org on 2005/05/09 17:55:07 UTC

svn commit: r169334 - in /spamassassin/trunk: MANIFEST lib/Mail/SpamAssassin/Conf.pm lib/Mail/SpamAssassin/HTML.pm lib/Mail/SpamAssassin/PerMsgStatus.pm lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm lib/Mail/SpamAssassin/Util.pm rules/20_uri_tests.cf t/uri.t t/uri_html.t

Author: sidney
Date: Mon May  9 08:55:06 2005
New Revision: 169334

URL: http://svn.apache.org/viewcvs?rev=169334&view=rev
Log:
bug 4176 see comments in bug report for details

Added:
    spamassassin/trunk/t/uri_html.t   (with props)
Modified:
    spamassassin/trunk/MANIFEST
    spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
    spamassassin/trunk/rules/20_uri_tests.cf
    spamassassin/trunk/t/uri.t

Modified: spamassassin/trunk/MANIFEST
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/MANIFEST?rev=169334&r1=169333&r2=169334&view=diff
==============================================================================
--- spamassassin/trunk/MANIFEST (original)
+++ spamassassin/trunk/MANIFEST Mon May  9 08:55:06 2005
@@ -416,6 +416,7 @@
 t/stripmarkup.t
 t/test_dir
 t/uri.t
+t/uri_html.t
 t/uri_text.t
 t/utf8.t
 t/whitelist_addrs.t

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm?rev=169334&r1=169333&r2=169334&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm Mon May  9 08:55:06 2005
@@ -1579,6 +1579,40 @@
     }
   });
 
+=item redirector_pattern	/pattern/
+
+A regex pattern that matches both the redirector site portion, and
+the target site portion of a URI.
+
+Note: The target URI portion must be surrounded in parentheses and
+      no other part of the pattern may create a backreference.
+
+Example: http://chkpt.zdnet.com/chkpt/whatever/spammer.domain/yo/dude
+
+  redirector_pattern	/^(?:http://)?chkpt\.zdnet\.com/chkpt/\w+/(.*)$/
+
+=cut
+
+  push (@cmds, {
+    setting => 'redirector_pattern',
+    is_priv => 1,
+    code => sub {
+      my ($self, $key, $value, $line) = @_;
+
+      # strip off the leading and trailing slashes
+      # we only ask for them to be like normal rules
+      $value =~ s/^\/(.*)\/$/$1/;
+
+      if (Mail::SpamAssassin::Conf::Parser->is_regexp_valid("redirector_pattern", $value)) {
+	# since the regexp will never change we might as well qr it
+	$value = qr/$value/i;
+
+	push @{$self->{main}->{conf}->{redirector_patterns}}, $value;
+	dbg("config: adding redirector regex: " . $value);
+      }
+    }
+  });
+
 =item header SYMBOLIC_TEST_NAME header op /pattern/modifiers	[if-unset: STRING]
 
 Define a test.  C<SYMBOLIC_TEST_NAME> is a symbolic test name, such as

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm?rev=169334&r1=169333&r2=169334&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm Mon May  9 08:55:06 2005
@@ -134,18 +134,6 @@
 
   # add the canonified version of each uri to the detail list
   if (defined $self->{uri}) {
-    while(my($uri, $info) = each %{ $self->{uri} }) {
-      my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($uri);
-      $info->{cleaned} = \@tmp;
-      # list out the URLs for debugging ...
-      if (would_log('dbg', 'uri')) {
-        dbg("uri: html uri found, $uri");
-        foreach my $nuri (@tmp) {
-          dbg("uri: cleaned html uri, $nuri");
-        }
-      }
-    }
-
     @uri = keys %{$self->{uri}};
   }
 

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=169334&r1=169333&r2=169334&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Mon May  9 08:55:06 2005
@@ -1864,12 +1864,29 @@
   # get_parsed_uri_list() which calls get_decoded_stripped_body_text_array(),
   # which does the metadata stuff ...  DO THIS BEFORE LOOKING FOR METADATA!!!
   my @uris = $self->get_parsed_uri_list();
+  my $redirector_patterns = $self->{conf}->{redirector_patterns};
+  @uris = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, @uris);
 
   # get URIs from HTML parsing
   # use the metadata version as $self->{html} is probably not set yet
   if (defined $self->{msg}->{metadata}->{html}->{uri_detail}) {
     while(my($uri, $info) = each %{ $self->{msg}->{metadata}->{html}->{uri_detail} }) {
-      push(@uris, @{$info->{cleaned}});
+      my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, $uri);
+      $info->{cleaned} = \@tmp;
+      push(@uris, @tmp);
+      if (would_log('dbg', 'uri')) {
+        dbg("uri: html uri found, $uri");
+        foreach my $nuri (@tmp) {
+          dbg("uri: cleaned html uri, $nuri");
+        }
+      }
+    }
+  }
+
+  # list out the URLs for debugging ...
+  if (would_log('dbg', 'uri')) {
+    foreach my $nuri (@uris) {
+      dbg("uri: parsed uri found: $nuri");
     }
   }
 
@@ -1953,20 +1970,10 @@
         push @uris, $uri;
       }
     }
-
-    @uris = Mail::SpamAssassin::Util::uri_list_canonify(@uris);
-
     # setup the cache and return
     $self->{parsed_uri_list} = \@uris;
 
-    # list out the URLs for debugging ...
-    if (would_log('dbg', 'uri')) {
-      foreach my $nuri (@uris) {
-        dbg("uri: parsed uri found: $nuri");
-      }
-    }
   }
-
   return @{$self->{parsed_uri_list}};
 }
 

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm?rev=169334&r1=169333&r2=169334&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm Mon May  9 08:55:06 2005
@@ -196,7 +196,7 @@
   # IMPORTANT: to get the html parsed into metadata, we need to call
   # get_parsed_uri_list() which calls get_decoded_stripped_body_text_array(),
   # which does the metadata stuff ...  DO THIS BEFORE SETTING $html !!!
-  my @parsed = $scanner->get_parsed_uri_list();
+  my @parsed = $scanner->get_uri_list();
 
   # Generate the full list of html-parsed domains.
   my $html = $scanner->{msg}->{metadata}->{html}->{uri_detail} || { };

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm?rev=169334&r1=169333&r2=169334&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm Mon May  9 08:55:06 2005
@@ -921,7 +921,7 @@
 }
 
 sub uri_list_canonify {
-  my (@uris) = @_;
+  my($redirector_patterns, @uris) = @_;
 
   # make sure we catch bad encoding tricks
   my @nuris = ();
@@ -993,6 +993,20 @@
       # bug 3308: redirectors like yahoo only need one '/' ... <grrr>
       if ($rest =~ m{(https?:/{0,2}.+)$}i) {
         push(@uris, $1);
+      }
+
+      # resort to redirector pattern matching if the generic https? check
+      # doesn't result in a match -- bug 4176
+      else {
+	foreach (@{$redirector_patterns}) {
+	  if ("$proto$host$rest" =~ $_) {
+	    next unless defined $1;
+	    dbg("uri: parsed uri pattern: $_");
+	    dbg("uri: parsed uri found: $1 in redirector: $proto$host$rest");
+	    push (@uris, $1);
+	    last;
+	  }
+	}
       }
 
       ########################

Modified: spamassassin/trunk/rules/20_uri_tests.cf
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/rules/20_uri_tests.cf?rev=169334&r1=169333&r2=169334&view=diff
==============================================================================
--- spamassassin/trunk/rules/20_uri_tests.cf (original)
+++ spamassassin/trunk/rules/20_uri_tests.cf Mon May  9 08:55:06 2005
@@ -24,6 +24,10 @@
 
 require_version @@VERSION@@
 
+# Redirector URI patterns
+redirector_pattern     /^(?:http://)?chkpt\.zdnet\.com/chkpt/\w+/(.*)$/
+redirector_pattern     /^(?:http://)?www\.nate\.com/r/\w+/(.*)$/
+
 uri NUMERIC_HTTP_ADDR		/^https?\:\/\/\d{7}/is
 describe NUMERIC_HTTP_ADDR	Uses a numeric IP address in URL
 

Modified: spamassassin/trunk/t/uri.t
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/t/uri.t?rev=169334&r1=169333&r2=169334&view=diff
==============================================================================
--- spamassassin/trunk/t/uri.t (original)
+++ spamassassin/trunk/t/uri.t Mon May  9 08:55:06 2005
@@ -22,7 +22,7 @@
 use Mail::SpamAssassin::HTML;
 use Mail::SpamAssassin::Util;
 
-plan tests => 77;
+plan tests => 78;
 
 ##############################################
 
@@ -106,7 +106,10 @@
 
 sub try_canon {
   my($input, $expect) = @_;
-  my @input = sort { $a cmp $b } Mail::SpamAssassin::Util::uri_list_canonify(@{$input});
+  my $redirs = ['/^(?:http://)?chkpt\.zdnet\.com/chkpt/\w+/(.*)$/',
+                '/^(?:http://)?www\.nate\.com/r/\w+/(.*)$/',
+               ];
+  my @input = sort { $a cmp $b } Mail::SpamAssassin::Util::uri_list_canonify($redirs, @{$input});
   my @expect = sort { $a cmp $b } @{$expect};
 
   # output what we want/get for debugging
@@ -155,6 +158,13 @@
    ], [
    'http%3A//ebg&vosxfov.com%2Emunged-%72xspecials%2Enet/b/Tr3f0amG',
    'http://ebg&vosxfov.com.munged-rxspecials.net/b/Tr3f0amG'
+   ]));
+
+ok(try_canon([
+   'http://www.nate.com/r/DM03/n%65verp4%79re%74%61%69%6c%2eco%6d/%62%61m/?m%61%6e=%6Di%634%39'
+   ], [
+   'http://www.nate.com/r/DM03/n%65verp4%79re%74%61%69%6c%2eco%6d/%62%61m/?m%61%6e=%6Di%634%39',
+   'http://www.nate.com/r/DM03/neverp4yretail.com/bam/?man=mic49',
    ]));
 
 ##############################################

Added: spamassassin/trunk/t/uri_html.t
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/t/uri_html.t?rev=169334&view=auto
==============================================================================
--- spamassassin/trunk/t/uri_html.t (added)
+++ spamassassin/trunk/t/uri_html.t Mon May  9 08:55:06 2005
@@ -0,0 +1,241 @@
+#!/usr/bin/perl -w
+
+# test URI redirecton patterns
+
+BEGIN {
+  if (-e 't/test_dir') { # if we are running "t/rule_names.t", kluge around ...
+    chdir 't';
+  }
+
+  if (-e 'test_dir') {            # running from test directory, not ..
+    unshift(@INC, '../blib/lib');
+  }
+}
+
+my $prefix = '.';
+if (-e 'test_dir') {            # running from test directory, not ..
+  $prefix = '..';
+}
+
+use strict;
+use SATest; sa_t_init("uri_html");
+use Test;
+use Mail::SpamAssassin;
+use IO::File;
+use vars qw(%patterns %anti_patterns);
+
+# settings
+plan tests => 2;
+
+# initialize SpamAssassin
+my $sa = create_saobj({'dont_copy_prefs' => 1});
+
+$sa->init(0); # parse rules
+
+# load tests and write mail
+my $mail = 'log/uri_html.eml';
+%patterns = ();
+%anti_patterns = ();
+write_mail();
+
+# test message
+my $fh = IO::File->new_tmpfile();
+open(STDERR, ">&=".fileno($fh)) || die "Cannot reopen STDERR";
+ok(sarun("-t --debug=uri < log/uri_html.eml"));
+seek($fh, 0, 0);
+my $error = do {
+    local $/;
+    <$fh>;
+};
+$error =~ s/^.*dbg: uri: parsed uri found: //mg;
+
+# run patterns and anti-patterns
+my $failures = 0;
+for my $pattern (keys %patterns) {
+  if ($error !~ /${pattern}/m) {
+    print "did not find $pattern\n";
+#    print "found $error\n";
+    $failures++;
+  } else {
+#    print "success $pattern in $error\n";
+  }
+}
+for my $anti_pattern (keys %anti_patterns) {
+  if ($error =~ /${anti_pattern}/m) {
+    print "did find $anti_pattern\n";
+    $failures++;
+  }
+}
+ok(!$failures);
+
+# function to write test email
+sub write_mail {
+  if (open(MAIL, ">$mail")) {
+    print MAIL <<'EOF';
+Message-ID: <cl...@example.com>
+Date: Mon, 07 Oct 2002 09:00:00 +0000
+From: Sender <se...@example.com>
+MIME-Version: 1.0
+To: Recipient <re...@example.com>
+Subject: this is a trivial message
+Content-Type: multipart/related;
+	boundary="--IDYGGVGT_LIYGR"
+
+----IDYGGVGT_LIYGR
+Content-Type: text/plain
+Content-Transfer-Encoding: 7bit
+
+This text part is ignored
+http://www.dontputthisinthetestdata.com
+
+----IDYGGVGT_LIYGR
+Content-Type: text/html; charset=us-ascii
+Content-Transfer-Encoding: 8bit
+
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text; charset=iso-8859-1">
+</head>
+<body>
+EOF
+    while (<DATA>) {
+      chomp;
+      next if /^#/;
+      if (/^(.*?)\t+(.*?)\s*$/) {
+	my $string = $1;
+	my @patterns = split(' ', $2);
+	if ($string && @patterns) {
+	  print MAIL "<a href=$string>click here</a>\n";
+	  for my $pattern (@patterns) {
+	    if ($pattern =~ /^\!(.*)/) {
+	      $anti_patterns{$1} = 1;
+	    }
+	    else {
+	      $patterns{$pattern} = 1;
+	    }
+	  }
+	}
+      }
+    }
+    print MAIL "</body>\n</html>\n\n----IDYGGVGT_LIYGR--\n";
+    close(MAIL);
+  }
+  else {
+    die "can't open output file: $!";
+  }
+}
+
+# <line>    : <string><tabs><matches>
+# <string>  : string in the body
+# <tabs>    : one or more tabs
+# <matches> : patterns expected to be found in URI output, if preceded by ! if
+#             it is an antipattern, each pattern is separated by whitespace
+__DATA__
+www5.poh6feib.com	poh6feib
+vau6yaer.com		vau6yaer
+www5.poh6feib.info	poh6feib
+Haegh3de.co.uk		Haegh3de
+
+ftp.yeinaix3.co.uk	ftp://ftp.yeinaix3.co.uk !http://ftp.yeinaix3.co.uk
+ftp5.riexai5r.co.uk	http://ftp5.riexai5r.co.uk !ftp://ftp5.riexai5r.co.uk
+
+http://10.1.3.1/	10.1.3.1
+
+=www.deiJ1pha.com	www.deiJ1pha.com
+@www.Te0xohxu.com	www.Te0xohxu.com
+.www.kuiH5sai.com	www.kuiH5sai.com
+
+a=www.zaiNgoo7.com	www.zaiNgoo7.com
+c.www.moSaoga8.com	www.moSaoga8.com
+
+http://www.example.com/about/wahfah7d.html	wahfah7d
+http://www.example.com?xa1kaLuo			\?xa1kaLuo
+http://www.lap7thob.com/			^http://www.lap7thob.com/$
+
+www.phoh1Koh.com/			^www.phoh1Koh.com/$
+www.Tar4caeg.com:80			http://www.Tar4caeg.com:80
+www.Coo4mowe.com:80/foo/foo.html	^www.Coo4mowe.com:80/foo/foo.html
+www.Nee2quae.com:80/			^www.Nee2quae.com:80/$
+
+HAETEI3D.com	HAETEI3D
+CUK3VEIZ.us	CUK3VEIZ
+CHAI7SAI.biz	CHAI7SAI
+VU4YAPHU.info	VU4YAPHU
+NAUVE1PH.net	NAUVE1PH
+LEIX6QUU.org	LEIX6QUU
+LOT1GOHV.ws	LOT1GOHV
+LI4JAIZI.name	LI4JAIZI
+BA1LOOXU.tv	BA1LOOXU
+yiez7too.CC	yiez7too
+huwaroo1.DE	huwaroo1
+chohza7t.JP	chohza7t
+the7zuum.BE	the7zuum
+sai6bahg.AT	sai6bahg
+leow3del.UK	leow3del
+ba5keinu.NZ	ba5keinu
+chae2shi.CN	chae2shi
+roo7kiey.TW	roo7kiey
+
+www.Chiew0ch.COM	www.Chiew0ch.COM
+www.thohY2qu.US		www.thohY2qu.US
+www.teiP7gei.BIZ	www.teiP7gei.BIZ
+www.xohThai8.INFO	www.xohThai8.INFO
+www.haik7Ram.NET	www.haik7Ram.NET
+www.Quaes3se.ORG	www.Quaes3se.ORG
+www.Chai6tah.WS		www.Chai6tah.WS
+www.Thuoth1y.NAME	www.Thuoth1y.NAME
+www.Chieb8ge.TV		www.Chieb8ge.TV
+WWW.quus4Rok.cc		WWW.quus4Rok.cc
+WWW.maic6Hei.de		WWW.maic6Hei.de
+WWW.he4Hiize.jp		WWW.he4Hiize.jp
+WWW.Soh1toob.be		WWW.Soh1toob.be
+WWW.chahMee5.at		WWW.chahMee5.at
+WWW.peepooN0.uk		WWW.peepooN0.uk
+WWW.Kiox3phi.nz		WWW.Kiox3phi.nz
+WWW.jong3Xou.cn		WWW.jong3Xou.cn
+WWW.waeShoe0.tw		WWW.waeShoe0.tw
+
+invalid_ltd.foo		!invalid_tld
+invalid_ltd.bar		!invalid_tld
+invalid_ltd.xyzzy	!invalid_tld
+invalid_ltd.co.zz	!invalid_tld
+
+www.invalid_ltd.foo	!invalid_tld
+www.invalid_ltd.bar	!invalid_tld
+www.invalid_ltd.xyzzy	!invalid_tld
+www.invalid_ltd.co.zz	!invalid_tld
+
+command.com		command.com
+
+# IPs for www.yahoo.com
+http://66.94.230.33	http://66.94.230.33
+http://1113515555	http://66.94.230.35
+
+http://www.luzoop5k.com		http://www.luzoop5k.com
+https://www.luzoop5k.com	https://www.luzoop5k.com
+ftp://www.luzoop5k.com		ftp://www.luzoop5k.com
+mailto:www.luzoop5k.com		mailto:www.luzoop5k.com
+file://www.luzoop5k.com		file://www.luzoop5k.com
+
+# //<user>:<password>@<host>:<port>/<url-path>
+http://user:pass@jiefeet4.com:80/x/y	http://user:pass@jiefeet4.com:80/x/y
+
+puahi8si.com:80			puahi8si.com:80
+chop8tan.com:443		chop8tan.com:443
+
+ftp://name@su5queib.ca//etc/motd	ftp://name@su5queib.ca//etc/motd
+ftp://name@faikaj4t.dom/%2Fetc/motd	ftp://name@faikaj4t.dom//etc/motd
+
+#keyword:sportscar		!sportscar
+
+# test redirector pattern
+http://www.nate.com/r/DM03/n%65verp4%79re%74%61%69%6c%2eco%6d/%62%61m/?m%61%6e=%6Di%634%39	http://neverp4yretail.com/bam/[?]man=mic49
+
+# test ignoring text portion of multipart with an html part
+http://www.nowhereinthetestdata.com		!http://www.dontputhisinthetestdata.com
+
+# questionable tests
+
+mailto://cah3neun@thaihe4d.com		mailto://cah3neun@thaihe4d.com
+mailto://jicu8vah@another@jicu8vah	jicu8vah@another@jicu8vah
+

Propchange: spamassassin/trunk/t/uri_html.t
------------------------------------------------------------------------------
    svn:executable = *



Re: svn commit: r169334 - in /spamassassin/trunk: MANIFEST lib/Mail/SpamAssassin/Conf.pm lib/Mail/SpamAssassin/HTML.pm lib/Mail/SpamAssassin/PerMsgStatus.pm lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm lib/Mail/SpamAssassin/Util.pm rules/20_uri_tests.cf t/uri.t t/uri_html.t

Posted by Theo Van Dinter <fe...@kluge.net>.
On Mon, May 09, 2005 at 08:37:06PM -0700, Justin Mason wrote:
> As for the rest -- let's just check it in and hack away as Sidney
> suggested. ;)

Yeah, that's fine.  I just didn't want to start making changes and
stepping on toes.  Looking over the code some more, I don't see a reason
to not just do the HTML canonification in PMS::extract_message_metadata()
(simplifying down get_uri_list()), so I'll try to free up some time this
afternoon and get something put together.

-- 
Randomly Generated Tagline:
"... and on that side you have a 50kg kid, and that's a pretty good sized
  kid..."                  - Prof. Farr

Re: svn commit: r169334 - in /spamassassin/trunk: MANIFEST lib/Mail/SpamAssassin/Conf.pm lib/Mail/SpamAssassin/HTML.pm lib/Mail/SpamAssassin/PerMsgStatus.pm lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm lib/Mail/SpamAssassin/Util.pm rules/20_uri_tests.cf t/uri.t t/uri_html.t

Posted by Sidney Markowitz <si...@sidney.com>.
Theo Van Dinter wrote:
> Sorry to be a killjoy here.

I have no problem with the criticism, but I think I've hit the end of what
I'm going to do on this one now that it's working without breaking anything.
I'm running out of time for some schoolwork that's due in a month and will
have to concentrate on that.

The changes you are talking about are about cleaner design, and I'm +1 for
that. And now is a good time to do it, while the issues are fresh, so that
it doesn't become some awkward design embedded in old code. But I won't be
making those changes. We're still in CTR mode in 3.1, right? I was acting
like it was RTC on this one because it felt right to get feedback first when
it looked like it would require some changes to the object design, but I
don't think it needs a lot of discussion for the last cleanup, so go to it.

 -- sidney

Re: svn commit: r169334 - in /spamassassin/trunk: MANIFEST lib/Mail/SpamAssassin/Conf.pm lib/Mail/SpamAssassin/HTML.pm lib/Mail/SpamAssassin/PerMsgStatus.pm lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm lib/Mail/SpamAssassin/Util.pm rules/20_uri_tests.cf t/uri.t t/uri_html.t

Posted by Theo Van Dinter <fe...@kluge.net>.
Sorry to be a killjoy here.

On Mon, May 09, 2005 at 03:55:07PM -0000, sidney@apache.org wrote:
> +  my $redirector_patterns = $self->{conf}->{redirector_patterns};
> +  @uris = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, @uris);

Do we consider uri_list_canonify() to be a public function?  If so, there
needs to be some form of backward compatibility maintained.  Since there
seems to be no POD for Util.pm at all, one could read that to mean it's
all considered private, but we never did finish going through and changing
the private function names so it's not clear now.

> -  my @parsed = $scanner->get_parsed_uri_list();
> +  my @parsed = $scanner->get_uri_list();

-0.7

URIBL now considers the full list of URIs as having been parsed out of
the rendered text, which messes up the priority levels somewhat.  A case can
be made that the higher priority domains will already be on the list, but it
does mean more work for the plugin (more URIs to go through).

If we're not changing get_uri_list() (or more likely making a new
function) to return a combined uri_detail-esque dataset, then I'd like
to see get_parsed_uri_list() left alone (ie: let it do the canonification
and get_uri_list() can skip doing it), and just add a call into URIBL to
get_uri_list (we don't care about the output) to do the canonification
of the HTML bits.

Arguably, we could just have a new "_canonify_uri_detail()" call in
PMS and avoid the rest of the get_uri_list() stuff, but ...  We know
get_uri_list() is being called elsewhere anyway, so it's not a big deal IMO.

-- 
Randomly Generated Tagline:
Even backwards, A TOYOTA is still A TOYOTA.