You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2022/05/01 17:51:21 UTC

svn commit: r1900462 - in /spamassassin/trunk: UPGRADE lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm sql/decodeshorturl_mysql.sql sql/decodeshorturl_pg.sql sql/decodeshorturl_sqlite.sql t/data/spam/decodeshorturl/base.eml t/decodeshorturl.t

Author: hege
Date: Sun May  1 17:51:21 2022
New Revision: 1900462

URL: http://svn.apache.org/viewvc?rev=1900462&view=rev
Log:
Clean up DecodeShortURLs code. Add MySQL/Postgres support.

Added:
    spamassassin/trunk/sql/decodeshorturl_pg.sql
Modified:
    spamassassin/trunk/UPGRADE
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm
    spamassassin/trunk/sql/decodeshorturl_mysql.sql
    spamassassin/trunk/sql/decodeshorturl_sqlite.sql
    spamassassin/trunk/t/data/spam/decodeshorturl/base.eml
    spamassassin/trunk/t/decodeshorturl.t

Modified: spamassassin/trunk/UPGRADE
URL: http://svn.apache.org/viewvc/spamassassin/trunk/UPGRADE?rev=1900462&r1=1900461&r2=1900462&view=diff
==============================================================================
--- spamassassin/trunk/UPGRADE (original)
+++ spamassassin/trunk/UPGRADE Sun May  1 17:51:21 2022
@@ -196,6 +196,8 @@ Note for Users Upgrading to SpamAssassin
 
 - New DMARC policy check plugin
 
+- New DecodeShortURLs plugin
+
 Note for Users Upgrading to SpamAssassin 3.4.5
 ----------------------------------------------
 

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm?rev=1900462&r1=1900461&r2=1900462&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm Sun May  1 17:51:21 2022
@@ -94,7 +94,6 @@ use vars qw(@ISA);
 my $VERSION = 0.12;
 
 use constant HAS_LWP_USERAGENT => eval { require LWP::UserAgent; };
-use constant HAS_DBI => eval { require DBI; };
 
 sub dbg { my $msg = shift; return Mail::SpamAssassin::Logger::dbg("DecodeShortURLs: $msg", @_); }
 sub info { my $msg = shift; return Mail::SpamAssassin::Logger::info("DecodeShortURLs: $msg", @_); }
@@ -108,17 +107,12 @@ sub new {
   bless ($self, $class);
 
   if ($mailsaobject->{local_tests_only} || !HAS_LWP_USERAGENT) {
+    dbg("local tests only, disabling checks");
     $self->{disabled} = 1;
-  } else {
-    $self->{disabled} = 0;
   }
-
-  unless ($self->{disabled}) {
-    $self->{ua} = new LWP::UserAgent;
-    $self->{ua}->{max_redirect} = 0;
-    $self->{ua}->{timeout} = 5;
-    $self->{ua}->env_proxy;
-    $self->{caching} = 0;
+  elsif (!HAS_LWP_USERAGENT) {
+    dbg("module LWP::UserAgent not installed, disabling checks");
+    $self->{disabled} = 1;
   }
 
   $self->set_config($mailsaobject->{conf});
@@ -160,7 +154,7 @@ sub set_config {
     default => {},
     code => sub {
       my ($self, $key, $value, $line) = @_;
-      if ($value =~ /^$/) {
+      if ($value eq '') {
         return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
       }
       foreach my $domain (split(/\s+/, $value)) {
@@ -173,11 +167,12 @@ sub set_config {
 
 =item url_shortener_cache_type     (default: none)
 
-The specific type of cache type that is being utilized. Currently only sqlite
-is configured however plans to support redis cache is planned.
+The cache type that is being utilized.  Currently only supported value is
+C<dbi> that implies C<url_shortener_cache_dsn> is a DBI connect string.
+DBI module is required.
 
 Example:
-url_shortener_cache_type sqlite
+url_shortener_cache_type dbi
 
 =back
 
@@ -185,7 +180,7 @@ url_shortener_cache_type sqlite
 
   push (@cmds, {
     setting => 'url_shortener_cache_type',
-    default => undef,
+    default => '',
     is_priv => 1,
     type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
   });
@@ -194,15 +189,22 @@ url_shortener_cache_type sqlite
 
 =item url_shortener_cache_dsn		(default: none)
 
-The dsn to a database file to write cache entries to.  The database will
-be created automatically if is does not already exist but the supplied path
-and file must be read/writable by the user running spamassassin or spamd.
+The DBI dsn of the database to use.
 
-Note: You will need to have the proper DBI version of the cache type installed.
+For SQLite, the database will be created automatically if it does not
+already exist, the supplied path and file must be read/writable by the
+user running spamassassin or spamd.
 
-Example:
+For MySQL/MariaDB or PostgreSQL, see sql-directory for database table
+creation clauses.
+
+You will need to have the proper DBI module for your database.  For example
+DBD::SQLite, DBD::mysql, DBD::MariaDB or DBD::Pg.
 
-url_shortener_cache_dsn dbi:SQLite:dbname=/tmp/DecodeShortURLs.sq3
+Minimum required SQLite version is 3.24.0 (available from DBD::SQLite 1.59).
+
+Examples:
+url_shortener_cache_dsn dbi:SQLite:dbname=/var/lib/spamassassin/DecodeShortURLs.db
 
 =back
 
@@ -219,7 +221,8 @@ url_shortener_cache_dsn dbi:SQLite:dbnam
 
 =item url_shortener_cache_username  (default: none)
 
-The username that should be used to connect to the database.
+The username that should be used to connect to the database.  Not used for
+SQLite.
 
 =back
 
@@ -236,7 +239,8 @@ The username that should be used to conn
 
 =item url_shortener_cache_password  (default: none)
 
-The password that should be used to connect to the database.
+The password that should be used to connect to the database.  Not used for
+SQLite.
 
 =back
 
@@ -256,12 +260,7 @@ The password that should be used to conn
 The length of time a cache entry will be valid for in seconds.
 Default is 86400 (1 day).
 
-NOTE: you will also need to run the following via cron to actually remove the
-records from the database:
-
-echo "DELETE FROM short_url_cache WHERE modified < NOW() - C<ttl>; | sqlite3 /path/to/database"
-
-NOTE: replace C<ttl> above with the same value you use for this option
+See C<url_shortener_cache_autoclean> for database cleaning.
 
 =back
 
@@ -276,6 +275,28 @@ NOTE: replace C<ttl> above with the same
 
 =over 4
 
+=item url_shortener_cache_autoclean	(default: 1000)
+
+Automatically purge old entries from database.  Value describes a random run
+chance of 1/x.  The default value of 1000 means that cleaning is run
+approximately once for every 1000 messages processed.  Value of 1 would mean
+database is cleaned every time a message is processed.
+
+Set 0 to disable automatic cleaning and to do it manually.
+
+=back
+
+=cut
+
+  push (@cmds, {
+    setting => 'url_shortener_cache_autoclean',
+    is_admin => 1,
+    default => 1000,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+  });
+
+=over 4
+
 =item url_shortener_loginfo           (default: 0 (off))
 
 If this option is enabled (set to 1), then short URLs and the decoded URLs will be logged with info priority.
@@ -312,37 +333,156 @@ The max depth of short urls that will be
 }
 
 sub initialise_url_shortener_cache {
-  my($self, $opts) = @_;
+  my ($self, $conf) = @_;
 
-  if($self->{url_shortener_cache_type} eq "dbi" && defined $self->{url_shortener_cache_dsn} && HAS_DBI) {
-    $self->{url_shortener_dbi_cache} = $self->{url_shortener_cache_dsn};
-    return _connect_dbi_cache($self, $opts);
-  } else {
-    warn "Wrong cache type selected";
+  return if $self->{dbh};
+  return if !$conf->{url_shortener_cache_type};
+
+  if (!$conf->{url_shortener_cache_dsn}) {
+    warn "DecodeShortURLs: invalid cache configuration\n";
     return;
   }
-}
 
-sub _connect_dbi_cache {
-  my($self, $opts) = @_;
-
-  # Initialise cache if enabled
-  if ($self->{url_shortener_dbi_cache} && HAS_DBI) {
+  ##
+  ## SQLite
+  ## 
+  if ($conf->{url_shortener_cache_type} =~ /^(?:dbi|sqlite)$/i
+      && $conf->{url_shortener_cache_dsn} =~ /^dbi:SQLite/)
+  {
     eval {
       local $SIG{'__DIE__'};
+      require DBI;
+      require DBD::SQLite;
+      DBD::SQLite->VERSION(1.59_01); # Required for ON CONFLICT
       $self->{dbh} = DBI->connect_cached(
-        $self->{url_shortener_cache_dsn},
-        $self->{url_shortener_cache_username},
-        $self->{url_shortener_cache_password},
-        {RaiseError => 1, PrintError => 0, InactiveDestroy => 1}
-      ) or die $!;
+        $conf->{url_shortener_cache_dsn}, '', '',
+        {RaiseError => 1, PrintError => 0, InactiveDestroy => 1, AutoCommit => 1}
+      );
+      $self->{dbh}->do("
+        CREATE TABLE IF NOT EXISTS short_url_cache (
+          short_url   TEXT PRIMARY KEY NOT NULL,
+          decoded_url TEXT NOT NULL,
+          hits        INTEGER NOT NULL DEFAULT 1,
+          created     INTEGER NOT NULL DEFAULT (strftime('%s','now')),
+          modified    INTEGER NOT NULL DEFAULT (strftime('%s','now'))
+        )
+      ");
+      # Maintaining index for cleaning is likely more expensive than occasional full table scan
+      #$self->{dbh}->do("
+      #  CREATE INDEX IF NOT EXISTS short_url_modified
+      #    ON short_url_cache(modified)
+      #");
+      $self->{sth_insert} = $self->{dbh}->prepare("
+        INSERT INTO short_url_cache (short_url, decoded_url, modified)
+        VALUES (?,?,strftime('%s','now'))
+        ON CONFLICT(short_url) DO UPDATE
+          SET decoded_url = excluded.decoded_url,
+              modified = excluded.modified,
+              hits = hits + 1
+      ");
+      $self->{sth_select} = $self->{dbh}->prepare("
+        SELECT decoded_url FROM short_url_cache
+        WHERE short_url = ? AND modified >= strftime('%s','now') - $conf->{url_shortener_cache_ttl}
+      ");
+      $self->{sth_delete} = $self->{dbh}->prepare("
+        DELETE FROM short_url_cache
+        WHERE modified < strftime('%s','now') - $conf->{url_shortener_cache_ttl}
+      ");
     };
     if ($@) {
-      dbg("warn: $@");
-    } else {
-      $self->{caching} = 1;
+      warn "DecodeShortURLs: cache connect failed: $@\n";
+      undef $self->{dbh};
+      undef $self->{sth_insert};
+      undef $self->{sth_select};
+      undef $self->{sth_delete};
+    }
+  }
+  ##
+  ## MySQL/MariaDB
+  ## 
+  elsif (lc $conf->{url_shortener_cache_type} eq 'dbi'
+      && $conf->{url_shortener_cache_dsn} =~ /^dbi:(?:mysql|MariaDB)/)
+  {
+    eval {
+      local $SIG{'__DIE__'};
+      require DBI;
+      $self->{dbh} = DBI->connect_cached(
+        $conf->{url_shortener_cache_dsn},
+        $conf->{url_shortener_cache_username},
+        $conf->{url_shortener_cache_password},
+        {RaiseError => 1, PrintError => 0, InactiveDestroy => 1, AutoCommit => 1}
+      );
+      $self->{sth_insert} = $self->{dbh}->prepare("
+        INSERT INTO short_url_cache (short_url, decoded_url, modified)
+        VALUES (?,?,NOW())
+        ON DUPLICATE KEY UPDATE
+          decoded_url = VALUES(decoded_url),
+          modified = VALUES(modified),
+          hits = hits + 1
+      ");
+      $self->{sth_select} = $self->{dbh}->prepare("
+        SELECT decoded_url FROM short_url_cache
+        WHERE short_url = ? AND modified >= DATE_SUB(NOW(), INTERVAL $conf->{url_shortener_cache_ttl} SECOND)
+      ");
+      $self->{sth_delete} = $self->{dbh}->prepare("
+        DELETE FROM short_url_cache
+        WHERE modified < DATE_SUB(NOW(), INTERVAL $conf->{url_shortener_cache_ttl} SECOND)
+      ");
+    };
+    if ($@) {
+      warn "DecodeShortURLs: cache connect failed: $@\n";
+      undef $self->{dbh};
+      undef $self->{sth_insert};
+      undef $self->{sth_select};
+      undef $self->{sth_delete};
     }
   }
+  ##
+  ## PostgreSQL
+  ## 
+  elsif (lc $conf->{url_shortener_cache_type} eq 'dbi'
+      && $conf->{url_shortener_cache_dsn} =~ /^dbi:Pg/)
+  {
+    eval {
+      local $SIG{'__DIE__'};
+      require DBI;
+      $self->{dbh} = DBI->connect_cached(
+        $conf->{url_shortener_cache_dsn},
+        $conf->{url_shortener_cache_username},
+        $conf->{url_shortener_cache_password},
+        {RaiseError => 1, PrintError => 0, InactiveDestroy => 1, AutoCommit => 1}
+      );
+      $self->{sth_insert} = $self->{dbh}->prepare("
+        INSERT INTO short_url_cache (short_url, decoded_url, modified)
+        VALUES (?,?,NOW())
+        ON CONFLICT (short_url) DO UPDATE SET
+          decoded_url = EXCLUDED.decoded_url,
+          modified = EXCLUDED.modified,
+          hits = short_url_cache.hits + 1
+      ");
+      $self->{sth_select} = $self->{dbh}->prepare("
+        SELECT decoded_url FROM short_url_cache
+        WHERE short_url = ? AND modified >= NOW() - INTERVAL '$conf->{url_shortener_cache_ttl} SECONDS'
+      ");
+      $self->{sth_delete} = $self->{dbh}->prepare("
+        DELETE FROM short_url_cache
+        WHERE modified < NOW() - INTERVAL '$conf->{url_shortener_cache_ttl} SECONDS'
+      ");
+    };
+    if ($@) {
+      warn "DecodeShortURLs: cache connect failed: $@\n";
+      undef $self->{dbh};
+      undef $self->{sth_insert};
+      undef $self->{sth_select};
+      undef $self->{sth_delete};
+    }
+  ##
+  ## ...
+  ##
+  } else {
+    warn "DecodeShortURLs: invalid cache configuration\n";
+    return;
+  }
 }
 
 sub short_url {
@@ -381,239 +521,201 @@ sub short_url_loop {
   return $pms->{short_url_loop};
 }
 
+sub _check_shortener_uri {
+  my ($uri, $conf) = @_;
+
+  return 0 unless $uri =~ m{^
+    https?://		# Only http
+    (?:[^\@/?#]*\@)?	# Ignore user:pass@
+    ([^/?#:]+)		# (Capture hostname)
+    (?::\d+)?		# Possible port
+    .*?\w		# Some path wanted
+    }ix;
+  my $host = lc $1;
+  if (exists $conf->{url_shorteners}->{$host}) {
+    return 1;
+  }
+  # if domain is a 3rd level domain check if there is a url shortener
+  # on the 2nd level tld
+  elsif ($host =~ /^(?!www)[^.]+(\.[^.]+\.[^.]+)$/i &&
+           exists $conf->{url_shorteners}->{$1}) {
+    return 1;
+  }
+  return 0;
+}
+
 sub check_dnsbl {
   my ($self, $opts) = @_;
-  my $pms = $opts->{permsgstatus};
-  my $msg = $opts->{msg};
 
   return if $self->{disabled};
 
-  # don't keep dereferencing these
-  $self->{url_shorteners} = $pms->{main}->{conf}->{url_shorteners};
-  if(defined $pms->{main}->{conf}->{url_shortener_cache_type}) {
-    $self->{url_shortener_cache_type} = $pms->{main}->{conf}->{url_shortener_cache_type};
-    $self->{url_shortener_cache_dsn} = $pms->{main}->{conf}->{url_shortener_cache_dsn};
-    $self->{url_shortener_cache_username} = $pms->{main}->{conf}->{url_shortener_cache_username};
-    $self->{url_shortener_cache_password} = $pms->{main}->{conf}->{url_shortener_cache_password};
-    $self->{url_shortener_cache_ttl} = $pms->{main}->{conf}->{url_shortener_cache_ttl};
-  }
-  $self->{url_shortener_loginfo} = $pms->{main}->{conf}->{url_shortener_loginfo};
+  my $pms = $opts->{permsgstatus};
+  my $conf = $pms->{conf};
 
   # Sort short URLs into hash to de-dup them
   my %short_urls;
   my $uris = $pms->get_uri_detail_list();
-  my $tldsRE = $self->{main}->{registryboundaries}->{valid_tlds_re};
   while (my($uri, $info) = each %{$uris}) {
-    next unless ($info->{domains});
-    foreach ( keys %{ $info->{domains} } ) {
-      if (exists $self->{url_shorteners}->{lc $_}) {
-        # NOTE: $info->{domains} appears to contain all the domains parsed
-        # from the single input URI with no way to work out what the base
-        # domain is.  So to prevent someone from stuffing the URI with a
-        # shortener to force this plug-in to follow a link that *isn't* on
-        # the list of shorteners; we enforce that the shortener must be the
-        # base URI and that a path must be present.
-        if ($uri !~ /^https?:\/\/(?:www\.)?$_\/.+$/i) {
-          dbg("Discarding URI: $uri");
-          next;
-        }
-        $short_urls{$uri} = 1;
-        next;
-      } elsif(/^(?!www)[a-z\d._-]{0,251}\.([a-z\d._-]{0,251}\.${tldsRE})/) {
-        # if domain is a 3rd level domain check if there is a url shortener
-        # on the 2nd level tld
-        my $dom = '.' . $1;
-        if (exists $self->{url_shorteners}->{$dom}) {
-          if ($uri !~ /^https?:\/\/(?:www\.)?$_\/.+$/i) {
-            dbg("Discarding URI: $uri");
-            next;
-          }
-          $short_urls{$uri} = 1;
-          next;
-        }
-      }
+    next unless $info->{domains} && $info->{cleaned};
+    if (_check_shortener_uri($uri, $conf)) {
+      $short_urls{$uri} = 1;
     }
   }
 
   # Make sure we have some work to do
   # Before we open any log files etc.
-  my $count = scalar keys %short_urls;
-  return unless $count gt 0;
+  return unless %short_urls;
+
+  # Initialize cache
+  $self->initialise_url_shortener_cache($conf);
 
-  $self->initialise_url_shortener_cache($opts) if defined $self->{url_shortener_cache_type};
+  # Initialize LWP
+  my $ua = LWP::UserAgent->new();
+  $ua->{max_redirect} = 0;
+  $ua->{timeout} = 5;
+  $ua->env_proxy;
 
-  my $max_short_urls = $pms->{main}->{conf}->{max_short_urls};
+  # Launch HTTP queries
+  my $lookups = 0;
   foreach my $short_url (keys %short_urls) {
-    next if $max_short_urls <= 0;
-    my $location = $self->recursive_lookup($short_url, $pms);
-    $max_short_urls--;
+    $self->recursive_lookup($short_url, $pms, $ua);
+    last if ++$lookups >= $conf->{max_short_urls};
+  }
+
+  # Automatically purge old entries
+  if ($self->{dbh} && $conf->{url_shortener_cache_autoclean}
+      && rand() < 1/$conf->{url_shortener_cache_autoclean})
+  {
+    dbg("cleaning stale cache entries");
+    eval { $self->{sth_delete}->execute(); };
+    if ($@) { dbg("cache cleaning failed: $@"); }
   }
 }
 
 sub recursive_lookup {
-  my ($self, $short_url, $pms, %been_here) = @_;
+  my ($self, $short_url, $pms, $ua, %been_here) = @_;
+  my $conf = $pms->{conf};
 
   my $count = scalar keys %been_here;
-  dbg("Redirection count $count") if $count gt 0;
+  dbg("redirection count $count") if $count;
   if ($count >= 10) {
-    dbg("Error: more than 10 shortener redirections");
+    dbg("found more than 10 shortener redirections");
     # Fire test
-    $self->{short_url_maxchain} = 1;
+    $pms->{short_url_maxchain} = 1;
     return;
   }
 
   my $location;
-  if ($self->{caching} && ($location = $self->cache_get($short_url))) {
-    if ($self->{url_shortener_loginfo}) {
-      info("Found cached $short_url => $location");
+  if (defined($location = $self->cache_get($short_url))) {
+    if ($conf->{url_shortener_loginfo}) {
+      info("found cached $short_url => $location");
     } else {
-      dbg("Found cached $short_url => $location");
+      dbg("found cached $short_url => $location");
     }
   } else {
     # Not cached; do lookup
-    if($count eq 0) {
-      undef $pms->{short_url_200};
-      undef $pms->{short_url_404};
-      undef $pms->{short_url_chained};
-    }
-    my $response = $self->{ua}->head($short_url);
+    my $response = $ua->head($short_url);
     if (!$response->is_redirect) {
       dbg("URL is not redirect: $short_url = ".$response->status_line);
-      $pms->{short_url_200} = 1 if($response->code == '200');
-      $pms->{short_url_404} = 1 if($response->code == '404');
+      if ($response->code eq '200') {
+        $pms->{short_url_200} = 1;
+      } elsif ($response->code eq '404') {
+        $pms->{short_url_404} = 1;
+      }
       return;
     }
     $location = $response->headers->{location};
-    # Bail out if $short_url redirects to itself
-    return if ($short_url eq $location);
-    if ($self->{caching}) {
-      if ($self->cache_add($short_url, $location)) {
-        dbg("Added $short_url to cache");
-      } else {
-        dbg("Cannot add $short_url to cache");
-      }
-    }
-    if($self->{url_shortener_loginfo}) {
-      info("Found $short_url => $location");
+    if ($self->{url_shortener_loginfo}) {
+      info("found $short_url => $location");
     } else {
-      dbg("Found $short_url => $location");
+      dbg("found $short_url => $location");
     }
   }
 
+  # Update cache
+  $self->cache_add($short_url, $location);
+
+  # Bail out if $short_url redirects to itself
+  if ($short_url eq $location) {
+    dbg("URL is redirect to itself");
+    return;
+  }
+
   # At this point we have a new URL in $response
   $pms->{short_url} = 1;
 
   # Set chained here otherwise we might mark a disabled page or
   # redirect back to the same host as chaining incorrectly.
-  $pms->{short_url_chained} = 1 if $count > 0;
-
-  $pms->add_uri_detail_list($location);
+  $pms->{short_url_chained} = 1 if $count;
 
   # Check if we are being redirected to a local page
   # Don't recurse in this case...
-  if($location !~ /^https?:/) {
-    my($host) = ($short_url =~ /^(https?:\/\/\S+)\//);
-    $location = "$host/$location";
-    dbg("Looks like a local redirection: $short_url => $location");
-    $pms->add_uri_detail_list($location);
-    return $location;
-  }
-
-  # Check for recursion
-  if ((my ($domain) = ($location =~ /^https?:\/\/(\S+)\//))) {
-    if (exists $been_here{$location}) {
-      # Loop detected
-      dbg("Error: loop detected");
-      $self->{short_url_loop} = 1;
-      return $location;
+  if ($location !~ m{^[a-z]+://}i) {
+    my $orig_location = $location;
+    my $orig_short_url = $short_url;
+    # Strip to..
+    if (index($location, '/') == 0) {
+      $short_url =~ s{^([a-z]+://.*?)[/?#].*}{$1}; # ..absolute path
     } else {
-      my $tldsRE = $self->{main}->{registryboundaries}->{valid_tlds_re};
-      if (exists $self->{url_shorteners}->{$domain}) {
-        $been_here{$location} = 1;
-        # Recurse...
-        return $self->recursive_lookup($location, $pms, %been_here);
-      } elsif($domain =~ /^(?!www)[a-z\d._-]{0,251}\.([a-z\d._-]{0,251}\.${tldsRE})/) {
-        # if domain is a 3rd level domain check if there is a url shortener
-        # on the 2nd level tld
-        my $dom = '.' . $1;
-        if (exists $self->{url_shorteners}->{$dom}) {
-          $been_here{$location} = 1;
-          # Recurse...
-          return $self->recursive_lookup($location, $pms, %been_here);
-        }
-      }
+      $short_url =~ s{^([a-z]+://.*)/}{$1}; # ..relative path
     }
+    $location = "$short_url/$location";
+    dbg("looks like a local redirection: $orig_short_url => $location ($orig_location)");
+    $pms->add_uri_detail_list($location) if !$pms->{uri_detail_list}->{$location};
+    return;
   }
 
-  # No recursion; just return the final location...
-  return $location;
+  if (exists $been_here{$location}) {
+    # Loop detected
+    dbg("error: loop detected: $location");
+    $pms->{short_url_loop} = 1;
+    return;
+  }
+  $been_here{$location} = 1;
+  $pms->add_uri_detail_list($location) if !$pms->{uri_detail_list}->{$location};
+
+  # Check for recursion
+  if (_check_shortener_uri($location, $conf)) {
+    # Recurse...
+    $self->recursive_lookup($location, $pms, $ua, %been_here);
+  }
 }
 
 sub cache_add {
   my ($self, $short_url, $decoded_url) = @_;
-  return 0 if not $self->{caching};
 
-  return 0 if((length($short_url) > 256) or (length($decoded_url) > 512));
+  return if !$self->{dbh};
+  return if length($short_url) > 256 || length($decoded_url) > 512;
 
-  eval {
-    $self->{sth_insert} = $self->{dbh}->prepare_cached("
-      INSERT INTO short_url_cache (short_url, decoded_url, created, modified)
-      VALUES (?,?,?,?)
-    ");
-  };
+  eval { $self->{sth_insert}->execute($short_url, $decoded_url); };
   if ($@) {
-    dbg("warn: $@");
-    return 0;
-  };
+    dbg("could not add to cache: $@");
+  }
 
-  $self->{sth_insert}->execute($short_url, $decoded_url, time(), time());
-  return 1;
+  return;
 }
 
 sub cache_get {
   my ($self, $key) = @_;
-  return if not $self->{caching};
 
-  eval {
-    $self->{sth_select} = $self->{dbh}->prepare_cached("
-      SELECT decoded_url FROM short_url_cache
-      WHERE short_url = ? AND modified > ?
-    ");
-  };
-  if ($@) {
-   dbg("warn: $@");
-   return;
-  }
+  return if !$self->{dbh};
 
-  eval {
-    $self->{sth_update} = $self->{dbh}->prepare_cached("
-      UPDATE short_url_cache
-      SET modified=?, hits=hits+1
-      WHERE short_url = ?
-    ");
-  };
+  eval { $self->{sth_select}->execute($key); };
   if ($@) {
-   dbg("warn: $@");
-   return;
+    dbg("cache get failed: $@");
+    return;
   }
 
-  my $tcheck = time() - $self->{url_shortener_cache_ttl};
-  $self->{sth_select}->execute($key, $tcheck);
-  my $row = $self->{sth_select}->fetchrow_array();
-  if($row) {
-    # Found cache entry; touch it to prevent expiry
-    $self->{sth_update}->execute(time(),$key);
-    $self->{sth_select}->finish();
-    $self->{sth_update}->finish();
-    return $row;
+  my @row = $self->{sth_select}->fetchrow_array();
+  if (@row) {
+    return $row[0];
   }
 
-  $self->{sth_select}->finish();
-  $self->{sth_update}->finish();
   return;
 }
 
 # Version features
 sub has_short_url { 1 }
+sub has_autoclean { 1 }
 
 1;

Modified: spamassassin/trunk/sql/decodeshorturl_mysql.sql
URL: http://svn.apache.org/viewvc/spamassassin/trunk/sql/decodeshorturl_mysql.sql?rev=1900462&r1=1900461&r2=1900462&view=diff
==============================================================================
--- spamassassin/trunk/sql/decodeshorturl_mysql.sql (original)
+++ spamassassin/trunk/sql/decodeshorturl_mysql.sql Sun May  1 17:51:21 2022
@@ -1,10 +1,10 @@
 CREATE TABLE `short_url_cache`
-( `short_url` VARCHAR(256) NOT NULL ,
-  `decoded_url` VARCHAR(512) NOT NULL ,
-  `hits` MEDIUMINT NOT NULL DEFAULT 1,
-  `created` INT(11) NOT NULL ,
-  `modified` INT(11) NOT NULL ,
+( `short_url` VARCHAR(256) NOT NULL,
+  `decoded_url` VARCHAR(512) NOT NULL,
+  `hits` INT NOT NULL DEFAULT 1,
+  `created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+  `modified` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
   PRIMARY KEY (`short_url`)
 ) ENGINE = InnoDB;
-ALTER TABLE `short_url_cache` ADD INDEX `short_url_by_modified` (`short_url`, `modified`);
-ALTER TABLE `short_url_cache` ADD INDEX `short_url_modified` (`modified`);
+-- Maintaining index for cleaning is likely more expensive than occasional full table scan
+-- ALTER TABLE `short_url_cache` ADD INDEX `short_url_modified` (`modified`);

Added: spamassassin/trunk/sql/decodeshorturl_pg.sql
URL: http://svn.apache.org/viewvc/spamassassin/trunk/sql/decodeshorturl_pg.sql?rev=1900462&view=auto
==============================================================================
--- spamassassin/trunk/sql/decodeshorturl_pg.sql (added)
+++ spamassassin/trunk/sql/decodeshorturl_pg.sql Sun May  1 17:51:21 2022
@@ -0,0 +1,10 @@
+CREATE TABLE short_url_cache (
+  short_url VARCHAR(256) NOT NULL,
+  decoded_url VARCHAR(512) NOT NULL,
+  hits INT NOT NULL DEFAULT 1,
+  created TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+  modified TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+  PRIMARY KEY (short_url)
+);
+-- Maintaining index for cleaning is likely more expensive than occasional full table scan
+-- ALTER TABLE short_url_cache ADD INDEX short_url_modified (modified);

Modified: spamassassin/trunk/sql/decodeshorturl_sqlite.sql
URL: http://svn.apache.org/viewvc/spamassassin/trunk/sql/decodeshorturl_sqlite.sql?rev=1900462&r1=1900461&r2=1900462&view=diff
==============================================================================
--- spamassassin/trunk/sql/decodeshorturl_sqlite.sql (original)
+++ spamassassin/trunk/sql/decodeshorturl_sqlite.sql Sun May  1 17:51:21 2022
@@ -1,11 +1,2 @@
-CREATE TABLE short_url_cache (
-  short_url   TEXT PRIMARY KEY NOT NULL,
-  decoded_url TEXT NOT NULL,
-  hits        INTEGER NOT NULL DEFAULT 1,
-  created     INTEGER NOT NULL ,
-  modified    INTEGER NOT NULL
-);
-CREATE INDEX short_url_by_modified
-  ON short_url_cache(short_url, modified);
-CREATE INDEX short_url_modified
-  ON short_url_cache(modified);
+-- Manual database creation for SQLite is not necessary,
+-- DecodeShortURLs plugin will create and clean database automatically.

Modified: spamassassin/trunk/t/data/spam/decodeshorturl/base.eml
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/data/spam/decodeshorturl/base.eml?rev=1900462&r1=1900461&r2=1900462&view=diff
==============================================================================
--- spamassassin/trunk/t/data/spam/decodeshorturl/base.eml (original)
+++ spamassassin/trunk/t/data/spam/decodeshorturl/base.eml Sun May  1 17:51:21 2022
@@ -11,3 +11,7 @@ http://bit.ly/30yH6WK
 which should link to:
 
 https://spamassassin.apache.org/
+
+should get 404:
+http://tinyurl.com/qqqxxxyyyzzz
+

Modified: spamassassin/trunk/t/decodeshorturl.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/decodeshorturl.t?rev=1900462&r1=1900461&r2=1900462&view=diff
==============================================================================
--- spamassassin/trunk/t/decodeshorturl.t (original)
+++ spamassassin/trunk/t/decodeshorturl.t Sun May  1 17:51:21 2022
@@ -6,7 +6,7 @@ use SATest; sa_t_init("decodeshorturl");
 use Test::More;
 
 plan skip_all => "Net tests disabled"                unless conf_bool('run_net_tests');
-plan tests => 2;
+plan tests => 3;
 
 tstpre ("
 loadplugin Mail::SpamAssassin::Plugin::DecodeShortURLs
@@ -19,21 +19,23 @@ dns_query_restriction allow tinyurl.com
 url_shortener bit.ly
 url_shortener tinyurl.com
 
-ifplugin Mail::SpamAssassin::Plugin::DecodeShortURLs
-  body HAS_SHORT_URL              eval:short_url()
-  describe HAS_SHORT_URL          Message contains one or more shortened URLs
-
-  body SHORT_URL_CHAINED          eval:short_url_chained()
-  describe SHORT_URL_CHAINED      Message has shortened URL chained to other shorteners
-endif
+body HAS_SHORT_URL              eval:short_url()
+describe HAS_SHORT_URL          Message contains one or more shortened URLs
+
+body SHORT_URL_CHAINED          eval:short_url_chained()
+describe SHORT_URL_CHAINED      Message has shortened URL chained to other shorteners
+
+body SHORT_URL_404		eval:short_url_404()
+describe SHORT_URL_404		Short URL is invalid
 ");
 
 %patterns_url = (
-   q{ HAS_SHORT_URL } => 'Message contains one or more shortened URLs',
+   q{ 1.0 HAS_SHORT_URL } => 'Message contains one or more shortened URLs',
+   q{ 1.0 HAS_SHORT_404 } => 'Message contains one or more shortened URLs',
             );
 
 %patterns_url_chain = (
-   q{ SHORT_URL_CHAINED } => 'Message has shortened URL chained to other shorteners',
+   q{ 1.0 SHORT_URL_CHAINED } => 'Message has shortened URL chained to other shorteners',
             );
 
 %patterns = %patterns_url;