You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by gb...@apache.org on 2021/05/27 17:35:21 UTC

svn commit: r1890251 - in /spamassassin/trunk: ./ lib/Mail/SpamAssassin/Plugin/ rules/ sql/ t/ t/data/spam/decodeshorturl/

Author: gbechis
Date: Thu May 27 17:35:21 2021
New Revision: 1890251

URL: http://svn.apache.org/viewvc?rev=1890251&view=rev
Log:
Add DecodeShortURLs plugin to check urls hidden behind url shorteners

Added:
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm
    spamassassin/trunk/sql/decodeshorturl_mysql.sql
    spamassassin/trunk/sql/decodeshorturl_sqlite.sql
    spamassassin/trunk/t/data/spam/decodeshorturl/
    spamassassin/trunk/t/data/spam/decodeshorturl/base.eml
    spamassassin/trunk/t/data/spam/decodeshorturl/chain.eml
    spamassassin/trunk/t/decodeshorturl.t   (with props)
Modified:
    spamassassin/trunk/MANIFEST
    spamassassin/trunk/rules/v400.pre

Modified: spamassassin/trunk/MANIFEST
URL: http://svn.apache.org/viewvc/spamassassin/trunk/MANIFEST?rev=1890251&r1=1890250&r2=1890251&view=diff
==============================================================================
--- spamassassin/trunk/MANIFEST (original)
+++ spamassassin/trunk/MANIFEST Thu May 27 17:35:21 2021
@@ -82,6 +82,7 @@ lib/Mail/SpamAssassin/Plugin/BodyEval.pm
 lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
 lib/Mail/SpamAssassin/Plugin/Check.pm
 lib/Mail/SpamAssassin/Plugin/DCC.pm
+lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm
 lib/Mail/SpamAssassin/Plugin/DKIM.pm
 lib/Mail/SpamAssassin/Plugin/DNSEval.pm
 lib/Mail/SpamAssassin/Plugin/Esp.pm
@@ -227,6 +228,8 @@ sql/awl_mysql.sql
 sql/awl_pg.sql
 sql/bayes_mysql.sql
 sql/bayes_pg.sql
+sql/decodeshorturl_mysql.sql
+sql/decodeshorturl_sqlite.sql
 sql/userpref_mysql.sql
 sql/userpref_pg.sql
 sql/txrep_mysql.sql
@@ -405,6 +408,8 @@ t/data/spam/badmime3.txt
 t/data/spam/base64.txt
 t/data/spam/bsmtp
 t/data/spam/bsmtpnull
+t/data/spam/decodeshorturl/base.eml
+t/data/spam/decodeshorturl/chain.eml
 t/data/spam/dnsbl.eml
 t/data/spam/dnsbl_domsonly.eml
 t/data/spam/dnsbl_ipsonly.eml
@@ -473,6 +478,7 @@ t/db_based_whitelist.t
 t/db_based_whitelist_ips.t
 t/dcc.t
 t/debug.t
+t/decodeshorturl.t
 t/desc_wrap.t
 t/dkim.t
 t/dnsbl.t

Added: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm?rev=1890251&view=auto
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm (added)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm Thu May 27 17:35:21 2021
@@ -0,0 +1,526 @@
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+# Original code by Steve Freegard <st...@fsl.com>
+
+=head1 NAME
+
+DecodeShortURLs - Expand shortened URLs
+
+=head1 SYNOPSIS
+
+  loadplugin    Mail::SpamAssassin::Plugin::DecodeShortURLs
+
+  url_shortener bit.ly
+  url_shortener go.to
+  ...
+
+=head1 DESCRIPTION
+
+This plugin looks for URLs shortened by a list of URL shortening services and
+upon finding a matching URL will connect using to the shortening service and
+do an HTTP HEAD lookup and retrieve the location header which points to the
+actual shortened URL, it then adds this URL to the list of URIs extracted by
+SpamAssassin which can then be accessed by other plug-ins, such as URIDNSBL.
+
+This plugin also sets the rule HAS_SHORT_URL if any matching short URLs are
+found.
+
+Regular 'uri' rules can be used to detect and score links disabled by the
+shortening service for abuse and URL_BITLY_BLOCKED is supplied as an example.
+It should be safe to score this rule highly on a match as experience shows
+that bit.ly only blocks access to a URL if it has seen consistent abuse and
+problem reports.
+
+This plug-in will follow 'chained' shorteners e.g.
+from short URL to short URL to short URL and finally to the real URL
+
+
+If this form of chaining is found, then the rule 'SHORT_URL_CHAINED' will be
+fired.  If a loop is detected then 'SHORT_URL_LOOP' will be fired.
+This plug-in limits the number of chained shorteners to a maximim of 10 at
+which point it will fire the rule 'SHORT_URL_MAXCHAIN' and go no further.
+
+If a shortener returns a '404 Not Found' result for the short URL then the
+rule 'SHORT_URL_404' will be fired.
+
+If a shortener does not return an HTTP redirect, then a dynamic rule will
+be fired: 'SHORT_C<SHORTENER>_C<CODE>' where C<SHORTENER> is the uppercase
+name of the shortener with dots converted to underscores.  e.g.:
+'SHORT_T_CO_200' This is to handle the case of t.co which now returns an
+HTTP 200 and an abuse page instead of redirecting to an abuse page like
+every other shortener does...
+
+=head1 NOTES
+
+This plugin runs the parsed_metadata hook with a priority of -1 so that
+it may modify the parsed URI list prior to the URIDNSBL plugin which
+runs as priority 0.
+
+Currently the plugin queries a maximum of 10 distinct shortened URLs with
+a maximum timeout of 5 seconds per lookup.
+
+=head1 ACKNOWLEDGEMENTS
+
+A lot of this plugin has been hacked together by using other plugins as
+examples.  The author would particularly like to tip his hat to Karsten
+Bräckelmann for his work on GUDO.pm, the original version of this plugin
+could not have been developed without his code.
+
+=cut
+
+package Mail::SpamAssassin::Plugin::DecodeShortURLs;
+
+my $VERSION = 0.11;
+
+use Mail::SpamAssassin::Plugin;
+use strict;
+use warnings;
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin);
+
+use constant HAS_LWP_USERAGENT => eval { local $SIG{'__DIE__'}; require LWP::UserAgent; };
+use constant HAS_DBI => eval { local $SIG{'__DIE__'}; require DBI; };
+
+sub dbg {
+  my $msg = shift;
+  return Mail::SpamAssassin::Logger::dbg("DecodeShortURLs: $msg");
+}
+
+sub new {
+  my $class = shift;
+  my $mailsaobject = shift;
+
+  $class = ref($class) || $class;
+  my $self = $class->SUPER::new($mailsaobject);
+  bless ($self, $class);
+
+  if ($mailsaobject->{local_tests_only} || !HAS_LWP_USERAGENT) {
+    $self->{disabled} = 1;
+  } else {
+    $self->{disabled} = 0;
+  }
+
+  unless ($self->{disabled}) {
+    $self->{ua} = new LWP::UserAgent;
+    $self->{ua}->{max_redirect} = 0;
+    $self->{ua}->{timeout} = 5;
+    $self->{ua}->env_proxy;
+    $self->{caching} = 0;
+  }
+
+  $self->set_config($mailsaobject->{conf});
+  $self->register_method_priority ('parsed_metadata', -1);
+  $self->register_eval_rule('short_url_tests');
+
+  return $self;
+}
+
+sub set_config {
+  my($self, $conf) = @_;
+  my @cmds = ();
+
+  push (@cmds, {
+    setting => 'url_shortener',
+    default => {},
+    code => sub {
+      my ($self, $key, $value, $line) = @_;
+      if ($value =~ /^$/) {
+        return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
+      }
+      foreach my $domain (split(/\s+/, $value)) {
+        $self->{url_shorteners}->{lc $domain} = 1;
+      }
+    }
+  });
+
+=cut
+
+=head1 PRIVILEGED SETTINGS
+
+=over 4
+
+=item url_shortener_cache_type     (default: none)
+
+The specific type of cache type that is being utilized. Currently only sqlite
+is configured however plans to support redis cache is planned.
+
+Example:
+url_shortener_cache_type sqlite
+
+=cut
+
+  push (@cmds, {
+    setting => 'url_shortener_cache_type',
+    default => undef,
+    is_priv => 1,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
+  });
+
+=cut
+
+=item url_shortener_cache_dsn		(default: none)
+
+The dsn to a database file to write cache entries to.  The database will
+be created automatically if is does not already exist but the supplied path
+and file must be read/writable by the user running spamassassin or spamd.
+
+Note: You will need to have the proper DBI version of the cache type installed.
+
+Example:
+
+url_shortener_cache_dsn dbi:SQLite:dbname=/tmp/DecodeShortURLs.sq3
+
+=cut
+
+  push (@cmds, {
+    setting => 'url_shortener_cache_dsn',
+    default => '',
+    is_priv => 1,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
+  });
+
+=cut
+
+=item url_shortener_cache_username  (default: none)
+
+The username that should be used to connect to the database.
+
+=cut
+
+  push (@cmds, {
+    setting => 'url_shortener_cache_username',
+    default => '',
+    is_priv => 1,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
+  });
+
+=cut
+
+=item url_shortener_cache_password  (default: none)
+
+The password that should be used to connect to the database.
+
+=cut
+
+  push (@cmds, {
+    setting => 'url_shortener_cache_password',
+    default => '',
+    is_priv => 1,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
+  });
+
+=cut
+
+=item url_shortener_cache_ttl		(default: 86400)
+
+The length of time a cache entry will be valid for in seconds.
+Default is 86400 (1 day).
+
+NOTE: you will also need to run the following via cron to actually remove the
+records from the database:
+
+echo "DELETE FROM short_url_cache WHERE modified < NOW() - C<ttl>; | sqlite3 /path/to/database"
+
+NOTE: replace C<ttl> above with the same value you use for this option
+
+=cut
+
+  push (@cmds, {
+    setting => 'url_shortener_cache_ttl',
+    is_admin => 1,
+    default => 86400,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+  });
+
+=cut
+
+=item url_shortener_loginfo           (default: 0 (off))
+
+If this option is enabled (set to 1), then short URLs and the decoded URLs will be logged with info priority.
+
+=cut
+
+  push (@cmds, {
+    setting => 'url_shortener_loginfo',
+    is_admin => 1,
+    default => 0,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
+  });
+
+=cut
+
+=item max_short_urls                 (default: 10)
+
+The max depth of short urls that will be chained until it stops looking further.
+
+=cut
+
+  push (@cmds, {
+    setting => 'max_short_urls',
+    is_admin => 1,
+    default => 10,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+  });
+
+  $conf->{parser}->register_commands(\@cmds);
+}
+
+sub initialise_url_shortener_cache {
+  my($self, $opts) = @_;
+
+  if($self->{url_shortener_cache_type} eq "dbi" && defined $self->{url_shortener_cache_dsn} && HAS_DBI) {
+    $self->{url_shortener_dbi_cache} = $self->{url_shortener_cache_dsn};
+    return _connect_dbi_cache($self, $opts);
+  } else {
+    warn "Wrong cache type selected";
+    return undef;
+  }
+}
+
+sub _connect_dbi_cache {
+  my($self, $opts) = @_;
+
+  # Initialise cache if enabled
+  if ($self->{url_shortener_dbi_cache} && HAS_DBI) {
+    eval {
+      local $SIG{'__DIE__'};
+      $self->{dbh} = DBI->connect_cached($self->{url_shortener_cache_dsn},$self->{url_shortener_cache_username},$self->{url_shortener_cache_password}, {RaiseError => 1, PrintError => 0, InactiveDestroy => 1}) or die $!;
+    };
+    if ($@) {
+      dbg("warn: $@");
+    } else {
+      $self->{caching} = 1;
+    }
+  }
+}
+
+sub parsed_metadata {
+  my ($self, $opts) = @_;
+  my $pms = $opts->{permsgstatus};
+  my $msg = $opts->{msg};
+
+  return if $self->{disabled};
+
+  dbg ('warn: get_uri_detail_list() has been called already')
+    if exists $pms->{uri_detail_list};
+
+  # don't keep dereferencing these
+  $self->{url_shorteners} = $pms->{main}->{conf}->{url_shorteners};
+  if(defined $pms->{main}->{conf}->{url_shortener_cache_type}) {
+    $self->{url_shortener_cache_type} = $pms->{main}->{conf}->{url_shortener_cache_type};
+    $self->{url_shortener_cache_dsn} = $pms->{main}->{conf}->{url_shortener_cache_dsn};
+    $self->{url_shortener_cache_username} = $pms->{main}->{conf}->{url_shortener_cache_username};
+    $self->{url_shortener_cache_password} = $pms->{main}->{conf}->{url_shortener_cache_password};
+    $self->{url_shortener_cache_ttl} = $pms->{main}->{conf}->{url_shortener_cache_ttl};
+  }
+  $self->{url_shortener_loginfo} = $pms->{main}->{conf}->{url_shortener_loginfo};
+
+  # Sort short URLs into hash to de-dup them
+  my %short_urls;
+  my $uris = $pms->get_uri_detail_list();
+  while (my($uri, $info) = each %{$uris}) {
+    next unless ($info->{domains});
+    foreach ( keys %{ $info->{domains} } ) {
+      if (exists $self->{url_shorteners}->{lc $_}) {
+        # NOTE: $info->{domains} appears to contain all the domains parsed
+        # from the single input URI with no way to work out what the base
+        # domain is.  So to prevent someone from stuffing the URI with a
+        # shortener to force this plug-in to follow a link that *isn't* on
+        # the list of shorteners; we enforce that the shortener must be the
+        # base URI and that a path must be present.
+        if ($uri !~ /^https?:\/\/(?:www\.)?$_\/.+$/i) {
+          dbg("Discarding URI: $uri");
+          next;
+        }
+        $short_urls{$uri} = 1;
+        next;
+      }
+    }
+  }
+
+  # Make sure we have some work to do
+  # Before we open any log files etc.
+  my $count = scalar keys %short_urls;
+  return undef unless $count gt 0;
+
+  initialise_url_shortener_cache($self, $opts) if defined $self->{url_shortener_cache_type};
+
+  my $max_short_urls = $pms->{main}->{conf}->{max_short_urls};
+  foreach my $short_url (keys %short_urls) {
+    next if ($max_short_urls le 0);
+    my $location = $self->recursive_lookup($short_url, $pms);
+    $max_short_urls--;
+  }
+}
+
+sub recursive_lookup {
+  my ($self, $short_url, $pms, %been_here) = @_;
+
+  my $count = scalar keys %been_here;
+  dbg("Redirection count $count") if $count gt 0;
+  if ($count >= 10) {
+    dbg("Error: more than 10 shortener redirections");
+    # Fire test
+    $pms->got_hit('SHORT_URL_MAXCHAIN');
+    return undef;
+  }
+
+  my $location;
+  if ($self->{caching} && ($location = $self->cache_get($short_url))) {
+    dbg("Found cached $short_url => $location");
+    if($self->{url_shortner_loginfo}) {
+      info("Found cached $short_url => $location");
+    } else {
+      dbg("Found cached $short_url => $location");
+    }
+  } else {
+    # Not cached; do lookup
+    my $response = $self->{ua}->head($short_url);
+    if (!$response->is_redirect) {
+      dbg("URL is not redirect: $short_url = ".$response->status_line);
+      if ((my ($domain) = ($short_url =~ /^https?:\/\/(\S+)\//))) {
+          if (exists $self->{url_shorteners}->{$domain}) {
+              $domain =~ s/\./_/g;
+              $domain = uc($domain);
+              my $h = 'SHORT_' . $domain . '_' . $response->code;
+              dbg("hit rule: $h");
+              $pms->got_hit($h);
+          }
+      }
+      $pms->got_hit('SHORT_URL_404') if($response->code == '404');
+      return undef;
+    }
+    $location = $response->headers->{location};
+    # Bail out if $short_url redirects to itself
+    return undef if ($short_url eq $location);
+    $self->cache_add($short_url, $location) if $self->{caching};
+    dbg("Found $short_url => $location");
+    if($self->{url_shortener_loginfo}) {
+      info("Found $short_url => $location");
+    } else {
+      dbg("Found $short_url => $location");
+    }
+  }
+
+  # At this point we have a new URL in $response
+  $pms->got_hit('HAS_SHORT_URL');
+  $pms->add_uri_detail_list($location);
+
+  # Set chained here otherwise we might mark a disabled page or
+  # redirect back to the same host as chaining incorrectly.
+  $pms->got_hit('SHORT_URL_CHAINED') if ($count gt 0);
+
+  # Check if we are being redirected to a local page
+  # Don't recurse in this case...
+  if($location !~ /^https?:/) {
+    my($host) = ($short_url =~ /^(https?:\/\/\S+)\//);
+    $location = "$host/$location";
+    dbg("Looks like a local redirection: $short_url => $location");
+    $pms->add_uri_detail_list($location);
+    return $location;
+  }
+
+  # Check for recursion
+  if ((my ($domain) = ($location =~ /^https?:\/\/(\S+)\//))) {
+    if (exists $been_here{$location}) {
+      # Loop detected
+      dbg("Error: loop detected");
+      $pms->got_hit('SHORT_URL_LOOP');
+      return $location;
+    } else {
+      if (exists $self->{url_shorteners}->{$domain}) {
+        $been_here{$location} = 1;
+        # Recurse...
+        return $self->recursive_lookup($location, $pms, %been_here);
+      }
+    }
+  }
+
+  # No recursion; just return the final location...
+  return $location;
+}
+
+sub short_url_tests {
+  # Set by parsed_metadata
+  return 0;
+}
+
+sub cache_add {
+  my ($self, $short_url, $decoded_url) = @_;
+  return undef if not $self->{caching};
+
+  eval {
+    $self->{sth_insert} = $self->{dbh}->prepare_cached("
+      INSERT INTO short_url_cache (short_url, decoded_url, created, modified)
+      VALUES (?,?,?,?)
+    ");
+  };
+  if ($@) {
+    dbg("warn: $@");
+    return undef;
+  };
+
+  $self->{sth_insert}->execute($short_url, $decoded_url, time(), time());
+  return undef;
+}
+
+sub cache_get {
+  my ($self, $key) = @_;
+  return undef if not $self->{caching};
+
+  eval {
+    $self->{sth_select} = $self->{dbh}->prepare_cached("
+      SELECT decoded_url FROM short_url_cache
+      WHERE short_url = ? AND modified > ?
+    ");
+  };
+  if ($@) {
+   dbg("warn: $@");
+   return undef;
+  }
+
+  eval {
+    $self->{sth_update} = $self->{dbh}->prepare_cached("
+      UPDATE short_url_cache
+      SET modified=?, hits=hits+1
+      WHERE short_url = ?
+    ");
+  };
+  if ($@) {
+   dbg("warn: $@");
+   return undef;
+  }
+
+  my $tcheck = time() - $self->{url_shortener_cache_ttl};
+  $self->{sth_select}->execute($key, $tcheck);
+  my $row = $self->{sth_select}->fetchrow_array();
+  if($row) {
+    # Found cache entry; touch it to prevent expiry
+    $self->{sth_update}->execute(time(),$key);
+    $self->{sth_select}->finish();
+    $self->{sth_update}->finish();
+    return $row;
+  }
+
+  $self->{sth_select}->finish();
+  $self->{sth_update}->finish();
+  return undef;
+}
+
+1;

Modified: spamassassin/trunk/rules/v400.pre
URL: http://svn.apache.org/viewvc/spamassassin/trunk/rules/v400.pre?rev=1890251&r1=1890250&r2=1890251&view=diff
==============================================================================
--- spamassassin/trunk/rules/v400.pre (original)
+++ spamassassin/trunk/rules/v400.pre Thu May 27 17:35:21 2021
@@ -29,3 +29,7 @@
 # into SpamAssassin
 #
 # loadplugin Mail::SpamAssassin::Plugin::ExtractText
+
+# DecodeShortUrl - Decode url from url shorteners
+#
+# loadplugin Mail::SpamAssassin::Plugin::DecodeShortURLs

Added: spamassassin/trunk/sql/decodeshorturl_mysql.sql
URL: http://svn.apache.org/viewvc/spamassassin/trunk/sql/decodeshorturl_mysql.sql?rev=1890251&view=auto
==============================================================================
--- spamassassin/trunk/sql/decodeshorturl_mysql.sql (added)
+++ spamassassin/trunk/sql/decodeshorturl_mysql.sql Thu May 27 17:35:21 2021
@@ -0,0 +1,10 @@
+CREATE TABLE `short_url_cache`
+( `short_url` VARCHAR(256) NOT NULL ,
+  `decoded_url` VARCHAR(512) NOT NULL ,
+  `hits` MEDIUMINT NOT NULL DEFAULT 1,
+  `created` INT(11) NOT NULL ,
+  `modified` INT(11) NOT NULL ,
+  PRIMARY KEY (`short_url`)
+) ENGINE = InnoDB;
+ALTER TABLE `spam_bayes`.`short_url_cache` ADD INDEX `short_url_by_modified` (`short_url`, `modified`);
+ALTER TABLE `spam_bayes`.`short_url_cache` ADD INDEX `short_url_modified` (`modified`);

Added: spamassassin/trunk/sql/decodeshorturl_sqlite.sql
URL: http://svn.apache.org/viewvc/spamassassin/trunk/sql/decodeshorturl_sqlite.sql?rev=1890251&view=auto
==============================================================================
--- spamassassin/trunk/sql/decodeshorturl_sqlite.sql (added)
+++ spamassassin/trunk/sql/decodeshorturl_sqlite.sql Thu May 27 17:35:21 2021
@@ -0,0 +1,11 @@
+CREATE TABLE short_url_cache (
+  short_url   TEXT PRIMARY KEY NOT NULL,
+  decoded_url TEXT NOT NULL,
+  hits        INTEGER NOT NULL DEFAULT 1,
+  created     INTEGER NOT NULL ,
+  modified    INTEGER NOT NULL
+);
+CREATE INDEX short_url_by_modified
+  ON short_url_cache(short_url, modified);
+CREATE INDEX short_url_modified
+  ON short_url_cache(modified);

Added: spamassassin/trunk/t/data/spam/decodeshorturl/base.eml
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/data/spam/decodeshorturl/base.eml?rev=1890251&view=auto
==============================================================================
--- spamassassin/trunk/t/data/spam/decodeshorturl/base.eml (added)
+++ spamassassin/trunk/t/data/spam/decodeshorturl/base.eml Thu May 27 17:35:21 2021
@@ -0,0 +1,13 @@
+To: Entity <en...@example.com>
+From: Example <ex...@example.com>
+Subject: This is a test email for a shortened URL
+Message-ID: <ea...@example.com>
+Date: Tue, 10 Nov 2020 13:33:08 -0500
+
+Greetings,
+
+http://bit.ly/30yH6WK
+
+which should link to:
+
+https://spamassassin.apache.org/

Added: spamassassin/trunk/t/data/spam/decodeshorturl/chain.eml
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/data/spam/decodeshorturl/chain.eml?rev=1890251&view=auto
==============================================================================
--- spamassassin/trunk/t/data/spam/decodeshorturl/chain.eml (added)
+++ spamassassin/trunk/t/data/spam/decodeshorturl/chain.eml Thu May 27 17:35:21 2021
@@ -0,0 +1,17 @@
+To: Entity <en...@example.com>
+From: Example <ex...@example.com>
+Subject: This is a test email for a shortened URL
+Message-ID: <ea...@example.com>
+Date: Tue, 10 Nov 2020 13:33:08 -0500
+
+Greetings,
+
+https://tinyurl.com/jf8wv76t
+
+which should link to:
+
+http://bit.ly/3qDCt8z
+
+which should conclude at:
+
+https://spamassassin.apache.org/

Added: spamassassin/trunk/t/decodeshorturl.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/decodeshorturl.t?rev=1890251&view=auto
==============================================================================
--- spamassassin/trunk/t/decodeshorturl.t (added)
+++ spamassassin/trunk/t/decodeshorturl.t Thu May 27 17:35:21 2021
@@ -0,0 +1,45 @@
+#!/usr/bin/perl -T
+
+use lib '.'; use lib 't';
+use SATest; sa_t_init("decodeshorturl");
+
+use Test::More;
+
+plan skip_all => "Net tests disabled"                unless conf_bool('run_net_tests');
+plan tests => 2;
+
+tstpre ("
+loadplugin Mail::SpamAssassin::Plugin::DecodeShortURLs
+");
+
+tstprefs("
+$default_cf_lines
+
+url_shortener bit.ly
+url_shortener tinyurl.com
+
+ifplugin Mail::SpamAssassin::Plugin::DecodeShortURLs
+  body HAS_SHORT_URL              eval:short_url_tests()
+  describe HAS_SHORT_URL          Message contains one or more shortened URLs
+
+  body SHORT_URL_CHAINED          eval:short_url_tests()
+  describe SHORT_URL_CHAINED      Message has shortened URL chained to other shorteners
+endif
+");
+
+%patterns_url = (
+   q{ HAS_SHORT_URL } => 'Message contains one or more shortened URLs',
+            );
+
+%patterns_url_chain = (
+   q{ SHORT_URL_CHAINED } => 'Message has shortened URL chained to other shorteners',
+            );
+
+%patterns = %patterns_url;
+sarun ("-t < data/spam/decodeshorturl/base.eml", \&patterns_run_cb);
+ok_all_patterns();
+clear_pattern_counters();
+
+%patterns = %patterns_url_chain;
+sarun ("-t < data/spam/decodeshorturl/chain.eml", \&patterns_run_cb);
+ok_all_patterns();

Propchange: spamassassin/trunk/t/decodeshorturl.t
------------------------------------------------------------------------------
    svn:executable = *



Re: DecodeShortURLs plugin

Posted by Henrik K <he...@hege.li>.
On Fri, May 28, 2021 at 01:06:24PM +0300, Henrik K wrote:
> On Fri, May 28, 2021 at 09:52:25AM +0300, Henrik K wrote:
> > 
> > The current (legacy) way that DecodeShortURLs works need to be changed. 
> > Calling got_hit() for random rules that are not defined, especially things
> > like SHORT_C<SHORTENER>_C<CODE> should not be used.  One can't score them
> > properly and it also breaks the upcoming meta-rule logic.  All of the checks
> > need to have dedicated eval: rules.
> 
> Also I'm not sure what's the point of these rules
> 
> SHORT_URL_200
> SHORT_URL_404
> 
> If there's multiple shorteners in a message, you might get both results or
> maybe even neither.
> 
> Also at a quick glance, those cannot even hit if result is cached, since the
> cache doesn't store return codes.  And stuff like $self->{short_url_chained}
> isn't even reset, so it might keep hitting in new messages.  This module
> needs a lot of extra love.

I guess $self->{short_url_chained} should use $pms.  Giovanni, I assume you
will work on the plugin?



Re: DecodeShortURLs plugin

Posted by Benny Pedersen <me...@junc.eu>.
On 2021-05-28 17:43, Henrik K wrote:
> On Fri, May 28, 2021 at 05:29:11PM +0200, Benny Pedersen wrote:
>> 
>> only bayes with redis backend have expire data, why only redis ?, and 
>> why
>> only bayes ? :)
> 
> Redis has internal automatic expire, no effort required.  For others
> bayes_auto_expire works too - if it's slow, use Redis..

   bayes_token_ttl
     Controls token expiry (ttl value in SECONDS, sent as is to Redis)
     when bayes_auto_expire is true. Default value is 3 weeks (but check
     Mail::SpamAssassin::Conf.pm to make sure).
   bayes_seen_ttl
     Controls 'seen' expiry (ttl value in SECONDS, sent as is to Redis)
     when bayes_auto_expire is true. Default value is 8 days (but check
     Mail::SpamAssassin::Conf.pm to make sure).

redis only options imho

and no, postgresql is not slow, and still use less resources

Re: DecodeShortURLs plugin

Posted by Henrik K <he...@hege.li>.
On Fri, May 28, 2021 at 05:29:11PM +0200, Benny Pedersen wrote:
> 
> only bayes with redis backend have expire data, why only redis ?, and why
> only bayes ? :)

Redis has internal automatic expire, no effort required.  For others
bayes_auto_expire works too - if it's slow, use Redis..


Re: DecodeShortURLs plugin

Posted by Benny Pedersen <me...@junc.eu>.
On 2021-05-28 15:15, Henrik K wrote:
> On Fri, May 28, 2021 at 03:00:25PM +0200, Giovanni Bechis wrote:
>> 
>> cache must be invalidated every day with sql queries, to detect if a 
>> return code has changed for a cached
>> url, we should run a head request so cache will be of no use.
> 
> This should be automated too from the plugin, being a trivial call..  
> no SA
> module should require externals crons..

only bayes with redis backend have expire data, why only redis ?, and 
why only bayes ? :)

fun asside, i would like to see non cronned expire aswell

Re: DecodeShortURLs plugin

Posted by Henrik K <he...@hege.li>.
On Fri, May 28, 2021 at 03:00:25PM +0200, Giovanni Bechis wrote:
>
> cache must be invalidated every day with sql queries, to detect if a return code has changed for a cached
> url, we should run a head request so cache will be of no use.

This should be automated too from the plugin, being a trivial call..  no SA
module should require externals crons..


Re: DecodeShortURLs plugin

Posted by Giovanni Bechis <gi...@paclan.it>.
On 5/28/21 12:06 PM, Henrik K wrote:
> On Fri, May 28, 2021 at 09:52:25AM +0300, Henrik K wrote:
>>
>> The current (legacy) way that DecodeShortURLs works need to be changed. 
>> Calling got_hit() for random rules that are not defined, especially things
>> like SHORT_C<SHORTENER>_C<CODE> should not be used.  One can't score them
>> properly and it also breaks the upcoming meta-rule logic.  All of the checks
>> need to have dedicated eval: rules.
> 
> Also I'm not sure what's the point of these rules
> 
> SHORT_URL_200
> SHORT_URL_404
> 
> If there's multiple shorteners in a message, you might get both results or
> maybe even neither.
> 
> Also at a quick glance, those cannot even hit if result is cached, since the
> cache doesn't store return codes.
cache must be invalidated every day with sql queries, to detect if a return code has changed for a cached
url, we should run a head request so cache will be of no use.


> And stuff like $self->{short_url_chained}
> isn't even reset, so it might keep hitting in new messages.  This module
> needs a lot of extra love.
> 
I will take a look to improve it.

 Giovanni


Re: DecodeShortURLs plugin

Posted by Henrik K <he...@hege.li>.
On Fri, May 28, 2021 at 09:52:25AM +0300, Henrik K wrote:
> 
> The current (legacy) way that DecodeShortURLs works need to be changed. 
> Calling got_hit() for random rules that are not defined, especially things
> like SHORT_C<SHORTENER>_C<CODE> should not be used.  One can't score them
> properly and it also breaks the upcoming meta-rule logic.  All of the checks
> need to have dedicated eval: rules.

Also I'm not sure what's the point of these rules

SHORT_URL_200
SHORT_URL_404

If there's multiple shorteners in a message, you might get both results or
maybe even neither.

Also at a quick glance, those cannot even hit if result is cached, since the
cache doesn't store return codes.  And stuff like $self->{short_url_chained}
isn't even reset, so it might keep hitting in new messages.  This module
needs a lot of extra love.


DecodeShortURLs plugin

Posted by Henrik K <he...@hege.li>.
The current (legacy) way that DecodeShortURLs works need to be changed. 
Calling got_hit() for random rules that are not defined, especially things
like SHORT_C<SHORTENER>_C<CODE> should not be used.  One can't score them
properly and it also breaks the upcoming meta-rule logic.  All of the checks
need to have dedicated eval: rules.


On Thu, May 27, 2021 at 05:35:21PM -0000, gbechis@apache.org wrote:
> Author: gbechis
> Date: Thu May 27 17:35:21 2021
> New Revision: 1890251
> 
> URL: http://svn.apache.org/viewvc?rev=1890251&view=rev
> Log:
> Add DecodeShortURLs plugin to check urls hidden behind url shorteners
> 
> Added:
>     spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm
>     spamassassin/trunk/sql/decodeshorturl_mysql.sql
>     spamassassin/trunk/sql/decodeshorturl_sqlite.sql
>     spamassassin/trunk/t/data/spam/decodeshorturl/
>     spamassassin/trunk/t/data/spam/decodeshorturl/base.eml
>     spamassassin/trunk/t/data/spam/decodeshorturl/chain.eml
>     spamassassin/trunk/t/decodeshorturl.t   (with props)
> Modified:
>     spamassassin/trunk/MANIFEST
>     spamassassin/trunk/rules/v400.pre
> 
> Modified: spamassassin/trunk/MANIFEST
> URL: http://svn.apache.org/viewvc/spamassassin/trunk/MANIFEST?rev=1890251&r1=1890250&r2=1890251&view=diff
> ==============================================================================
> --- spamassassin/trunk/MANIFEST (original)
> +++ spamassassin/trunk/MANIFEST Thu May 27 17:35:21 2021
> @@ -82,6 +82,7 @@ lib/Mail/SpamAssassin/Plugin/BodyEval.pm
>  lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
>  lib/Mail/SpamAssassin/Plugin/Check.pm
>  lib/Mail/SpamAssassin/Plugin/DCC.pm
> +lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm
>  lib/Mail/SpamAssassin/Plugin/DKIM.pm
>  lib/Mail/SpamAssassin/Plugin/DNSEval.pm
>  lib/Mail/SpamAssassin/Plugin/Esp.pm
> @@ -227,6 +228,8 @@ sql/awl_mysql.sql
>  sql/awl_pg.sql
>  sql/bayes_mysql.sql
>  sql/bayes_pg.sql
> +sql/decodeshorturl_mysql.sql
> +sql/decodeshorturl_sqlite.sql
>  sql/userpref_mysql.sql
>  sql/userpref_pg.sql
>  sql/txrep_mysql.sql
> @@ -405,6 +408,8 @@ t/data/spam/badmime3.txt
>  t/data/spam/base64.txt
>  t/data/spam/bsmtp
>  t/data/spam/bsmtpnull
> +t/data/spam/decodeshorturl/base.eml
> +t/data/spam/decodeshorturl/chain.eml
>  t/data/spam/dnsbl.eml
>  t/data/spam/dnsbl_domsonly.eml
>  t/data/spam/dnsbl_ipsonly.eml
> @@ -473,6 +478,7 @@ t/db_based_whitelist.t
>  t/db_based_whitelist_ips.t
>  t/dcc.t
>  t/debug.t
> +t/decodeshorturl.t
>  t/desc_wrap.t
>  t/dkim.t
>  t/dnsbl.t
> 
> Added: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm
> URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm?rev=1890251&view=auto
> ==============================================================================
> --- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm (added)
> +++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/DecodeShortURLs.pm Thu May 27 17:35:21 2021
> @@ -0,0 +1,526 @@
> +# <@LICENSE>
> +# Licensed to the Apache Software Foundation (ASF) under one or more
> +# contributor license agreements.  See the NOTICE file distributed with
> +# this work for additional information regarding copyright ownership.
> +# The ASF licenses this file to you under the Apache License, Version 2.0
> +# (the "License"); you may not use this file except in compliance with
> +# the License.  You may obtain a copy of the License at:
> +# 
> +#     http://www.apache.org/licenses/LICENSE-2.0
> +# 
> +# Unless required by applicable law or agreed to in writing, software
> +# distributed under the License is distributed on an "AS IS" BASIS,
> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> +# See the License for the specific language governing permissions and
> +# limitations under the License.
> +# </...@LICENSE>
> +
> +# Original code by Steve Freegard <st...@fsl.com>
> +
> +=head1 NAME
> +
> +DecodeShortURLs - Expand shortened URLs
> +
> +=head1 SYNOPSIS
> +
> +  loadplugin    Mail::SpamAssassin::Plugin::DecodeShortURLs
> +
> +  url_shortener bit.ly
> +  url_shortener go.to
> +  ...
> +
> +=head1 DESCRIPTION
> +
> +This plugin looks for URLs shortened by a list of URL shortening services and
> +upon finding a matching URL will connect using to the shortening service and
> +do an HTTP HEAD lookup and retrieve the location header which points to the
> +actual shortened URL, it then adds this URL to the list of URIs extracted by
> +SpamAssassin which can then be accessed by other plug-ins, such as URIDNSBL.
> +
> +This plugin also sets the rule HAS_SHORT_URL if any matching short URLs are
> +found.
> +
> +Regular 'uri' rules can be used to detect and score links disabled by the
> +shortening service for abuse and URL_BITLY_BLOCKED is supplied as an example.
> +It should be safe to score this rule highly on a match as experience shows
> +that bit.ly only blocks access to a URL if it has seen consistent abuse and
> +problem reports.
> +
> +This plug-in will follow 'chained' shorteners e.g.
> +from short URL to short URL to short URL and finally to the real URL
> +
> +
> +If this form of chaining is found, then the rule 'SHORT_URL_CHAINED' will be
> +fired.  If a loop is detected then 'SHORT_URL_LOOP' will be fired.
> +This plug-in limits the number of chained shorteners to a maximim of 10 at
> +which point it will fire the rule 'SHORT_URL_MAXCHAIN' and go no further.
> +
> +If a shortener returns a '404 Not Found' result for the short URL then the
> +rule 'SHORT_URL_404' will be fired.
> +
> +If a shortener does not return an HTTP redirect, then a dynamic rule will
> +be fired: 'SHORT_C<SHORTENER>_C<CODE>' where C<SHORTENER> is the uppercase
> +name of the shortener with dots converted to underscores.  e.g.:
> +'SHORT_T_CO_200' This is to handle the case of t.co which now returns an
> +HTTP 200 and an abuse page instead of redirecting to an abuse page like
> +every other shortener does...
> +
> +=head1 NOTES
> +
> +This plugin runs the parsed_metadata hook with a priority of -1 so that
> +it may modify the parsed URI list prior to the URIDNSBL plugin which
> +runs as priority 0.
> +
> +Currently the plugin queries a maximum of 10 distinct shortened URLs with
> +a maximum timeout of 5 seconds per lookup.
> +
> +=head1 ACKNOWLEDGEMENTS
> +
> +A lot of this plugin has been hacked together by using other plugins as
> +examples.  The author would particularly like to tip his hat to Karsten
> +BrÃ??ckelmann for his work on GUDO.pm, the original version of this plugin
> +could not have been developed without his code.
> +
> +=cut
> +
> +package Mail::SpamAssassin::Plugin::DecodeShortURLs;
> +
> +my $VERSION = 0.11;
> +
> +use Mail::SpamAssassin::Plugin;
> +use strict;
> +use warnings;
> +
> +use vars qw(@ISA);
> +@ISA = qw(Mail::SpamAssassin::Plugin);
> +
> +use constant HAS_LWP_USERAGENT => eval { local $SIG{'__DIE__'}; require LWP::UserAgent; };
> +use constant HAS_DBI => eval { local $SIG{'__DIE__'}; require DBI; };
> +
> +sub dbg {
> +  my $msg = shift;
> +  return Mail::SpamAssassin::Logger::dbg("DecodeShortURLs: $msg");
> +}
> +
> +sub new {
> +  my $class = shift;
> +  my $mailsaobject = shift;
> +
> +  $class = ref($class) || $class;
> +  my $self = $class->SUPER::new($mailsaobject);
> +  bless ($self, $class);
> +
> +  if ($mailsaobject->{local_tests_only} || !HAS_LWP_USERAGENT) {
> +    $self->{disabled} = 1;
> +  } else {
> +    $self->{disabled} = 0;
> +  }
> +
> +  unless ($self->{disabled}) {
> +    $self->{ua} = new LWP::UserAgent;
> +    $self->{ua}->{max_redirect} = 0;
> +    $self->{ua}->{timeout} = 5;
> +    $self->{ua}->env_proxy;
> +    $self->{caching} = 0;
> +  }
> +
> +  $self->set_config($mailsaobject->{conf});
> +  $self->register_method_priority ('parsed_metadata', -1);
> +  $self->register_eval_rule('short_url_tests');
> +
> +  return $self;
> +}
> +
> +sub set_config {
> +  my($self, $conf) = @_;
> +  my @cmds = ();
> +
> +  push (@cmds, {
> +    setting => 'url_shortener',
> +    default => {},
> +    code => sub {
> +      my ($self, $key, $value, $line) = @_;
> +      if ($value =~ /^$/) {
> +        return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
> +      }
> +      foreach my $domain (split(/\s+/, $value)) {
> +        $self->{url_shorteners}->{lc $domain} = 1;
> +      }
> +    }
> +  });
> +
> +=cut
> +
> +=head1 PRIVILEGED SETTINGS
> +
> +=over 4
> +
> +=item url_shortener_cache_type     (default: none)
> +
> +The specific type of cache type that is being utilized. Currently only sqlite
> +is configured however plans to support redis cache is planned.
> +
> +Example:
> +url_shortener_cache_type sqlite
> +
> +=cut
> +
> +  push (@cmds, {
> +    setting => 'url_shortener_cache_type',
> +    default => undef,
> +    is_priv => 1,
> +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
> +  });
> +
> +=cut
> +
> +=item url_shortener_cache_dsn		(default: none)
> +
> +The dsn to a database file to write cache entries to.  The database will
> +be created automatically if is does not already exist but the supplied path
> +and file must be read/writable by the user running spamassassin or spamd.
> +
> +Note: You will need to have the proper DBI version of the cache type installed.
> +
> +Example:
> +
> +url_shortener_cache_dsn dbi:SQLite:dbname=/tmp/DecodeShortURLs.sq3
> +
> +=cut
> +
> +  push (@cmds, {
> +    setting => 'url_shortener_cache_dsn',
> +    default => '',
> +    is_priv => 1,
> +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
> +  });
> +
> +=cut
> +
> +=item url_shortener_cache_username  (default: none)
> +
> +The username that should be used to connect to the database.
> +
> +=cut
> +
> +  push (@cmds, {
> +    setting => 'url_shortener_cache_username',
> +    default => '',
> +    is_priv => 1,
> +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
> +  });
> +
> +=cut
> +
> +=item url_shortener_cache_password  (default: none)
> +
> +The password that should be used to connect to the database.
> +
> +=cut
> +
> +  push (@cmds, {
> +    setting => 'url_shortener_cache_password',
> +    default => '',
> +    is_priv => 1,
> +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
> +  });
> +
> +=cut
> +
> +=item url_shortener_cache_ttl		(default: 86400)
> +
> +The length of time a cache entry will be valid for in seconds.
> +Default is 86400 (1 day).
> +
> +NOTE: you will also need to run the following via cron to actually remove the
> +records from the database:
> +
> +echo "DELETE FROM short_url_cache WHERE modified < NOW() - C<ttl>; | sqlite3 /path/to/database"
> +
> +NOTE: replace C<ttl> above with the same value you use for this option
> +
> +=cut
> +
> +  push (@cmds, {
> +    setting => 'url_shortener_cache_ttl',
> +    is_admin => 1,
> +    default => 86400,
> +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> +  });
> +
> +=cut
> +
> +=item url_shortener_loginfo           (default: 0 (off))
> +
> +If this option is enabled (set to 1), then short URLs and the decoded URLs will be logged with info priority.
> +
> +=cut
> +
> +  push (@cmds, {
> +    setting => 'url_shortener_loginfo',
> +    is_admin => 1,
> +    default => 0,
> +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
> +  });
> +
> +=cut
> +
> +=item max_short_urls                 (default: 10)
> +
> +The max depth of short urls that will be chained until it stops looking further.
> +
> +=cut
> +
> +  push (@cmds, {
> +    setting => 'max_short_urls',
> +    is_admin => 1,
> +    default => 10,
> +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> +  });
> +
> +  $conf->{parser}->register_commands(\@cmds);
> +}
> +
> +sub initialise_url_shortener_cache {
> +  my($self, $opts) = @_;
> +
> +  if($self->{url_shortener_cache_type} eq "dbi" && defined $self->{url_shortener_cache_dsn} && HAS_DBI) {
> +    $self->{url_shortener_dbi_cache} = $self->{url_shortener_cache_dsn};
> +    return _connect_dbi_cache($self, $opts);
> +  } else {
> +    warn "Wrong cache type selected";
> +    return undef;
> +  }
> +}
> +
> +sub _connect_dbi_cache {
> +  my($self, $opts) = @_;
> +
> +  # Initialise cache if enabled
> +  if ($self->{url_shortener_dbi_cache} && HAS_DBI) {
> +    eval {
> +      local $SIG{'__DIE__'};
> +      $self->{dbh} = DBI->connect_cached($self->{url_shortener_cache_dsn},$self->{url_shortener_cache_username},$self->{url_shortener_cache_password}, {RaiseError => 1, PrintError => 0, InactiveDestroy => 1}) or die $!;
> +    };
> +    if ($@) {
> +      dbg("warn: $@");
> +    } else {
> +      $self->{caching} = 1;
> +    }
> +  }
> +}
> +
> +sub parsed_metadata {
> +  my ($self, $opts) = @_;
> +  my $pms = $opts->{permsgstatus};
> +  my $msg = $opts->{msg};
> +
> +  return if $self->{disabled};
> +
> +  dbg ('warn: get_uri_detail_list() has been called already')
> +    if exists $pms->{uri_detail_list};
> +
> +  # don't keep dereferencing these
> +  $self->{url_shorteners} = $pms->{main}->{conf}->{url_shorteners};
> +  if(defined $pms->{main}->{conf}->{url_shortener_cache_type}) {
> +    $self->{url_shortener_cache_type} = $pms->{main}->{conf}->{url_shortener_cache_type};
> +    $self->{url_shortener_cache_dsn} = $pms->{main}->{conf}->{url_shortener_cache_dsn};
> +    $self->{url_shortener_cache_username} = $pms->{main}->{conf}->{url_shortener_cache_username};
> +    $self->{url_shortener_cache_password} = $pms->{main}->{conf}->{url_shortener_cache_password};
> +    $self->{url_shortener_cache_ttl} = $pms->{main}->{conf}->{url_shortener_cache_ttl};
> +  }
> +  $self->{url_shortener_loginfo} = $pms->{main}->{conf}->{url_shortener_loginfo};
> +
> +  # Sort short URLs into hash to de-dup them
> +  my %short_urls;
> +  my $uris = $pms->get_uri_detail_list();
> +  while (my($uri, $info) = each %{$uris}) {
> +    next unless ($info->{domains});
> +    foreach ( keys %{ $info->{domains} } ) {
> +      if (exists $self->{url_shorteners}->{lc $_}) {
> +        # NOTE: $info->{domains} appears to contain all the domains parsed
> +        # from the single input URI with no way to work out what the base
> +        # domain is.  So to prevent someone from stuffing the URI with a
> +        # shortener to force this plug-in to follow a link that *isn't* on
> +        # the list of shorteners; we enforce that the shortener must be the
> +        # base URI and that a path must be present.
> +        if ($uri !~ /^https?:\/\/(?:www\.)?$_\/.+$/i) {
> +          dbg("Discarding URI: $uri");
> +          next;
> +        }
> +        $short_urls{$uri} = 1;
> +        next;
> +      }
> +    }
> +  }
> +
> +  # Make sure we have some work to do
> +  # Before we open any log files etc.
> +  my $count = scalar keys %short_urls;
> +  return undef unless $count gt 0;
> +
> +  initialise_url_shortener_cache($self, $opts) if defined $self->{url_shortener_cache_type};
> +
> +  my $max_short_urls = $pms->{main}->{conf}->{max_short_urls};
> +  foreach my $short_url (keys %short_urls) {
> +    next if ($max_short_urls le 0);
> +    my $location = $self->recursive_lookup($short_url, $pms);
> +    $max_short_urls--;
> +  }
> +}
> +
> +sub recursive_lookup {
> +  my ($self, $short_url, $pms, %been_here) = @_;
> +
> +  my $count = scalar keys %been_here;
> +  dbg("Redirection count $count") if $count gt 0;
> +  if ($count >= 10) {
> +    dbg("Error: more than 10 shortener redirections");
> +    # Fire test
> +    $pms->got_hit('SHORT_URL_MAXCHAIN');
> +    return undef;
> +  }
> +
> +  my $location;
> +  if ($self->{caching} && ($location = $self->cache_get($short_url))) {
> +    dbg("Found cached $short_url => $location");
> +    if($self->{url_shortner_loginfo}) {
> +      info("Found cached $short_url => $location");
> +    } else {
> +      dbg("Found cached $short_url => $location");
> +    }
> +  } else {
> +    # Not cached; do lookup
> +    my $response = $self->{ua}->head($short_url);
> +    if (!$response->is_redirect) {
> +      dbg("URL is not redirect: $short_url = ".$response->status_line);
> +      if ((my ($domain) = ($short_url =~ /^https?:\/\/(\S+)\//))) {
> +          if (exists $self->{url_shorteners}->{$domain}) {
> +              $domain =~ s/\./_/g;
> +              $domain = uc($domain);
> +              my $h = 'SHORT_' . $domain . '_' . $response->code;
> +              dbg("hit rule: $h");
> +              $pms->got_hit($h);
> +          }
> +      }
> +      $pms->got_hit('SHORT_URL_404') if($response->code == '404');
> +      return undef;
> +    }
> +    $location = $response->headers->{location};
> +    # Bail out if $short_url redirects to itself
> +    return undef if ($short_url eq $location);
> +    $self->cache_add($short_url, $location) if $self->{caching};
> +    dbg("Found $short_url => $location");
> +    if($self->{url_shortener_loginfo}) {
> +      info("Found $short_url => $location");
> +    } else {
> +      dbg("Found $short_url => $location");
> +    }
> +  }
> +
> +  # At this point we have a new URL in $response
> +  $pms->got_hit('HAS_SHORT_URL');
> +  $pms->add_uri_detail_list($location);
> +
> +  # Set chained here otherwise we might mark a disabled page or
> +  # redirect back to the same host as chaining incorrectly.
> +  $pms->got_hit('SHORT_URL_CHAINED') if ($count gt 0);
> +
> +  # Check if we are being redirected to a local page
> +  # Don't recurse in this case...
> +  if($location !~ /^https?:/) {
> +    my($host) = ($short_url =~ /^(https?:\/\/\S+)\//);
> +    $location = "$host/$location";
> +    dbg("Looks like a local redirection: $short_url => $location");
> +    $pms->add_uri_detail_list($location);
> +    return $location;
> +  }
> +
> +  # Check for recursion
> +  if ((my ($domain) = ($location =~ /^https?:\/\/(\S+)\//))) {
> +    if (exists $been_here{$location}) {
> +      # Loop detected
> +      dbg("Error: loop detected");
> +      $pms->got_hit('SHORT_URL_LOOP');
> +      return $location;
> +    } else {
> +      if (exists $self->{url_shorteners}->{$domain}) {
> +        $been_here{$location} = 1;
> +        # Recurse...
> +        return $self->recursive_lookup($location, $pms, %been_here);
> +      }
> +    }
> +  }
> +
> +  # No recursion; just return the final location...
> +  return $location;
> +}
> +
> +sub short_url_tests {
> +  # Set by parsed_metadata
> +  return 0;
> +}
> +
> +sub cache_add {
> +  my ($self, $short_url, $decoded_url) = @_;
> +  return undef if not $self->{caching};
> +
> +  eval {
> +    $self->{sth_insert} = $self->{dbh}->prepare_cached("
> +      INSERT INTO short_url_cache (short_url, decoded_url, created, modified)
> +      VALUES (?,?,?,?)
> +    ");
> +  };
> +  if ($@) {
> +    dbg("warn: $@");
> +    return undef;
> +  };
> +
> +  $self->{sth_insert}->execute($short_url, $decoded_url, time(), time());
> +  return undef;
> +}
> +
> +sub cache_get {
> +  my ($self, $key) = @_;
> +  return undef if not $self->{caching};
> +
> +  eval {
> +    $self->{sth_select} = $self->{dbh}->prepare_cached("
> +      SELECT decoded_url FROM short_url_cache
> +      WHERE short_url = ? AND modified > ?
> +    ");
> +  };
> +  if ($@) {
> +   dbg("warn: $@");
> +   return undef;
> +  }
> +
> +  eval {
> +    $self->{sth_update} = $self->{dbh}->prepare_cached("
> +      UPDATE short_url_cache
> +      SET modified=?, hits=hits+1
> +      WHERE short_url = ?
> +    ");
> +  };
> +  if ($@) {
> +   dbg("warn: $@");
> +   return undef;
> +  }
> +
> +  my $tcheck = time() - $self->{url_shortener_cache_ttl};
> +  $self->{sth_select}->execute($key, $tcheck);
> +  my $row = $self->{sth_select}->fetchrow_array();
> +  if($row) {
> +    # Found cache entry; touch it to prevent expiry
> +    $self->{sth_update}->execute(time(),$key);
> +    $self->{sth_select}->finish();
> +    $self->{sth_update}->finish();
> +    return $row;
> +  }
> +
> +  $self->{sth_select}->finish();
> +  $self->{sth_update}->finish();
> +  return undef;
> +}
> +
> +1;
> 
> Modified: spamassassin/trunk/rules/v400.pre
> URL: http://svn.apache.org/viewvc/spamassassin/trunk/rules/v400.pre?rev=1890251&r1=1890250&r2=1890251&view=diff
> ==============================================================================
> --- spamassassin/trunk/rules/v400.pre (original)
> +++ spamassassin/trunk/rules/v400.pre Thu May 27 17:35:21 2021
> @@ -29,3 +29,7 @@
>  # into SpamAssassin
>  #
>  # loadplugin Mail::SpamAssassin::Plugin::ExtractText
> +
> +# DecodeShortUrl - Decode url from url shorteners
> +#
> +# loadplugin Mail::SpamAssassin::Plugin::DecodeShortURLs
> 
> Added: spamassassin/trunk/sql/decodeshorturl_mysql.sql
> URL: http://svn.apache.org/viewvc/spamassassin/trunk/sql/decodeshorturl_mysql.sql?rev=1890251&view=auto
> ==============================================================================
> --- spamassassin/trunk/sql/decodeshorturl_mysql.sql (added)
> +++ spamassassin/trunk/sql/decodeshorturl_mysql.sql Thu May 27 17:35:21 2021
> @@ -0,0 +1,10 @@
> +CREATE TABLE `short_url_cache`
> +( `short_url` VARCHAR(256) NOT NULL ,
> +  `decoded_url` VARCHAR(512) NOT NULL ,
> +  `hits` MEDIUMINT NOT NULL DEFAULT 1,
> +  `created` INT(11) NOT NULL ,
> +  `modified` INT(11) NOT NULL ,
> +  PRIMARY KEY (`short_url`)
> +) ENGINE = InnoDB;
> +ALTER TABLE `spam_bayes`.`short_url_cache` ADD INDEX `short_url_by_modified` (`short_url`, `modified`);
> +ALTER TABLE `spam_bayes`.`short_url_cache` ADD INDEX `short_url_modified` (`modified`);
> 
> Added: spamassassin/trunk/sql/decodeshorturl_sqlite.sql
> URL: http://svn.apache.org/viewvc/spamassassin/trunk/sql/decodeshorturl_sqlite.sql?rev=1890251&view=auto
> ==============================================================================
> --- spamassassin/trunk/sql/decodeshorturl_sqlite.sql (added)
> +++ spamassassin/trunk/sql/decodeshorturl_sqlite.sql Thu May 27 17:35:21 2021
> @@ -0,0 +1,11 @@
> +CREATE TABLE short_url_cache (
> +  short_url   TEXT PRIMARY KEY NOT NULL,
> +  decoded_url TEXT NOT NULL,
> +  hits        INTEGER NOT NULL DEFAULT 1,
> +  created     INTEGER NOT NULL ,
> +  modified    INTEGER NOT NULL
> +);
> +CREATE INDEX short_url_by_modified
> +  ON short_url_cache(short_url, modified);
> +CREATE INDEX short_url_modified
> +  ON short_url_cache(modified);
> 
> Added: spamassassin/trunk/t/data/spam/decodeshorturl/base.eml
> URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/data/spam/decodeshorturl/base.eml?rev=1890251&view=auto
> ==============================================================================
> --- spamassassin/trunk/t/data/spam/decodeshorturl/base.eml (added)
> +++ spamassassin/trunk/t/data/spam/decodeshorturl/base.eml Thu May 27 17:35:21 2021
> @@ -0,0 +1,13 @@
> +To: Entity <en...@example.com>
> +From: Example <ex...@example.com>
> +Subject: This is a test email for a shortened URL
> +Message-ID: <ea...@example.com>
> +Date: Tue, 10 Nov 2020 13:33:08 -0500
> +
> +Greetings,
> +
> +http://bit.ly/30yH6WK
> +
> +which should link to:
> +
> +https://spamassassin.apache.org/
> 
> Added: spamassassin/trunk/t/data/spam/decodeshorturl/chain.eml
> URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/data/spam/decodeshorturl/chain.eml?rev=1890251&view=auto
> ==============================================================================
> --- spamassassin/trunk/t/data/spam/decodeshorturl/chain.eml (added)
> +++ spamassassin/trunk/t/data/spam/decodeshorturl/chain.eml Thu May 27 17:35:21 2021
> @@ -0,0 +1,17 @@
> +To: Entity <en...@example.com>
> +From: Example <ex...@example.com>
> +Subject: This is a test email for a shortened URL
> +Message-ID: <ea...@example.com>
> +Date: Tue, 10 Nov 2020 13:33:08 -0500
> +
> +Greetings,
> +
> +https://tinyurl.com/jf8wv76t
> +
> +which should link to:
> +
> +http://bit.ly/3qDCt8z
> +
> +which should conclude at:
> +
> +https://spamassassin.apache.org/
> 
> Added: spamassassin/trunk/t/decodeshorturl.t
> URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/decodeshorturl.t?rev=1890251&view=auto
> ==============================================================================
> --- spamassassin/trunk/t/decodeshorturl.t (added)
> +++ spamassassin/trunk/t/decodeshorturl.t Thu May 27 17:35:21 2021
> @@ -0,0 +1,45 @@
> +#!/usr/bin/perl -T
> +
> +use lib '.'; use lib 't';
> +use SATest; sa_t_init("decodeshorturl");
> +
> +use Test::More;
> +
> +plan skip_all => "Net tests disabled"                unless conf_bool('run_net_tests');
> +plan tests => 2;
> +
> +tstpre ("
> +loadplugin Mail::SpamAssassin::Plugin::DecodeShortURLs
> +");
> +
> +tstprefs("
> +$default_cf_lines
> +
> +url_shortener bit.ly
> +url_shortener tinyurl.com
> +
> +ifplugin Mail::SpamAssassin::Plugin::DecodeShortURLs
> +  body HAS_SHORT_URL              eval:short_url_tests()
> +  describe HAS_SHORT_URL          Message contains one or more shortened URLs
> +
> +  body SHORT_URL_CHAINED          eval:short_url_tests()
> +  describe SHORT_URL_CHAINED      Message has shortened URL chained to other shorteners
> +endif
> +");
> +
> +%patterns_url = (
> +   q{ HAS_SHORT_URL } => 'Message contains one or more shortened URLs',
> +            );
> +
> +%patterns_url_chain = (
> +   q{ SHORT_URL_CHAINED } => 'Message has shortened URL chained to other shorteners',
> +            );
> +
> +%patterns = %patterns_url;
> +sarun ("-t < data/spam/decodeshorturl/base.eml", \&patterns_run_cb);
> +ok_all_patterns();
> +clear_pattern_counters();
> +
> +%patterns = %patterns_url_chain;
> +sarun ("-t < data/spam/decodeshorturl/chain.eml", \&patterns_run_cb);
> +ok_all_patterns();
> 
> Propchange: spamassassin/trunk/t/decodeshorturl.t
> ------------------------------------------------------------------------------
>     svn:executable = *
>