You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2005/12/14 01:41:05 UTC

svn commit: r356657 - in /spamassassin/trunk: MANIFEST MANIFEST.SKIP build/listpromotable build/mkrules masses/parse-rules-for-masses rules/20_html_tests.cf rules/active.list

Author: jm
Date: Tue Dec 13 16:41:01 2005
New Revision: 356657

URL: http://svn.apache.org/viewcvs?rev=356657&view=rev
Log:
build/mkrules now compiles 'good enough' rules to rules/72_active.cf, instead of copying all core rules into the rules/ directory.  It also picks rules from sandboxes as part of this.  Non-promotable rules are left in rules/70_sandbox.cf.  It also follows meta dependencies correctly to ensure that each rule is in a consistent state even without the 70_sandbox.cf file.

Modified:
    spamassassin/trunk/MANIFEST
    spamassassin/trunk/MANIFEST.SKIP
    spamassassin/trunk/build/listpromotable
    spamassassin/trunk/build/mkrules
    spamassassin/trunk/masses/parse-rules-for-masses
    spamassassin/trunk/rules/20_html_tests.cf
    spamassassin/trunk/rules/active.list

Modified: spamassassin/trunk/MANIFEST
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/MANIFEST?rev=356657&r1=356656&r2=356657&view=diff
==============================================================================
--- spamassassin/trunk/MANIFEST (original)
+++ spamassassin/trunk/MANIFEST Tue Dec 13 16:41:01 2005
@@ -417,48 +417,27 @@
 tools/triplets.pl
 build/mkrules
 rules/10_default_prefs.cf
-rules/20_advance_fee.cf
-rules/20_body_tests.cf
-rules/20_compensate.cf
 rules/20_dnsbl_tests.cf
-rules/20_drugs.cf
-rules/20_fake_helo_tests.cf
-rules/20_head_tests.cf
 rules/20_html_tests.cf
-rules/20_meta_tests.cf
 rules/20_net_tests.cf
-rules/20_phrases.cf
-rules/20_porn.cf
-rules/20_ratware.cf
-rules/20_uri_tests.cf
 rules/23_bayes.cf
 rules/25_accessdb.cf
 rules/25_antivirus.cf
-rules/25_body_tests_es.cf
-rules/25_body_tests_pl.cf
 rules/25_dcc.cf
 rules/25_domainkeys.cf
 rules/25_hashcash.cf
 rules/25_pyzor.cf
 rules/25_razor2.cf
-rules/25_replace.cf
 rules/25_spf.cf
 rules/25_textcat.cf
 rules/25_uribl.cf
-rules/30_text_de.cf
-rules/30_text_fr.cf
-rules/30_text_it.cf
-rules/30_text_nl.cf
-rules/30_text_pl.cf
-rules/30_text_pt_br.cf
-rules/50_scores.cf
 rules/60_awl.cf
 rules/60_whitelist.cf
-rules/60_whitelist_spf.cf
 rules/60_whitelist_subject.cf
 rules/70_broken_rules.cf
 rules/70_sandbox.cf
 rules/70_uribl.cf
+rules/72_active.cf
 rules/sandbox-felicity.pm
 rules/STATISTICS-set0.txt
 rules/STATISTICS-set1.txt
@@ -472,3 +451,5 @@
 rules/triplets.txt
 rules/user_prefs.template
 rules/v310.pre
+build/listpromotable
+rules/active.list

Modified: spamassassin/trunk/MANIFEST.SKIP
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/MANIFEST.SKIP?rev=356657&r1=356656&r2=356657&view=diff
==============================================================================
--- spamassassin/trunk/MANIFEST.SKIP (original)
+++ spamassassin/trunk/MANIFEST.SKIP Tue Dec 13 16:41:01 2005
@@ -114,3 +114,4 @@
 ^rules/70_sandbox.cf$
 ^build/automc/
 ^rulesrc/.*$
+^rules/active.list$

Modified: spamassassin/trunk/build/listpromotable
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/build/listpromotable?rev=356657&r1=356656&r2=356657&view=diff
==============================================================================
--- spamassassin/trunk/build/listpromotable (original)
+++ spamassassin/trunk/build/listpromotable Tue Dec 13 16:41:01 2005
@@ -112,9 +112,6 @@
   next unless ($mailsa->{conf}->{descriptions}->{$name}
         || $mailsa->{conf}->{scores}->{$name});
 
-  # ignore rules that are not marked as promotable
-  next unless ($obj->{promo});
-
   # "nopublish" tflags
   my $tfs = $mailsa->{conf}->{tflags}->{$name};
   if ($tfs) {
@@ -124,7 +121,14 @@
       $notes = "tflags publish";
       goto publish;
     }
+    if ($tfs =~ /\buserconf\b/) {
+      $notes = "tflags userconf";
+      goto publish;
+    }
   }
+
+  # ignore rules that are not marked as promotable
+  next unless ($obj->{promo});
 
   # only rules from "rulesrc" dirs
   my $src = $mailsa->{conf}->{source_file}->{$name};

Modified: spamassassin/trunk/build/mkrules
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/build/mkrules?rev=356657&r1=356656&r2=356657&view=diff
==============================================================================
--- spamassassin/trunk/build/mkrules (original)
+++ spamassassin/trunk/build/mkrules Tue Dec 13 16:41:01 2005
@@ -37,14 +37,16 @@
 
 use vars qw(
     @opt_srcs $opt_out $opt_sandboxout $opt_manifest
-    $opt_manifestskip $opt_listpromotable
+    $opt_manifestskip $opt_listpromotable $opt_active
+    $opt_activeout
 );
 GetOptions("src=s" => \@opt_srcs,
     "out=s",
     "sandboxout=s",
+    "activeout=s",
+    "active=s",
     "manifest=s",
     "manifestskip=s",
-    "listpromotable=s",
   );
 
 if (!@opt_srcs) {
@@ -64,19 +66,20 @@
   $opt_manifestskip = "MANIFEST.SKIP";
 }
 
+if (!$opt_active && -f "rules/active.list") {
+  $opt_active = "rules/active.list";
+}
+
 die "no src" unless (@opt_srcs >= 1);
 
 my $promolist;
-if ($opt_listpromotable) {
-  my $list = do $opt_listpromotable;
-  die "no listpromotable list" unless $list;
-  $promolist = $list;
-}
-else {
+
   die "no out" unless ($opt_out);
   die "unreadable out" unless (-d $opt_out);
-}
+  die "unreadable active" unless (-f $opt_active);
+
 $opt_sandboxout ||= "70_sandbox.cf";
+$opt_activeout  ||= "72_active.cf";
 
 # source files that need compilation, and their targets
 my $needs_compile = { };
@@ -95,6 +98,8 @@
         }, $src);
 }
 
+my $rules = { };
+
 my $file_manifest = { };
 my $file_manifest_skip = [ ];
 if ($opt_manifest) {
@@ -104,6 +109,9 @@
   read_manifest_skip($opt_manifestskip);
 }
 
+my $active_rules = { };
+read_active($opt_active);
+
 # context for the rules compiler
 my $seen_rules = { };
 my $renamed_rules = { };
@@ -111,20 +119,26 @@
 my $output_file_text = { };
 my $files_to_lint = { };
 
-compile_sorted($needs_compile);
+# $COMMENTS is a "catch-all" "name", for lines that appear after the last line
+# that refers to a rule by name.  Those lines are not published by themselves;
+# they'll be published to all pubfiles found in the file.
+#
+# It's assumed they are comments, because they generally are, but could be all
+# sorts of unparseable lines.
+my $COMMENTS = '!comments!';
+
+# another "fake name" for lines that should always be published.  They'll
+# be published to all files, in duplicate.  This should probably be improved
+# somehow, TODO.
+my $ALWAYS_PUBLISH = '!always_publish!';
+
+read_all_rules($needs_compile);
+compile_output_files();
 lint_output_files();
+write_output_files();
 
-if ($opt_listpromotable) {
-  list_promotable();
-}
-else {
-  write_output_files();
-}
 exit;
 
-sub list_promotable {
-}
-
 # ---------------------------------------------------------------------------
 
 sub lint_output_files {
@@ -146,8 +160,22 @@
       dont_copy_prefs => 1,
       config_text => $text
   });
-  my $res = $mailsa->lint_rules();
-  return $res;       # 0 means good
+
+  my $errors = 0;
+  $mailsa->{lint_callback} = sub {
+    my %opts = @_;
+
+    return if ($opts{msg} =~ /
+          (?:score\sset\sfor\snon-existent|description\sexists)
+      /x);
+
+    warn "lint: $opts{msg}";
+    if ($opts{iserror}) {
+      $errors++;
+    }
+  };
+
+  return $errors;       # 0 means good
 }
 
 sub wanted {
@@ -169,10 +197,7 @@
 
   my $f = "$current_src/$dir$filename";
   my $t;
-
-  if (!$opt_listpromotable) {
-    $t = "$opt_out/$filename";
-  }
+  $t = "$opt_out/$filename";
 
   $needs_compile->{$f} = {
           f => $f,
@@ -184,7 +209,7 @@
 
 # compile all the source files found by the wanted() sub, in sorted
 # order so that the order of precedence makes sense.
-sub compile_sorted {
+sub read_all_rules {
   my ($sources) = @_;
 
   # deal with the perl modules first, so that later linting w/ loadplugin will
@@ -211,7 +236,7 @@
       plugin_file_compile($entry);
     }
     elsif ($entry->{dir} =~ /sandbox/) {
-      rule_file_compile_sandbox($f, $t, $entry->{filename});
+      rule_file_compile($f, $t, $entry->{filename}, 1);
     }
     elsif ($entry->{dir} =~ /extra/) {
       # 'extra' rulesets; not built by default (TODO)
@@ -220,38 +245,28 @@
     else {
       # rules in "core" and "lang" are always copied
       if ($needs_rebuild) {
-        rule_file_compile_core($f, $t, $entry->{filename});
+        rule_file_compile($f, $t, $entry->{filename}, 0);
       }
     }
   }
 }
 
-# implement the validation criteria from
-# http://wiki.apache.org/spamassassin/RulesProjPromotion .
-#
-# Rules are compiled from source dir to output dir. All rules in "core" are
-# always promoted (for backwards compatibility). In addition, rules in the
-# sandboxes will be promoted, if the rules source file contains a "publish
-# core" command prior to that rule.  This command is added (by hand!) to the
-# source file by committers, as the rules pass the validation criteria.
-#
-# The compiler will copy the rules to the output directory. By default, the
-# filename is preserved; so a rule in a file called "20_foo.cf" in the source
-# directory will be output to the file "20_foo.cf".
-#
-# If the rule is not "publish"-tagged, it will be output as a testing rule
-# to "70_sandbox.cf".
-#
+###########################################################################
+
+# Rules are compiled from source dir to output dir.
+# 
+# Rules in "rules/active.list" are promoted to "72_active.cf"; rules not
+# listed there are relegated to "70_sandbox.cf".  There is code to allow
+# other filenames to be selected from the rulesrc .cf file, but I'm not
+# sure if it works anymore ;)
+# 
 # Rules will be autorenamed, if there's a collision between a new rule name and
 # one that's already been output by the compiler in another source file. The
 # autorenaming is very simple -- portions of the current source path are
 # appended to the rule name, sanitised.
 
-my $COMMENTS;
-my $ALWAYS_PUBLISH;
-
-sub rule_file_compile_sandbox {
-  my ($f, $t, $filename) = @_;
+sub rule_file_compile {
+  my ($f, $t, $filename, $issandbox) = @_;
 
   open (IN, "<$f") or die "cannot read $f";
 
@@ -259,23 +274,14 @@
   # full deal here, and it must be fast, since it's run on every
   # "make" invocation
 
-  my $rules = { };
   my $rule_order = [ ];
 
-  # $COMMENTS is a "catch-all" "name", for lines that appear after the last
-  # line that refers to a rule by name.  Those lines are not published by
-  # themselves; they'll be published to all pubfiles found in the file.
-  #
-  # It's assumed they are comments, because they generally are, but could be
-  # all sorts of unparseable lines.
-  $COMMENTS = '!comments!';
-
   my $lastrule = $COMMENTS;
 
-  # another "fake name" for lines that should always be published, to an
-  # output file with the same name as the input file.
-  $ALWAYS_PUBLISH = '!always_publish!';
-  $rules->{$ALWAYS_PUBLISH} = rule_entry_create();
+  if (!defined $rules->{$ALWAYS_PUBLISH}) {
+    $rules->{$ALWAYS_PUBLISH} = rule_entry_create();
+  }
+
   my $ALWAYS = { $ALWAYS_PUBLISH => 1 };
 
   # an "ifplugin" or "if" scope
@@ -314,7 +320,10 @@
       my $val = $3;
 
       my $origname = $name;
-      $name = sandbox_rule_name_avoid_collisions($name, $f);
+      if ($issandbox) {
+        $name = sandbox_rule_name_avoid_collisions($name, $f);
+      }
+      # non-sandbox rules always use the same name
 
       if (!$rules->{$name}) { $rules->{$name} = rule_entry_create(); }
       $rules->{$name}->{origname} = $origname;
@@ -338,7 +347,9 @@
       my $val = $3;
 
       my $origname = $name;
-      $name = sandbox_rule_name_avoid_collisions($name, $f);
+      if ($issandbox) {
+        $name = sandbox_rule_name_avoid_collisions($name, $f);
+      }
 
       if (!$rules->{$name}) { $rules->{$name} = rule_entry_create(); }
       $rules->{$name}->{origname} = $origname;
@@ -350,9 +361,8 @@
       }
       elsif ($command eq 'pubfile') {
         if (!filename_in_manifest($opt_out.'/'.$val)) {
-          my $sbout = $opt_out.'/'.$opt_sandboxout;
-          warn "$val: WARNING: not listed in manifest file, using $sbout\n";
-          $val = $sbout;
+          warn "$val: WARNING: not listed in manifest file, using default\n";
+          next;     # don't set 'pubfile' below
         }
       }
 
@@ -378,7 +388,10 @@
     else {
       my $NAME = $ALWAYS_PUBLISH;
       if ($current_conditional) {
+
         $NAME .= $current_conditional . '!';
+        $NAME =~ s/\n//gs;
+
         unless ($rules->{$NAME}) {
 	  $rules->{$NAME} = rule_entry_create();
 	  $ALWAYS->{$NAME} = 1;
@@ -397,7 +410,7 @@
   }
 
   # now append all the found text to the output file buffers
-  copy_to_output_buffers($rule_order, $rules, $ALWAYS, $f, $filename);
+  copy_to_output_buffers($rule_order, $issandbox, $ALWAYS, $f, $filename);
 
   # ok; file complete.  now mark all those rules as "seen"; future
   # refs to those rule names will trigger an autorename.
@@ -407,7 +420,7 @@
 }
 
 sub copy_to_output_buffers {
-  my ($rule_order, $rules, $ALWAYS, $f, $filename) = @_;
+  my ($rule_order, $issandbox, $ALWAYS, $f, $filename) = @_;
 
   my %already_done = ();
   my $copied = 0;
@@ -422,135 +435,54 @@
       next;     # nothing to write!
     }
 
-    if ($opt_listpromotable) {
-      promo_rule ($rules, $name, $text);
-    }
-    else {
-      copy_rule ($rules, $name, $text, $filename);
-      $copied++;
-    }
-  }
-
-  if (!$opt_listpromotable) {
-    print "$f: $copied sandbox rules copied\n";
-  }
-}
-
-sub copy_rule {
-  my ($rules, $name, $text, $filename) = @_;
-
-  my $f = $rules->{$name}->{srcfile};
-
-  my $pubfile;
-  if ($rules->{$name}->{publish}) {
-    $pubfile = ($rules->{$name}->{pubfile} || $filename);
-    $pubfile = $opt_out.'/'.$pubfile;
-  } else {
-    $pubfile = $opt_out.'/'.$opt_sandboxout;
-  }
-  $output_files->{$pubfile} = 1;
+    my $srcfile = $rules->{$name}->{srcfile};
+    my $pubfile = pubfile_for_rule($rules, $name);
 
-  if (!$output_file_text->{$pubfile}) {
-    $output_file_text->{$pubfile} = output_file_header($f);
-  }
+    $output_files->{$pubfile} = {
+      header => "",
+      # header => "# [compiled from '$srcfile']\n",
+      # don't use that header; we now have multiple srcfiles in each
+      # output file!
+    };
 
-  # fix up any rule renamings we were supposed to do
-  sed_renamed_rule_names(\$text);
+    # fix up any rule renamings we were supposed to do
+    sed_renamed_rule_names(\$text);
 
-  my $cond = $rules->{$name}->{cond};
-  if ($cond) {
-    $output_file_text->{$pubfile} .= $cond.$text."endif\n";
-  }
-  else {
-    $output_file_text->{$pubfile} .= $text;
-  }
+    my $cond = $rules->{$name}->{cond};
+    if ($cond) {
+      $rules->{$name}->{output_text} = "\n".$cond.$text."endif\n";
+    } else {
+      $rules->{$name}->{output_text} = $text;
+    }
+    $rules->{$name}->{output_file} = $pubfile;
 
-  # do we have any end-of-file comments?  if so, add it
-  my $cmts = $rules->{$COMMENTS}->{text};
-  if ($cmts) {
-    $output_file_text->{$pubfile} .= $cmts;
+    $copied++;
   }
 
-  $files_to_lint->{$pubfile} = 1;
+  print "$f: $copied ".
+    ($issandbox ? "sandbox" : "core")." rules copied\n";
 }
 
-sub promo_rule {
-  my ($rules, $name, $text) = @_;
-
-  return unless $promolist->{$name};
-  my $pent = $promolist->{$name};
-  return unless $pent->{promo};
-
-  $text =~ s/^\s+//s;
-  $text =~ s/\s+$//s;
-
-  print "\n## ".("-" x 70)."\n";
-  print "## Promotable rule: $name\n";
-  printf "## so=%5.3f   spc=%5.3f   hpc=%5.3f\n",
-            $pent->{so}, $pent->{spc}, $pent->{hpc};
-  print "## $rules->{$name}->{srcfile}\n";
-  print "## $pent->{detailhref}\n\n";
-  print $text,"\n";
-}
-
-sub rule_file_compile_core {
-  my ($f, $t, $filename) = @_;
-
-  return if $opt_listpromotable;
-
-  my $pubfile = $opt_out.'/'.$filename;
-  $output_files->{$pubfile} = 1;
-
-  open (IN, "<$f") or die "cannot read $f";
-  while (<IN>) {
-    my $orig = $_;
-
-    s/^#reuse/reuse/;   # TODO - dirty hack.  we need to fix this to just be
-    # a keyword which the engine ignores, this is absurd! 
-
-    s/#.*$//g; s/^\s+//; s/\s+$//;
-
-    # always publish non-sandbox lines verbatim.  just note what
-    # rules we've seen, and carry on
-    $output_file_text->{$pubfile} .= $orig;
-
-    # save "lang" declarations
-    my $lang = '';
-    if (s/^lang\s+(\S+)\s+//) {
-      $lang = $1;
-    }
-
-    if (/^
-        (header|rawbody|body|full|uri|meta|mimeheader|describe|
-        tflags|reuse|score)
-        \s+(\S+)\s+(.*)$
-      /x)
-    {
-      # rule definitions
-      my $type = $1;
-      my $name = $2;
-      my $val = $3;
+sub pubfile_for_rule {
+  my ($rules, $name) = @_;
 
-      # just save the name, and ignore the rest; we're already publishing it
-      $seen_rules->{$name} = 1;
+  my $pubfile;
+  if ($rules->{$name}->{publish}) {
+    $pubfile = $rules->{$name}->{pubfile};
+    if ($pubfile) {
+      $pubfile = $opt_out.'/'.$pubfile;
     }
-    elsif (/^
-        (pubfile|publish)
-        \s+(\S+)\s*(.*?)$
-      /x)
-    {
-      # preprocessor directives
-      my $command = $1;
-      my $name = $2;
-      my $val = $3;
+  }
 
-      warn "$f: WARNING: cannot use 'publish' in non-sandbox files\n";
+  # default: "70_sandbox.cf" or "72_active.cf"
+  if (!$pubfile) {
+    if ($active_rules->{$name}) {
+      $pubfile = $opt_out.'/'.$opt_activeout;
+    } else {
+      $pubfile = $opt_out.'/'.$opt_sandboxout;
     }
   }
-  close IN;
-
-  # now append all the found text to the output file buffers
-  print "$f: all lines copied\n";
+  return $pubfile;
 }
 
 sub plugin_file_compile {
@@ -570,6 +502,88 @@
   }
 }
 
+###########################################################################
+
+sub compile_output_files {
+  # create all known output files
+  foreach my $file (keys %$output_files) {
+    my $always_publish_text = $rules->{$ALWAYS_PUBLISH}->{output_text};
+
+    $output_file_text->{$file} = $output_files->{$file}->{header}.
+        $always_publish_text;
+  }
+
+  my @rulenames = sort keys %$rules;
+  my %seen = ();
+
+  # go through the rules looking for meta subrules we
+  # may have forgotten; this happens if a non-subrule is
+  # listed in active.list, the subrules will not be!  fix them
+  # to appear in the same output file as the master rule.
+  foreach my $rule (@rulenames) {
+    fix_up_rule_dependencies($rule);
+  }
+
+  # now repeat, just for rules in the active set; their dependencies should
+  # always be likewise promoted into the active set, overriding the prev step.
+  foreach my $rule (@rulenames) {
+    my $pubfile = $rules->{$rule}->{output_file};
+    next unless ($pubfile && $pubfile =~ /\Q$opt_activeout\E/);
+    fix_up_rule_dependencies($rule);
+  }
+
+  # output the known rules that are not meta subrules.
+  foreach my $rule (@rulenames) {
+    next if ($rule =~ /^__/);
+    my $pubfile = $rules->{$rule}->{output_file};
+    my $text    = $rules->{$rule}->{output_text};
+    next unless defined ($text);
+    $output_file_text->{$pubfile} .= "## ".$rule."\n".$text."\n";
+  }
+
+  # now output all subrules (in a slightly more compact form)
+  foreach my $rule (@rulenames) {
+    next unless ($rule =~ /^__/);
+    my $pubfile = $rules->{$rule}->{output_file};
+    my $text    = $rules->{$rule}->{output_text};
+    next unless defined ($text);
+    $output_file_text->{$pubfile} .= $text;
+  }
+
+  # finally, finish off all output files
+  foreach my $file (keys %$output_files) {
+    # do we have any end-of-file comments?  if so, add it
+    # off: results in comments being duplicated many times
+    # my $cmts = $rules->{$COMMENTS}->{text};
+    # if ($cmts) {
+      # $output_file_text->{$pubfile} .= $cmts;
+    # }
+
+    # and get them lint-checked!
+    $files_to_lint->{$file} = 1;
+  }
+}
+
+sub fix_up_rule_dependencies {
+  my $rule = shift;
+
+  # next if ($rule =~ /^__/);
+  my $pubfile = $rules->{$rule}->{output_file};
+  my $text    = $rules->{$rule}->{output_text};
+  return unless $text;
+  
+  while ($text =~ /^\s*meta\s+(.*)$/mg) {
+    my $line = $1;
+    while ($line =~ /\b([_A-Za-z0-9]+)\b/g) {
+      # force that subrule (if it exists) to output in the
+      # same pubfile
+      my $rule2 = $1;
+      next unless ($rules->{$rule2} && $rules->{$rule2}->{output_text});
+      $rules->{$rule2}->{output_file} = $pubfile;
+    }
+  }
+}
+
 sub write_output_files {
   foreach my $pubfile (sort keys %$output_files) {
     if (-f $pubfile) {
@@ -593,6 +607,8 @@
   }
 }
 
+###########################################################################
+
 sub rule_entry_create {
   return {
     text => '',
@@ -600,6 +616,8 @@
   };
 }
 
+###########################################################################
+
 sub sandbox_rule_name_avoid_collisions {
   my ($rule, $path) = @_;
   my $new;
@@ -640,11 +658,7 @@
   }
 }
 
-sub output_file_header {
-  my ($filename) = @_;
-  my $now = scalar localtime time;
-  return "# [compiled from '$filename' on $now]\n";
-}
+###########################################################################
 
 sub read_manifest {
   my ($mfest) = @_;
@@ -666,6 +680,16 @@
   close IN;
 }
 
+sub read_active {
+  my ($fname) = @_;
+  open (IN, "<$fname") or die "cannot read $fname";
+  while (<IN>) {
+    s/#.*$//; next if /^\s*$/;
+    /^(\S+)/ and $active_rules->{$1} = 1;
+  }
+  close IN;
+}
+
 sub filename_in_manifest {
   my ($fname) = @_;
   return 1 if ($file_manifest->{$fname});
@@ -676,4 +700,12 @@
   return 0;
 }
 
+
+__DATA__
+
+TODO list for this script:
+
+- license blocks at the top of each rulesrc/*/*.cf file need to be
+  ignored when compiling, instead of being duplicated into the
+  compiled output files.
 

Modified: spamassassin/trunk/masses/parse-rules-for-masses
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/parse-rules-for-masses?rev=356657&r1=356656&r2=356657&view=diff
==============================================================================
--- spamassassin/trunk/masses/parse-rules-for-masses (original)
+++ spamassassin/trunk/masses/parse-rules-for-masses Tue Dec 13 16:41:01 2005
@@ -76,7 +76,7 @@
     my %langs = ();
     foreach $file (sort @files) {
       if ($skip_test_rules) {
-	next if ($file =~ /7\d_/);
+	next if ($file =~ /70_/);
       }
       open (IN, "<$file");
       while (<IN>)

Modified: spamassassin/trunk/rules/20_html_tests.cf
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/rules/20_html_tests.cf?rev=356657&r1=356656&r2=356657&view=diff
==============================================================================
--- spamassassin/trunk/rules/20_html_tests.cf (original)
+++ spamassassin/trunk/rules/20_html_tests.cf Tue Dec 13 16:41:01 2005
@@ -347,3 +347,7 @@
 # bug 3070
 rawbody HTML_TINY_FONT	/\<.*font\-size\:[ \"]*[01][^0-9]+.*\>/i
 describe HTML_TINY_FONT	body contains 1 or 0-point font
+
+body __HIGHBITS                     /(?:[\x80-\xff].?){4}/
+# note: __HIGHBITS is used by HTML_CHARSET_FARAWAY
+

Modified: spamassassin/trunk/rules/active.list
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/rules/active.list?rev=356657&r1=356656&r2=356657&view=diff
==============================================================================
--- spamassassin/trunk/rules/active.list (original)
+++ spamassassin/trunk/rules/active.list Tue Dec 13 16:41:01 2005
@@ -1,4 +1,4 @@
-# active ruleset list generated on Mon Dec 12 17:12:08 2005
+# active ruleset list generated on Tue Dec 13 16:38:48 2005
 
 # spam=2.8371 ham=0.0690 so=0.976
 ADVANCE_FEE_2
@@ -15,9 +15,18 @@
 # spam=0.0000 ham=0.3866 so=0.000
 ALL_TRUSTED
 
+# tflags userconf
+AWL
+
 # spam=1.3418 ham=0.0050 so=0.996
 BODY_ENHANCEMENT2
 
+# tflags userconf
+CHARSET_FARAWAY
+
+# tflags userconf
+CHARSET_FARAWAY_HEADER
+
 # spam=0.8825 ham=0.0138 so=0.985
 DATE_IN_FUTURE_03_06
 
@@ -75,6 +84,9 @@
 # spam=0.3132 ham=0.0000 so=1.000
 EM_ROLEX
 
+# tflags userconf
+ENV_AND_HDR_SPF_MATCH
+
 # spam=0.2770 ham=0.0000 so=1.000
 FAKE_HELO_MAIL_COM_DOM
 
@@ -189,6 +201,33 @@
 # spam=0.2923 ham=0.0063 so=0.979
 GAPPY_SUBJECT
 
+# tflags userconf
+GTUBE
+
+# tflags userconf
+HASHCASH_20
+
+# tflags userconf
+HASHCASH_21
+
+# tflags userconf
+HASHCASH_22
+
+# tflags userconf
+HASHCASH_23
+
+# tflags userconf
+HASHCASH_24
+
+# tflags userconf
+HASHCASH_25
+
+# tflags userconf
+HASHCASH_2SPEND
+
+# tflags userconf
+HASHCASH_HIGH
+
 # spam=0.6102 ham=0.0000 so=1.000
 HEADER_SPAM
 
@@ -216,6 +255,9 @@
 # spam=0.7028 ham=0.0038 so=0.995
 HG_HORMONE
 
+# tflags userconf
+HTML_CHARSET_FARAWAY
+
 # spam=2.7616 ham=0.0000 so=1.000
 HTTP_77
 
@@ -255,6 +297,9 @@
 # spam=0.2786 ham=0.0013 so=0.996
 MIME_BOUND_MANY_HEX
 
+# tflags userconf
+MIME_CHARSET_FARAWAY
+
 # spam=1.2325 ham=0.0615 so=0.952
 MIME_HTML_MOSTLY
 
@@ -300,6 +345,12 @@
 # spam=1.4883 ham=0.0025 so=0.998
 NO_PRESCRIPTION
 
+# tflags userconf
+NO_RECEIVED
+
+# tflags userconf
+NO_RELAYS
+
 # spam=0.3899 ham=0.0025 so=0.994
 NUMERIC_HTTP_ADDR
 
@@ -384,6 +435,12 @@
 # spam=1.4109 ham=0.0100 so=0.993
 SORTED_RECIPS
 
+# tflags userconf
+SPF_HELO_PASS
+
+# tflags userconf
+SPF_PASS
+
 # spam=0.3183 ham=0.0075 so=0.977
 SPOOF_OURI
 
@@ -405,6 +462,15 @@
 # spam=0.3877 ham=0.0000 so=1.000
 SUBJECT_FUZZY_PENIS
 
+# tflags userconf
+SUBJECT_IN_BLACKLIST
+
+# tflags userconf
+SUBJECT_IN_WHITELIST
+
+# spam=21.3632 ham=0.0063 so=1.000
+SUBJ_ILLEGAL_CHARS
+
 # spam=0.4457 ham=0.0000 so=1.000
 T_DRUGS_HDIA
 
@@ -462,6 +528,9 @@
 # spam=0.5966 ham=0.0314 so=0.950
 UNIQUE_WORDS
 
+# tflags userconf
+UNPARSEABLE_RELAY
+
 # spam=0.5256 ham=0.0000 so=1.000
 UNRESOLVED_TEMPLATE
 
@@ -476,6 +545,33 @@
 
 # spam=1.9425 ham=0.0013 so=0.999
 URI_NO_WWW_INFO_CGI
+
+# tflags userconf
+USER_IN_ALL_SPAM_TO
+
+# tflags userconf
+USER_IN_BLACKLIST
+
+# tflags userconf
+USER_IN_BLACKLIST_TO
+
+# tflags userconf
+USER_IN_DEF_SPF_WL
+
+# tflags userconf
+USER_IN_DEF_WHITELIST
+
+# tflags userconf
+USER_IN_MORE_SPAM_TO
+
+# tflags userconf
+USER_IN_SPF_WHITELIST
+
+# tflags userconf
+USER_IN_WHITELIST
+
+# tflags userconf
+USER_IN_WHITELIST_TO
 
 # spam=1.5009 ham=0.0653 so=0.958
 US_DOLLARS_3