You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2007/07/27 16:04:12 UTC

svn commit: r560237 - /spamassassin/trunk/masses/rule-dev/seek-phrases-in-log

Author: jm
Date: Fri Jul 27 07:04:10 2007
New Revision: 560237

URL: http://svn.apache.org/viewvc?view=rev&rev=560237
Log:
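seek-phrases now understands paragraph breaks: text is split on "[p]" markers so N-gram tokens never span two paragraphs, and pattern expansion stops at a "[p]" boundary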

Modified:
    spamassassin/trunk/masses/rule-dev/seek-phrases-in-log

Modified: spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/seek-phrases-in-log?view=diff&rev=560237&r1=560236&r2=560237
==============================================================================
--- spamassassin/trunk/masses/rule-dev/seek-phrases-in-log (original)
+++ spamassassin/trunk/masses/rule-dev/seek-phrases-in-log Fri Jul 27 07:04:10 2007
@@ -121,31 +121,33 @@
 
   (($msgcount % 1000) == 999) and discard_hapaxes();
 
-  my $w1 = '';
-  my $w2 = '';
-  my $w3 = '';
-
   my %tokens = ();
-  foreach my $w (split(' ', $text)) {
-    # if (length $w > 20) { $w = "sk:".substr($w, 0, 5); }
-
-    $w3 = $w2;
-    $w2 = $w1;
+  foreach my $line (split(/\[p\]/, $text)) {
+    my $w1 = '';
+    my $w2 = '';
+    my $w3 = '';
+
+    foreach my $w (split(' ', $line)) {
+      # if (length $w > 20) { $w = "sk:".substr($w, 0, 5); }
+
+      $w3 = $w2;
+      $w2 = $w1;
+
+      $w1 = $word2sym{$w};
+      if (!$w1) {
+        $word2sym{$w} = $w1 = $sym_acc;
+        $sym2word{$sym_acc} = $w;
+        $sym_acc++;
+      }
 
-    $w1 = $word2sym{$w};
-    if (!$w1) {
-      $word2sym{$w} = $w1 = $sym_acc;
-      $sym2word{$sym_acc} = $w;
-      $sym_acc++;
+      # simple bayesian N-grams to start
+      $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = 1;
+    }
+    
+    # deal with leftovers
+    if ($w2 && $w1) {
+      $tokens{"$w2.$w1"} = 1;
     }
-
-    # simple bayesian N-grams to start
-    $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = 1;
-  }
-  
-  # deal with leftovers
-  if ($w2 && $w1) {
-    $tokens{"$w2.$w1"} = 1;
   }
 
   foreach my $tok (keys %tokens) {
@@ -349,7 +351,7 @@
       last if ($l > $pat_maxlen);     # too long
 
       my $found;
-      if ($s =~ /(.)\Q$pat\E/s) { $found = $1; }
+      if ($s =~ /(.)\Q$pat\E/s && $s !~ /\[p\]\Q$pat\E/s) { $found = $1; }
 
       if (!defined $found) {
         # start of string.  break
@@ -366,7 +368,10 @@
 
     # expand towards end of string
     while (1) {
-      if (length($pat) > $pat_maxlen || $s !~ /\Q$pat\E(.)/s) {
+      if (length($pat) > $pat_maxlen
+         || $s =~ /\Q$pat\E\[p\]/s
+         || $s !~ /\Q$pat\E(.)/s)
+      {
         # end of string.  break
         last;
       }