You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2007/07/27 16:04:12 UTC
svn commit: r560237 - /spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
Author: jm
Date: Fri Jul 27 07:04:10 2007
New Revision: 560237
URL: http://svn.apache.org/viewvc?view=rev&rev=560237
Log:
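seek-phrases-in-log now understands paragraph breaks: N-gram tokens are built separately for each "[p]"-delimited paragraph, and pattern expansion stops at a "[p]" marker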
Modified:
spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
Modified: spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/seek-phrases-in-log?view=diff&rev=560237&r1=560236&r2=560237
==============================================================================
--- spamassassin/trunk/masses/rule-dev/seek-phrases-in-log (original)
+++ spamassassin/trunk/masses/rule-dev/seek-phrases-in-log Fri Jul 27 07:04:10 2007
@@ -121,31 +121,33 @@
(($msgcount % 1000) == 999) and discard_hapaxes();
- my $w1 = '';
- my $w2 = '';
- my $w3 = '';
-
my %tokens = ();
- foreach my $w (split(' ', $text)) {
- # if (length $w > 20) { $w = "sk:".substr($w, 0, 5); }
-
- $w3 = $w2;
- $w2 = $w1;
+ foreach my $line (split(/\[p\]/, $text)) {
+ my $w1 = '';
+ my $w2 = '';
+ my $w3 = '';
+
+ foreach my $w (split(' ', $line)) {
+ # if (length $w > 20) { $w = "sk:".substr($w, 0, 5); }
+
+ $w3 = $w2;
+ $w2 = $w1;
+
+ $w1 = $word2sym{$w};
+ if (!$w1) {
+ $word2sym{$w} = $w1 = $sym_acc;
+ $sym2word{$sym_acc} = $w;
+ $sym_acc++;
+ }
- $w1 = $word2sym{$w};
- if (!$w1) {
- $word2sym{$w} = $w1 = $sym_acc;
- $sym2word{$sym_acc} = $w;
- $sym_acc++;
+ # simple bayesian N-grams to start
+ $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = 1;
+ }
+
+ # deal with leftovers
+ if ($w2 && $w1) {
+ $tokens{"$w2.$w1"} = 1;
}
-
- # simple bayesian N-grams to start
- $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = 1;
- }
-
- # deal with leftovers
- if ($w2 && $w1) {
- $tokens{"$w2.$w1"} = 1;
}
foreach my $tok (keys %tokens) {
@@ -349,7 +351,7 @@
last if ($l > $pat_maxlen); # too long
my $found;
- if ($s =~ /(.)\Q$pat\E/s) { $found = $1; }
+ if ($s =~ /(.)\Q$pat\E/s && $s !~ /\[p\]\Q$pat\E/s) { $found = $1; }
if (!defined $found) {
# start of string. break
@@ -366,7 +368,10 @@
# expand towards end of string
while (1) {
- if (length($pat) > $pat_maxlen || $s !~ /\Q$pat\E(.)/s) {
+ if (length($pat) > $pat_maxlen
+ || $s =~ /\Q$pat\E\[p\]/s
+ || $s !~ /\Q$pat\E(.)/s)
+ {
# end of string. break
last;
}