You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by hs...@apache.org on 2005/07/22 23:01:32 UTC
svn commit: r224398 -
/spamassassin/trunk/masses/evolve_metarule/evolve_metarule.c
Author: hstern
Date: Fri Jul 22 14:01:29 2005
New Revision: 224398
URL: http://svn.apache.org/viewcvs?rev=224398&view=rev
Log:
* evolve_metarule.c
The fitness function was not very effective with large rule sets, so I
changed it a bit. You can now give a tolerance ("flex"): solutions that use
target +/- flex rules score half as well as those that use exactly target
rules. I added an option, -l, to control this.
Modified:
spamassassin/trunk/masses/evolve_metarule/evolve_metarule.c
Modified: spamassassin/trunk/masses/evolve_metarule/evolve_metarule.c
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/evolve_metarule/evolve_metarule.c?rev=224398&r1=224397&r2=224398&view=diff
==============================================================================
--- spamassassin/trunk/masses/evolve_metarule/evolve_metarule.c (original)
+++ spamassassin/trunk/masses/evolve_metarule/evolve_metarule.c Fri Jul 22 14:01:29 2005
@@ -35,6 +35,7 @@
/* Fitness function parameters */
int maximum_relevant_hits = 4; /* How many hits is the rule going to look for. */
int target_num_rules = 50; /* How many sub-rules would we like the meta rule to use? */
+double target_flex_rules = 5; /* How flexible the GA should be. Half-life of the fitness function. */
/* The fitness function is based on:
* min(num_hits, maximum_relevant_hits)^hits_exponent * count * ...
@@ -111,7 +112,7 @@
* sum_{all individuals}
* min(num_hits, maximum_relevant_hits)^hits_exponent * count * ...
* (if ham: -num_hits ^ penalty_exponent)
- * / exp(abs(target_num_rules - num_rules_present) / exp(1))
+ * / exp(abs(target_num_rules - num_rules_present) * log(2) / target_flex_rules)
* */
static boolean pattern_score(population *pop, entity *entity) {
int i, j, num_hits, num_rules_present;
@@ -152,7 +153,7 @@
/* This divisor is bounded below by 1, to prevent overflow. exp(0) is 1,
* so dividing by it would change nothing and we just skip this part (it's unnecessary). */
if ( target_num_rules - num_rules_present ) {
- entity->fitness /= max(exp(fabs(target_num_rules - num_rules_present) / exp(1)),1);
+ entity->fitness /= max(exp(fabs(target_num_rules - num_rules_present) * log(2) / target_flex_rules),1);
}
/* Negative fitnesses make roulette wheels go owwie. */
@@ -189,8 +190,8 @@
}
}
- printf ("fitness: %f\n", entity->fitness);
- printf ("rule count: %d\n", count);
+ fprintf (stderr, "fitness: %f\n", entity->fitness);
+ fprintf (stderr, "rule count: %d\n", count);
/* Zero the histogram, just in case the compiler hasn't done it for us. */
bzero (histogram, sizeof(histogram));
@@ -214,14 +215,14 @@
}
/* Print the histogram. */
- printf ("\t %8s %8s %8s %8s %8s\n",
+ fprintf (stderr, "\t %8s %8s %8s %8s %8s\n",
"HAM",
"HAM%",
"SPAM",
"SPAM%",
"S/O");
for (i = 0; i <= maximum_relevant_hits; i++) {
- printf (">=%d hits:%8d %8.4f %8d %8.4f %8.4f\n", i,
+ fprintf (stderr, ">=%d hits:%8d %8.4f %8d %8.4f %8.4f\n", i,
histogram[0][i],
100.0 * histogram[0][i] / histogram[0][0],
histogram[1][i],
@@ -239,6 +240,7 @@
"\nFitness function parameters:\n"
" -m maximum_relevant_hits\n"
" -t target_num_rules\n"
+ " -l target_flex_rules\n"
" -e hits_exponent\n"
" -p penalty_exponent\n"
"\nGA parameters:\n"
@@ -256,7 +258,7 @@
population *pop = 0;
char arg;
- while ((arg = getopt (argc, argv, "h:r:m:t:e:p:s:g:x:u:?")) != -1) {
+ while ((arg = getopt (argc, argv, "h:r:m:t:l:e:p:s:g:x:u:?")) != -1) {
switch (arg) {
case 'h':
hits_file = optarg;
@@ -269,6 +271,9 @@
break;
case 't':
target_num_rules = atoi(optarg);
+ break;
+ case 'l':
+ target_flex_rules = atoi(optarg);
break;
case 'e':
hits_exponent = atof(optarg);