You are viewing a plain text version of this content. The canonical link for it is here.
Posted to server-dev@james.apache.org by vi...@apache.org on 2005/12/12 15:26:51 UTC
svn commit: r356257 - in /james/server/trunk/src/java/org/apache/james:
transport/mailets/BayesianAnalysis.java util/BayesianAnalyzer.java
util/JDBCBayesianAnalyzer.java
Author: vincenzo
Date: Mon Dec 12 06:26:29 2005
New Revision: 356257
URL: http://svn.apache.org/viewcvs?rev=356257&view=rev
Log:
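1) Fixed JAMES-387 (java.lang.ClassCastException: java.lang.Integer): buildCorpus() now computes the token probabilities in a temporary map and installs it with setCorpus(), so the corpus in use never holds Integer token counts where Double probabilities are expected.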
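2) Some enhancements to reduce memory footprint: the ham/spam token counts are cleared before loading and again once the corpus has been built, tokens with a count of 1 are no longer loaded from the database, and unknown tokens are cached in the corpus only when a degenerated form yields a probability stronger than the default.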
Modified:
james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java
james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java
james/server/trunk/src/java/org/apache/james/util/JDBCBayesianAnalyzer.java
Modified: james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java
URL: http://svn.apache.org/viewcvs/james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java?rev=356257&r1=356256&r2=356257&view=diff
==============================================================================
--- james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java (original)
+++ james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java Mon Dec 12 06:26:29 2005
@@ -340,8 +340,10 @@
try {
// this is synchronized to avoid concurrent update of the corpus
synchronized(JDBCBayesianAnalyzer.DATABASE_LOCK) {
+ analyzer.tokenCountsClear();
analyzer.loadHamNSpam(conn);
analyzer.buildCorpus();
+ analyzer.tokenCountsClear();
}
log("BayesianAnalysis Corpus loaded");
Modified: james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java
URL: http://svn.apache.org/viewcvs/james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java?rev=356257&r1=356256&r2=356257&view=diff
==============================================================================
--- james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java (original)
+++ james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java Mon Dec 12 06:26:29 2005
@@ -261,14 +261,21 @@
public void clear() {
corpus.clear();
- hamTokenCounts.clear();
- spamTokenCounts.clear();
+ tokenCountsClear();
hamMessageCount = 0;
spamMessageCount = 0;
}
/**
+ * Clears token counters.
+ */
+ public void tokenCountsClear() {
+ hamTokenCounts.clear();
+ spamTokenCounts.clear();
+ }
+
+ /**
* Public setter for corpus.
*
* @param corpus The new corpus.
@@ -289,17 +296,19 @@
*/
public void buildCorpus() {
//Combine the known ham & spam tokens.
- corpus.putAll(hamTokenCounts);
- corpus.putAll(spamTokenCounts);
+ Set set = new HashSet(hamTokenCounts.size() + spamTokenCounts.size());
+ set.addAll(hamTokenCounts.keySet());
+ set.addAll(spamTokenCounts.keySet());
+ Map tempCorpus = new HashMap(set.size());
//Iterate through all the tokens and compute their new
//individual probabilities.
- Iterator i = corpus.keySet().iterator();
+ Iterator i = set.iterator();
while (i.hasNext()) {
String token = (String) i.next();
-
- corpus.put(token, new Double(computeProbability(token)));
+ tempCorpus.put(token, new Double(computeProbability(token)));
}
+ setCorpus(tempCorpus);
}
/**
@@ -335,13 +344,17 @@
//Build a set of the tokens in the Stream.
Set tokens = parse(stream);
+ // Get the corpus to use in this run
+ // A new corpus may be being built in the meantime
+ Map workCorpus = getCorpus();
+
//Assign their probabilities from the Corpus (using an additional
//calculation to determine spamminess).
- SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens);
+ SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);
//Compute and return the overall probability that the
//stream is SPAM.
- return computeOverallProbability(tokenProbabilityStrengths);
+ return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
}
/**
@@ -575,9 +588,10 @@
* The ordering is from the highest strength to the lowest strength.
*
* @param tokens
+ * @param workCorpus
* @return SortedSet of TokenProbabilityStrength objects.
*/
- private SortedSet getTokenProbabilityStrengths(Set tokens) {
+ private SortedSet getTokenProbabilityStrengths(Set tokens, Map workCorpus) {
//Convert to a SortedSet of token probability strengths.
SortedSet tokenProbabilityStrengths = new TreeSet();
@@ -587,14 +601,15 @@
tps.token = (String) i.next();
- if (corpus.containsKey(tps.token)) {
- tps.strength = Math.abs(0.5 - ((Double) corpus.get(tps.token)).doubleValue());
+ if (workCorpus.containsKey(tps.token)) {
+ tps.strength = Math.abs(0.5 - ((Double) workCorpus.get(tps.token)).doubleValue());
}
else {
//This token has never been seen before,
//we'll give it initially the default probability.
Double corpusProbability = new Double(DEFAULT_TOKEN_PROBABILITY);
tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
+ boolean isTokenDegeneratedFound = false;
Collection degeneratedTokens = buildDegenerated(tps.token);
Iterator iDegenerated = degeneratedTokens.iterator();
@@ -602,17 +617,21 @@
double strengthDegenerated;
while (iDegenerated.hasNext()) {
tokenDegenerated = (String) iDegenerated.next();
- if (corpus.containsKey(tokenDegenerated)) {
- Double probabilityTemp = (Double) corpus.get(tokenDegenerated);
+ if (workCorpus.containsKey(tokenDegenerated)) {
+ Double probabilityTemp = (Double) workCorpus.get(tokenDegenerated);
strengthDegenerated = Math.abs(0.5 - probabilityTemp.doubleValue());
if (strengthDegenerated > tps.strength) {
+ isTokenDegeneratedFound = true;
tps.strength = strengthDegenerated;
corpusProbability = probabilityTemp;
}
}
}
- synchronized(corpus) {
- corpus.put(tps.token, corpusProbability);
+ // to reduce memory usage, put in the corpus only if the probability is different from (stronger than) the default
+ if (isTokenDegeneratedFound) {
+ synchronized(workCorpus) {
+ workCorpus.put(tps.token, corpusProbability);
+ }
}
}
@@ -672,9 +691,10 @@
* the tokenProbabilities SortedSet.
*
* @param tokenProbabilities
+ * @param workCorpus
* @return Computed spamminess.
*/
- private double computeOverallProbability(SortedSet tokenProbabilityStrengths) {
+ private double computeOverallProbability(SortedSet tokenProbabilityStrengths, Map workCorpus) {
double p = 1.0;
double np = 1.0;
double tempStrength = 0.5;
@@ -686,9 +706,15 @@
// System.out.println(tps);
- p *= ((Double) corpus.get(tps.token)).doubleValue();
- np *= (1.0 - ((Double) corpus.get(tps.token)).doubleValue());
- // System.out.println("Token:" + tps.token + ", p=" + ((Double) corpus.get(tps.token)).doubleValue() + ", overall p=" + p / (p + np));
+ double theDoubleValue = DEFAULT_TOKEN_PROBABILITY; // initialize it to the default
+ Double theDoubleObject = (Double) workCorpus.get(tps.token);
+ // if either the original token or a degeneration was found use the double value, otherwise use the default
+ if (theDoubleObject != null) {
+ theDoubleValue = theDoubleObject.doubleValue();
+ }
+ p *= theDoubleValue;
+ np *= (1.0 - theDoubleValue);
+ // System.out.println("Token:" + tps.token + ", p=" + theDoubleValue + ", overall p=" + p / (p + np));
}
return (p / (p + np));
Modified: james/server/trunk/src/java/org/apache/james/util/JDBCBayesianAnalyzer.java
URL: http://svn.apache.org/viewcvs/james/server/trunk/src/java/org/apache/james/util/JDBCBayesianAnalyzer.java?rev=356257&r1=356256&r2=356257&view=diff
==============================================================================
--- james/server/trunk/src/java/org/apache/james/util/JDBCBayesianAnalyzer.java (original)
+++ james/server/trunk/src/java/org/apache/james/util/JDBCBayesianAnalyzer.java Mon Dec 12 06:26:29 2005
@@ -162,7 +162,12 @@
Map ham = getHamTokenCounts();
while (rs.next()) {
- ham.put(rs.getString(1), new Integer(rs.getInt(2)));
+ String token = rs.getString(1);
+ int count = rs.getInt(2);
+ // to reduce memory, use the token only if the count is > 1
+ if (count > 1) {
+ ham.put(token, new Integer(count));
+ }
}
//Verbose.
delegatedLog("Ham tokens count: " + ham.size());
@@ -176,7 +181,12 @@
Map spam = getSpamTokenCounts();
while (rs.next()) {
- spam.put(rs.getString(1), new Integer(rs.getInt(2)));
+ String token = rs.getString(1);
+ int count = rs.getInt(2);
+ // to reduce memory, use the token only if the count is > 1
+ if (count > 1) {
+ spam.put(token, new Integer(count));
+ }
}
//Verbose.
---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscribe@james.apache.org
For additional commands, e-mail: server-dev-help@james.apache.org