You are viewing a plain text version of this content. The canonical link for it is here.
Posted to server-dev@james.apache.org by vi...@apache.org on 2005/12/12 15:26:51 UTC

svn commit: r356257 - in /james/server/trunk/src/java/org/apache/james: transport/mailets/BayesianAnalysis.java util/BayesianAnalyzer.java util/JDBCBayesianAnalyzer.java

Author: vincenzo
Date: Mon Dec 12 06:26:29 2005
New Revision: 356257

URL: http://svn.apache.org/viewcvs?rev=356257&view=rev
Log:
1) Fixed JAMES-387 (java.lang.ClassCastException: java.lang.Integer).
2) Some enhancements to reduce memory footprint.

Modified:
    james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java
    james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java
    james/server/trunk/src/java/org/apache/james/util/JDBCBayesianAnalyzer.java

Modified: james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java
URL: http://svn.apache.org/viewcvs/james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java?rev=356257&r1=356256&r2=356257&view=diff
==============================================================================
--- james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java (original)
+++ james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java Mon Dec 12 06:26:29 2005
@@ -340,8 +340,10 @@
         try {
             // this is synchronized to avoid concurrent update of the corpus
             synchronized(JDBCBayesianAnalyzer.DATABASE_LOCK) {
+                analyzer.tokenCountsClear();
                 analyzer.loadHamNSpam(conn);
                 analyzer.buildCorpus();
+                analyzer.tokenCountsClear();
             }
             
             log("BayesianAnalysis Corpus loaded");

Modified: james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java
URL: http://svn.apache.org/viewcvs/james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java?rev=356257&r1=356256&r2=356257&view=diff
==============================================================================
--- james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java (original)
+++ james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java Mon Dec 12 06:26:29 2005
@@ -261,14 +261,21 @@
     public void clear() {
         corpus.clear();
         
-        hamTokenCounts.clear();
-        spamTokenCounts.clear();
+        tokenCountsClear();
         
         hamMessageCount = 0;
         spamMessageCount = 0;
     }
     
     /**
+     * Clears token counters.
+     */
+    public void tokenCountsClear() {
+        hamTokenCounts.clear();
+        spamTokenCounts.clear();
+    }
+    
+    /**
      * Public setter for corpus.
      *
      * @param corpus The new corpus.
@@ -289,17 +296,19 @@
      */
     public void buildCorpus() {
         //Combine the known ham & spam tokens.
-        corpus.putAll(hamTokenCounts);
-        corpus.putAll(spamTokenCounts);
+        Set set = new HashSet(hamTokenCounts.size() + spamTokenCounts.size());
+        set.addAll(hamTokenCounts.keySet());
+        set.addAll(spamTokenCounts.keySet());
+        Map tempCorpus = new HashMap(set.size());
         
         //Iterate through all the tokens and compute their new
         //individual probabilities.
-        Iterator i = corpus.keySet().iterator();
+        Iterator i = set.iterator();
         while (i.hasNext()) {
             String token = (String) i.next();
-            
-            corpus.put(token, new Double(computeProbability(token)));
+            tempCorpus.put(token, new Double(computeProbability(token)));
         }
+        setCorpus(tempCorpus);
     }
     
     /**
@@ -335,13 +344,17 @@
         //Build a set of the tokens in the Stream.
         Set tokens = parse(stream);
         
+        // Get the corpus to use in this run
+        // A new corpus may be being built in the meantime
+        Map workCorpus = getCorpus();
+        
         //Assign their probabilities from the Corpus (using an additional
         //calculation to determine spamminess).
-        SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens);
+        SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);
         
         //Compute and return the overall probability that the
         //stream is SPAM.
-        return computeOverallProbability(tokenProbabilityStrengths);
+        return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
     }
     
     /**
@@ -575,9 +588,10 @@
      * The ordering is from the highest strength to the lowest strength.
      *
      * @param tokens
+     * @param workCorpus
      * @return  SortedSet of TokenProbabilityStrength objects.
      */
-    private SortedSet getTokenProbabilityStrengths(Set tokens) {
+    private SortedSet getTokenProbabilityStrengths(Set tokens, Map workCorpus) {
         //Convert to a SortedSet of token probability strengths.
         SortedSet tokenProbabilityStrengths = new TreeSet();
         
@@ -587,14 +601,15 @@
             
             tps.token = (String) i.next();
             
-            if (corpus.containsKey(tps.token)) {
-                tps.strength = Math.abs(0.5 - ((Double) corpus.get(tps.token)).doubleValue());
+            if (workCorpus.containsKey(tps.token)) {
+                tps.strength = Math.abs(0.5 - ((Double) workCorpus.get(tps.token)).doubleValue());
             }
             else {
                 //This token has never been seen before,
                 //we'll give it initially the default probability.
                 Double corpusProbability = new Double(DEFAULT_TOKEN_PROBABILITY);
                 tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
+                boolean isTokenDegeneratedFound = false;
                 
                 Collection degeneratedTokens = buildDegenerated(tps.token);
                 Iterator iDegenerated = degeneratedTokens.iterator();
@@ -602,17 +617,21 @@
                 double strengthDegenerated;
                 while (iDegenerated.hasNext()) {
                     tokenDegenerated = (String) iDegenerated.next();
-                    if (corpus.containsKey(tokenDegenerated)) {
-                        Double probabilityTemp = (Double) corpus.get(tokenDegenerated);
+                    if (workCorpus.containsKey(tokenDegenerated)) {
+                        Double probabilityTemp = (Double) workCorpus.get(tokenDegenerated);
                         strengthDegenerated = Math.abs(0.5 - probabilityTemp.doubleValue());
                         if (strengthDegenerated > tps.strength) {
+                            isTokenDegeneratedFound = true;
                             tps.strength = strengthDegenerated;
                             corpusProbability = probabilityTemp;
                         }
                     }
                 }
-                synchronized(corpus) {
-                    corpus.put(tps.token, corpusProbability);
+                // to reduce memory usage, put in the corpus only if the probability is different from (stronger than) the default
+                if (isTokenDegeneratedFound) {
+                    synchronized(workCorpus) {
+                        workCorpus.put(tps.token, corpusProbability);
+                    }
                 }
             }
             
@@ -672,9 +691,10 @@
      * the tokenProbabilities SortedSet.
      *
      * @param tokenProbabilities
+     * @param workCorpus
      * @return  Computed spamminess.
      */
-    private double computeOverallProbability(SortedSet tokenProbabilityStrengths) {
+    private double computeOverallProbability(SortedSet tokenProbabilityStrengths, Map workCorpus) {
         double p = 1.0;
         double np = 1.0;
         double tempStrength = 0.5;
@@ -686,9 +706,15 @@
             
             //      System.out.println(tps);
             
-            p *= ((Double) corpus.get(tps.token)).doubleValue();
-            np *= (1.0 - ((Double) corpus.get(tps.token)).doubleValue());
-            //      System.out.println("Token:" + tps.token + ", p=" + ((Double) corpus.get(tps.token)).doubleValue() + ", overall p=" + p / (p + np));
+            double theDoubleValue = DEFAULT_TOKEN_PROBABILITY; // initialize it to the default
+            Double theDoubleObject = (Double) workCorpus.get(tps.token);
+            // if either the original token or a degeneration was found use the double value, otherwise use the default
+            if (theDoubleObject != null) {
+                theDoubleValue = theDoubleObject.doubleValue();
+            }
+            p *= theDoubleValue;
+            np *= (1.0 - theDoubleValue);
+            // System.out.println("Token:" + tps.token + ", p=" + theDoubleValue + ", overall p=" + p / (p + np));
         }
         
         return (p / (p + np));

Modified: james/server/trunk/src/java/org/apache/james/util/JDBCBayesianAnalyzer.java
URL: http://svn.apache.org/viewcvs/james/server/trunk/src/java/org/apache/james/util/JDBCBayesianAnalyzer.java?rev=356257&r1=356256&r2=356257&view=diff
==============================================================================
--- james/server/trunk/src/java/org/apache/james/util/JDBCBayesianAnalyzer.java (original)
+++ james/server/trunk/src/java/org/apache/james/util/JDBCBayesianAnalyzer.java Mon Dec 12 06:26:29 2005
@@ -162,7 +162,12 @@
             
             Map ham = getHamTokenCounts();
             while (rs.next()) {
-                ham.put(rs.getString(1), new Integer(rs.getInt(2)));
+                String token = rs.getString(1);
+                int count = rs.getInt(2);
+                // to reduce memory, use the token only if the count is > 1
+                if (count > 1) {
+                    ham.put(token, new Integer(count));
+                }
             }
             //Verbose.
             delegatedLog("Ham tokens count: " + ham.size());
@@ -176,7 +181,12 @@
             
             Map spam = getSpamTokenCounts();
             while (rs.next()) {
-                spam.put(rs.getString(1), new Integer(rs.getInt(2)));
+                String token = rs.getString(1);
+                int count = rs.getInt(2);
+                // to reduce memory, use the token only if the count is > 1
+                if (count > 1) {
+                    spam.put(token, new Integer(count));
+                }
             }
             
             //Verbose.



---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscribe@james.apache.org
For additional commands, e-mail: server-dev-help@james.apache.org