Posted to notifications@james.apache.org by bt...@apache.org on 2021/07/06 01:27:06 UTC

[james-project] branch master updated: [REFACTORING] Remove unused BayesianAnalyzer and related class (#526)

This is an automated email from the ASF dual-hosted git repository.

btellier pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/james-project.git


The following commit(s) were added to refs/heads/master by this push:
     new 0b1d460  [REFACTORING] Remove unused BayesianAnalyzer and related class (#526)
0b1d460 is described below

commit 0b1d460c5e7a00ebf2eb85a2fcef4f736fc5e8c1
Author: Tellier Benoit <bt...@linagora.com>
AuthorDate: Tue Jul 6 08:26:56 2021 +0700

    [REFACTORING] Remove unused BayesianAnalyzer and related class (#526)
---
 .../spring-app/src/main/resources/sqlResources.xml | 136 ----
 .../james/util/bayesian/BayesianAnalyzer.java      | 714 ---------------------
 .../james/util/bayesian/JDBCBayesianAnalyzer.java  | 381 -----------
 3 files changed, 1231 deletions(-)

diff --git a/server/apps/spring-app/src/main/resources/sqlResources.xml b/server/apps/spring-app/src/main/resources/sqlResources.xml
index 286a994..07f7c7b 100644
--- a/server/apps/spring-app/src/main/resources/sqlResources.xml
+++ b/server/apps/spring-app/src/main/resources/sqlResources.xml
@@ -335,142 +335,6 @@
         </sql>
     </sqlDefs>
 
-
-<!-- SQL statements to support the BayesianAnalysis and the BayesianAnalysisFeeder mailets -->
-<!-- -->
-    <sqlDefs name="org.apache.james.util.bayesian.JDBCBayesianAnalyzer">
-
-        <sql name="hamTableName">bayesiananalysis_ham</sql>
-        <sql name="spamTableName">bayesiananalysis_spam</sql>
-        <sql name="messageCountsTableName">bayesiananalysis_messagecounts</sql>
-
-        <!-- Statements used to retrieve the message counts. -->
-        <sql name="selectMessageCounts">SELECT HAMCOUNT, SPAMCOUNT FROM bayesiananalysis_messagecounts</sql>
-
-        <!-- Statements used to initialize the message counts. -->
-        <sql name="initializeMessageCounts">INSERT INTO bayesiananalysis_messagecounts (HAMCOUNT, SPAMCOUNT) VALUES (0,0)</sql>
-
-        <!-- Statements used to update the ham message counts. -->
-        <sql name="updateHamMessageCounts">UPDATE bayesiananalysis_messagecounts SET HAMCOUNT=(HAMCOUNT + ?)</sql>
-
-        <!-- Statements used to update the spam message counts. -->
-        <sql name="updateSpamMessageCounts">UPDATE bayesiananalysis_messagecounts SET SPAMCOUNT=(SPAMCOUNT + ?)</sql>
-
-        <!-- Statements used to retrieve the ham token counts. -->
-        <sql name="selectHamTokens">SELECT TOKEN, OCCURRENCES FROM bayesiananalysis_ham</sql>
-
-        <!-- Statements used to retrieve the spam token counts. -->
-        <sql name="selectSpamTokens">SELECT TOKEN, OCCURRENCES FROM bayesiananalysis_spam</sql>
-
-        <!-- Statements used to insert the ham token counts. -->
-        <sql name="insertHamToken">INSERT INTO bayesiananalysis_ham (TOKEN, OCCURRENCES) VALUES (?,?)</sql>
-
-        <!-- Statements used to insert the spam token counts. -->
-        <sql name="insertSpamToken">INSERT INTO bayesiananalysis_spam (TOKEN, OCCURRENCES) VALUES (?,?)</sql>
-
-        <!-- Statements used to update the ham token counts. -->
-        <sql name="updateHamToken">UPDATE bayesiananalysis_ham SET OCCURRENCES=(OCCURRENCES + ?) WHERE (TOKEN=?)</sql>
-
-        <!-- Statements used to update the spam token counts. -->
-        <sql name="updateSpamToken">UPDATE bayesiananalysis_spam SET OCCURRENCES=(OCCURRENCES + ?) WHERE (TOKEN=?)</sql>
-
-        <!-- Statements used to delete ham tokens. -->
-        <sql name="deleteHamTokens">DELETE FROM bayesiananalysis_ham</sql>
-
-        <!-- Statements used to delete spam tokens. -->
-        <sql name="deleteSpamTokens">DELETE FROM bayesiananalysis_spam</sql>
-
-        <!-- Statements used to delete message counts. -->
-        <sql name="deleteMessageCounts">DELETE FROM bayesiananalysis_messagecounts</sql>
-
-
-        <!-- Statements used to create the "ham" table (the 'token' field must be case sensitive). -->
-        <sql name="createHamTable" db="mysql">
-            CREATE TABLE bayesiananalysis_ham (
-            token varchar(128) binary NOT NULL default '',
-            occurrences int(11) NOT NULL default '0',
-            PRIMARY KEY (token)
-            ) TYPE=InnoDB
-        </sql>
-        <sql name="createHamTable" db="mssql">
-            CREATE TABLE [bayesiananalysis_ham] (
-            [token] [varchar] (128) COLLATE Latin1_General_CS_AS NOT NULL,
-            [occurrences] [int] NOT NULL default (0),
-            PRIMARY KEY (token)
-            )
-        </sql>
-        <sql name="createHamTable" db="derby">
-            CREATE TABLE bayesiananalysis_ham (
-            token varchar(128) NOT NULL,
-            occurrences INTEGER NOT NULL default 0,
-            PRIMARY KEY (token)
-            )
-        </sql>
-        <sql name="createHamTable" db="postgresql">
-            CREATE TABLE bayesiananalysis_ham (
-            token varchar(128) NOT NULL,
-            occurrences int NOT NULL default 0,
-            PRIMARY KEY (token)
-            )
-        </sql>
-
-        <!-- Statements used to create the "spam" table (the 'token' field must be case sensitive). -->
-        <sql name="createSpamTable" db="mysql">
-            CREATE TABLE bayesiananalysis_spam (
-            token varchar(128) binary NOT NULL default '',
-            occurrences int(11) NOT NULL default '0',
-            PRIMARY KEY (token)
-            ) TYPE=InnoDB
-        </sql>
-        <sql name="createSpamTable" db="mssql">
-            CREATE TABLE [bayesiananalysis_spam] (
-            [token] [varchar] (128) COLLATE Latin1_General_CS_AS NOT NULL,
-            [occurrences] [int] NOT NULL default (0),
-            PRIMARY KEY (token)
-            )
-        </sql>
-        <sql name="createSpamTable" db="derby">
-            CREATE TABLE bayesiananalysis_spam (
-            token varchar (128) NOT NULL,
-            occurrences INTEGER  NOT NULL default 0,
-            PRIMARY KEY (token)
-            )
-        </sql>
-        <sql name="createSpamTable" db="postgresql">
-            CREATE TABLE bayesiananalysis_spam (
-            token varchar (128) NOT NULL,
-            occurrences int  NOT NULL default 0,
-            PRIMARY KEY (token)
-            )
-        </sql>
-
-        <!-- Statements used to create the "message counts" table. -->
-        <sql name="createMessageCountsTable" db="mysql">
-            CREATE TABLE bayesiananalysis_messagecounts (
-            hamcount int(11) NOT NULL default '0',
-            spamcount int(11) NOT NULL default '0'
-            ) TYPE=InnoDB
-        </sql>
-        <sql name="createMessageCountsTable" db="mssql">
-            CREATE TABLE [bayesiananalysis_messagecounts] (
-            [hamcount] [int] NOT NULL default (0),
-            [spamcount] [int] NOT NULL default (0)
-            )
-        </sql>
-        <sql name="createMessageCountsTable" db="derby">
-            CREATE TABLE bayesiananalysis_messagecounts (
-            hamcount INTEGER NOT NULL default 0,
-            spamcount INTEGER  NOT NULL default 0
-            )
-        </sql>
-        <sql name="createMessageCountsTable" db="postgresql">
-            CREATE TABLE bayesiananalysis_messagecounts (
-            hamcount int NOT NULL default 0,
-            spamcount int  NOT NULL default 0
-            )
-        </sql>
-    </sqlDefs>
-
 <!-- SQL statements to support the WhiteListManager mailet and the IsInWhiteList matcher -->
 <!-- -->
     <sqlDefs name="WhiteList">
diff --git a/server/container/util/src/main/java/org/apache/james/util/bayesian/BayesianAnalyzer.java b/server/container/util/src/main/java/org/apache/james/util/bayesian/BayesianAnalyzer.java
deleted file mode 100644
index bf6391d..0000000
--- a/server/container/util/src/main/java/org/apache/james/util/bayesian/BayesianAnalyzer.java
+++ /dev/null
@@ -1,714 +0,0 @@
-/****************************************************************
- * Licensed to the Apache Software Foundation (ASF) under one   *
- * or more contributor license agreements.  See the NOTICE file *
- * distributed with this work for additional information        *
- * regarding copyright ownership.  The ASF licenses this file   *
- * to you under the Apache License, Version 2.0 (the            *
- * "License"); you may not use this file except in compliance   *
- * with the License.  You may obtain a copy of the License at   *
- *                                                              *
- *   http://www.apache.org/licenses/LICENSE-2.0                 *
- *                                                              *
- * Unless required by applicable law or agreed to in writing,   *
- * software distributed under the License is distributed on an  *
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
- * KIND, either express or implied.  See the License for the    *
- * specific language governing permissions and limitations      *
- * under the License.                                           *
- ****************************************************************/
-
-package org.apache.james.util.bayesian;
-
-import java.io.IOException;
-import java.io.Reader;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Set;
-import java.util.SortedSet;
-import java.util.TreeSet;
-import java.util.stream.IntStream;
-
-/**
- * <p>
- * Determines probability that text contains Spam.
- * </p>
- * <p/>
- * <p>
- * Based upon Paul Graham's <a href="http://www.paulgraham.com/spam.html">A Plan
- * for Spam</a>. Extended to Paul Graham's <a
- * href="http://paulgraham.com/better.html">Better Bayesian Filtering</a>.
- * </p>
- * <p/>
- * <p>
- * Sample method usage:
- * </p>
- * <p/>
- * <p>
- * Use: void addHam(Reader) and void addSpam(Reader)
- * <p/>
- * methods to build up the Maps of ham & spam tokens/occurrences. Both addHam
- * and addSpam assume they're reading one message at a time; if you feed more
- * than one message per call, be sure to adjust the appropriate message counter:
- * hamMessageCount or spamMessageCount.
- * <p/>
- * Then...
- * </p>
- * <p/>
- * <p>
- * Use: void buildCorpus()
- * <p/>
- * to build the final token/probabilities Map.
- * <p/>
- * Use your own methods for persistent storage of the individual ham/spam
- * corpus & message counts, and/or of the final corpus.
- * <p/>
- * Then you can...
- * </p>
- * <p/>
- * <p>
- * Use: double computeSpamProbability(Reader)
- * <p/>
- * to determine the probability that a particular text contains spam. A returned
- * result of 0.9 or above is an indicator that the text was spam.
- * </p>
- * <p/>
- * <p>
- * If you use persistent storage, use: void setCorpus(Map)
- * <p/>
- * before calling computeSpamProbability.
- * </p>
- *
- * @since 2.3.0
- */
-
-public class BayesianAnalyzer {
-
-    private final Object lock = new Object();
-
-    /**
-     * Number of "interesting" tokens to use to compute overall spamminess
-     * probability.
-     */
-    private static final int MAX_INTERESTING_TOKENS = 15;
-
-    /**
-     * Minimum probability distance from 0.5 to consider a token "interesting"
-     * to use to compute overall spamminess probability.
-     */
-    private static final double INTERESTINGNESS_THRESHOLD = 0.46;
-
-    /**
-     * Default token probability to use when a token has not been encountered
-     * before.
-     */
-    private static final double DEFAULT_TOKEN_PROBABILITY = 0.4;
-
-    /**
-     * Map of ham tokens and their occurrences.
-     */
-    private Map<String, Integer> hamTokenCounts = new HashMap<>();
-
-    /**
-     * Map of spam tokens and their occurrences.
-     */
-    private Map<String, Integer> spamTokenCounts = new HashMap<>();
-
-    /**
-     * Number of ham messages analyzed.
-     */
-    private int hamMessageCount = 0;
-
-    /**
-     * Number of spam messages analyzed.
-     */
-    private int spamMessageCount = 0;
-
-    /**
-     * Final token/probability corpus.
-     */
-    private Map<String, Double> corpus = new HashMap<>();
-
-    /**
-     * Inner class for managing Token Probability Strengths during the
-     * computeSpamProbability phase.
-     * <p/>
-     * By probability <i>strength</i> we mean the absolute distance of a
-     * probability from the middle value 0.5.
-     * <p/>
-     * It implements Comparable so that its sorting is automatic.
-     */
-    private static class TokenProbabilityStrength implements Comparable<TokenProbabilityStrength> {
-        /**
-         * Message token.
-         */
-        String token = null;
-
-        /**
-         * Token's computed probability strength.
-         */
-        double strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
-
-        /**
-         * Force the natural sort order for this object to be high-to-low.
-         *
-         * @param anotherTokenProbabilityStrength
-         *         A TokenProbabilityStrength instance to compare this
-         *         instance with.
-         * @return The result of the comparison (before, equal, after).
-         */
-        @Override
-        public final int compareTo(TokenProbabilityStrength anotherTokenProbabilityStrength) {
-            int result = (int) ((anotherTokenProbabilityStrength.strength - strength) * 1000000);
-            if (result == 0) {
-                return this.token.compareTo(anotherTokenProbabilityStrength.token);
-            } else {
-                return result;
-            }
-        }
-
-        /**
-         * Simple toString() implementation, mostly for debugging purposes.
-         *
-         * @return String representation of this object.
-         */
-        public String toString() {
-            StringBuilder sb = new StringBuilder(30);
-
-            sb.append(token).append("=").append(strength);
-
-            return sb.toString();
-        }
-    }
-
-    /**
-     * Basic class constructor.
-     */
-    public BayesianAnalyzer() {
-    }
-
-    /**
-     * Public setter for the hamTokenCounts Map.
-     *
-     * @param hamTokenCounts The new ham Token counts Map.
-     */
-    public void setHamTokenCounts(Map<String, Integer> hamTokenCounts) {
-        this.hamTokenCounts = hamTokenCounts;
-    }
-
-    /**
-     * Public getter for the hamTokenCounts Map.
-     */
-    public Map<String, Integer> getHamTokenCounts() {
-        return this.hamTokenCounts;
-    }
-
-    /**
-     * Public setter for the spamTokenCounts Map.
-     *
-     * @param spamTokenCounts The new spam Token counts Map.
-     */
-    public void setSpamTokenCounts(Map<String, Integer> spamTokenCounts) {
-        this.spamTokenCounts = spamTokenCounts;
-    }
-
-    /**
-     * Public getter for the spamTokenCounts Map.
-     */
-    public Map<String, Integer> getSpamTokenCounts() {
-        return this.spamTokenCounts;
-    }
-
-    /**
-     * Public setter for spamMessageCount.
-     *
-     * @param spamMessageCount The new spam message count.
-     */
-    public void setSpamMessageCount(int spamMessageCount) {
-        this.spamMessageCount = spamMessageCount;
-    }
-
-    /**
-     * Public getter for spamMessageCount.
-     */
-    public int getSpamMessageCount() {
-        return this.spamMessageCount;
-    }
-
-    /**
-     * Public setter for hamMessageCount.
-     *
-     * @param hamMessageCount The new ham message count.
-     */
-    public void setHamMessageCount(int hamMessageCount) {
-        this.hamMessageCount = hamMessageCount;
-    }
-
-    /**
-     * Public getter for hamMessageCount.
-     */
-    public int getHamMessageCount() {
-        return this.hamMessageCount;
-    }
-
-    /**
-     * Clears all analysis repositories and counters.
-     */
-    public void clear() {
-        corpus.clear();
-
-        tokenCountsClear();
-
-        hamMessageCount = 0;
-        spamMessageCount = 0;
-    }
-
-    /**
-     * Clears token counters.
-     */
-    public void tokenCountsClear() {
-        hamTokenCounts.clear();
-        spamTokenCounts.clear();
-    }
-
-    /**
-     * Public setter for corpus.
-     *
-     * @param corpus The new corpus.
-     */
-    public void setCorpus(Map<String, Double> corpus) {
-        this.corpus = corpus;
-    }
-
-    /**
-     * Public getter for corpus.
-     */
-    public Map<String, Double> getCorpus() {
-        return this.corpus;
-    }
-
-    /**
-     * Builds the corpus from the existing ham & spam counts.
-     */
-    public void buildCorpus() {
-        // Combine the known ham & spam tokens.
-        Set<String> set = new HashSet<>(hamTokenCounts.size() + spamTokenCounts.size());
-        set.addAll(hamTokenCounts.keySet());
-        set.addAll(spamTokenCounts.keySet());
-        Map<String, Double> tempCorpus = new HashMap<>(set.size());
-
-        // Iterate through all the tokens and compute their new
-        // individual probabilities.
-        for (String token : set) {
-            tempCorpus.put(token, computeProbability(token));
-        }
-        setCorpus(tempCorpus);
-    }
-
-    /**
-     * Adds a message to the ham list.
-     *
-     * @param stream A reader stream on the ham message to analyze
-     * @throws IOException If any error occurs
-     */
-    public void addHam(Reader stream) throws java.io.IOException {
-        addTokenOccurrences(stream, hamTokenCounts);
-        hamMessageCount++;
-    }
-
-    /**
-     * Adds a message to the spam list.
-     *
-     * @param stream A reader stream on the spam message to analyze
-     * @throws IOException If any error occurs
-     */
-    public void addSpam(Reader stream) throws java.io.IOException {
-        addTokenOccurrences(stream, spamTokenCounts);
-        spamMessageCount++;
-    }
-
-    /**
-     * Computes the probability that the stream contains SPAM.
-     *
-     * @param stream The text to be analyzed for Spamminess.
-     * @return A 0.0 - 1.0 probability
-     * @throws IOException If any error occurs
-     */
-    public double computeSpamProbability(Reader stream) throws java.io.IOException {
-        // Build a set of the tokens in the Stream.
-        Set<String> tokens = parse(stream);
-
-        // Get the corpus to use in this run
-        // A new corpus may be being built in the meantime
-        Map<String, Double> workCorpus = getCorpus();
-
-        // Assign their probabilities from the Corpus (using an additional
-        // calculation to determine spamminess).
-        SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);
-
-        // Compute and return the overall probability that the
-        // stream is SPAM.
-        return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
-    }
-
-    /**
-     * Parses a stream into tokens, and updates the target Map with the
-     * token/counts.
-     *
-     * @param stream
-     * @param target
-     */
-    private void addTokenOccurrences(Reader stream, Map<String, Integer> target) throws java.io.IOException {
-        String token;
-        String header = "";
-
-        // Update target with the tokens/count encountered.
-        while ((token = nextToken(stream)) != null) {
-            boolean endingLine = false;
-            if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
-                endingLine = true;
-                token = token.substring(0, token.length() - 1);
-            }
-
-            if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
-                if (token.equals("From:") || token.equals("Return-Path:") || token.equals("Subject:") || token.equals("To:")) {
-                    header = token;
-                    if (!endingLine) {
-                        continue;
-                    }
-                }
-
-                token = header + token;
-
-                Integer value;
-
-                if (target.containsKey(token)) {
-                    value = target.get(token) + 1;
-                } else {
-                    value = 1;
-                }
-
-                target.put(token, value);
-            }
-
-            if (endingLine) {
-                header = "";
-            }
-        }
-    }
-
-    /**
-     * Parses a stream into tokens, and returns a Set of the unique tokens
-     * encountered.
-     *
-     * @param stream
-     * @return Set
-     */
-    private Set<String> parse(Reader stream) throws java.io.IOException {
-        Set<String> tokens = new HashSet<>();
-        String token;
-        String header = "";
-
-        // Build a Map of tokens encountered.
-        while ((token = nextToken(stream)) != null) {
-            boolean endingLine = false;
-            if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
-                endingLine = true;
-                token = token.substring(0, token.length() - 1);
-            }
-
-            if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
-                if (token.equals("From:") || token.equals("Return-Path:") || token.equals("Subject:") || token.equals("To:")) {
-                    header = token;
-                    if (!endingLine) {
-                        continue;
-                    }
-                }
-
-                token = header + token;
-
-                tokens.add(token);
-            }
-
-            if (endingLine) {
-                header = "";
-            }
-        }
-
-        // Return the unique set of tokens encountered.
-        return tokens;
-    }
-
-    private String nextToken(Reader reader) throws java.io.IOException {
-        StringBuilder token = new StringBuilder();
-        int i;
-        char ch;
-        char ch2;
-        boolean previousWasDigit = false;
-        boolean tokenCharFound = false;
-
-        if (!reader.ready()) {
-            return null;
-        }
-
-        while ((i = reader.read()) != -1) {
-
-            ch = (char) i;
-
-            if (ch == ':') {
-                String tokenString = token.toString() + ':';
-                if (tokenString.equals("From:") || tokenString.equals("Return-Path:") || tokenString.equals("Subject:") || tokenString.equals("To:")) {
-                    return tokenString;
-                }
-            }
-
-            if (Character.isLetter(ch) || ch == '-' || ch == '$' || ch == '€'
-                    || ch == '!' || ch == '\'') {
-                tokenCharFound = true;
-                previousWasDigit = false;
-                token.append(ch);
-            } else if (Character.isDigit(ch)) {
-                tokenCharFound = true;
-                previousWasDigit = true;
-                token.append(ch);
-            } else if (previousWasDigit && (ch == '.' || ch == ',')) {
-                reader.mark(1);
-                previousWasDigit = false;
-                i = reader.read();
-                if (i == -1) {
-                    break;
-                }
-                ch2 = (char) i;
-                if (Character.isDigit(ch2)) {
-                    tokenCharFound = true;
-                    previousWasDigit = true;
-                    token.append(ch);
-                    token.append(ch2);
-                } else {
-                    reader.reset();
-                    break;
-                }
-            } else if (ch == '\r') {
-                // cr found, ignore
-            } else if (ch == '\n') {
-                // eol found
-                tokenCharFound = true;
-                previousWasDigit = false;
-                token.append(ch);
-                break;
-            } else if (tokenCharFound) {
-                break;
-            }
-        }
-
-        if (tokenCharFound) {
-            // System.out.println("Token read: " + token);
-            return token.toString();
-        } else {
-            return null;
-        }
-    }
-
-    /**
-     * Compute the probability that "token" is SPAM.
-     *
-     * @param token
-     * @return The probability that the token occurs within spam.
-     */
-    private double computeProbability(String token) {
-        double hamFactor = 0;
-        double spamFactor = 0;
-
-        boolean foundInHam = false;
-        boolean foundInSpam = false;
-
-        double minThreshold = 0.01;
-        double maxThreshold = 0.99;
-
-        if (hamTokenCounts.containsKey(token)) {
-            foundInHam = true;
-        }
-
-        if (spamTokenCounts.containsKey(token)) {
-            foundInSpam = true;
-        }
-
-        if (foundInHam) {
-            hamFactor = 2 * hamTokenCounts.get(token).doubleValue();
-            if (!foundInSpam) {
-                minThreshold = (hamFactor > 20) ? 0.0001 : 0.0002;
-            }
-        }
-
-        if (foundInSpam) {
-            spamFactor = spamTokenCounts.get(token).doubleValue();
-            if (!foundInHam) {
-                maxThreshold = (spamFactor > 10) ? 0.9999 : 0.9998;
-            }
-        }
-
-        if ((hamFactor + spamFactor) < 5) {
-            // This token hasn't been seen enough.
-            return 0.4;
-        }
-
-        double spamFreq = Math.min(1.0, spamFactor / spamMessageCount);
-        double hamFreq = Math.min(1.0, hamFactor / hamMessageCount);
-
-        return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));
-    }
-
-    /**
-     * Returns a SortedSet of TokenProbabilityStrength built from the Corpus and
-     * the tokens passed in the "tokens" Set. The ordering is from the highest
-     * strength to the lowest strength.
-     *
-     * @param tokens
-     * @param workCorpus
-     * @return SortedSet of TokenProbabilityStrength objects.
-     */
-    private SortedSet<TokenProbabilityStrength> getTokenProbabilityStrengths(Set<String> tokens, Map<String, Double> workCorpus) {
-        // Convert to a SortedSet of token probability strengths.
-        SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths = new TreeSet<>();
-
-        for (String token : tokens) {
-            TokenProbabilityStrength tps = new TokenProbabilityStrength();
-
-            tps.token = token;
-
-            if (workCorpus.containsKey(tps.token)) {
-                tps.strength = Math.abs(0.5 - workCorpus.get(tps.token));
-            } else {
-                // This token has never been seen before,
-                // we'll give it initially the default probability.
-                Double corpusProbability = DEFAULT_TOKEN_PROBABILITY;
-                tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
-                boolean isTokenDegeneratedFound = false;
-
-                Collection<String> degeneratedTokens = buildDegenerated(tps.token);
-                Iterator<String> iDegenerated = degeneratedTokens.iterator();
-                String tokenDegenerated;
-                double strengthDegenerated;
-                while (iDegenerated.hasNext()) {
-                    tokenDegenerated = iDegenerated.next();
-                    if (workCorpus.containsKey(tokenDegenerated)) {
-                        Double probabilityTemp = workCorpus.get(tokenDegenerated);
-                        strengthDegenerated = Math.abs(0.5 - probabilityTemp);
-                        if (strengthDegenerated > tps.strength) {
-                            isTokenDegeneratedFound = true;
-                            tps.strength = strengthDegenerated;
-                            corpusProbability = probabilityTemp;
-                        }
-                    }
-                }
-                // to reduce memory usage, put in the corpus only if the
-                // probability is different from (stronger than) the default
-                if (isTokenDegeneratedFound) {
-                    synchronized (lock) {
-                        workCorpus.put(tps.token, corpusProbability);
-                    }
-                }
-            }
-
-            tokenProbabilityStrengths.add(tps);
-        }
-
-        return tokenProbabilityStrengths;
-    }
-
-    private Collection<String> buildDegenerated(String fullToken) {
-        ArrayList<String> tokens = new ArrayList<>();
-        String header;
-        String token;
-        String tokenLower;
-
-        // look for a header string termination
-        int headerEnd = fullToken.indexOf(':');
-        if (headerEnd >= 0) {
-            header = fullToken.substring(0, headerEnd);
-            token = fullToken.substring(headerEnd);
-        } else {
-            header = "";
-            token = fullToken;
-        }
-
-        // prepare a version of the token containing all lower case (for
-        // performance reasons)
-        tokenLower = token.toLowerCase(Locale.US);
-
-        int end = token.length();
-        do {
-            if (!token.substring(0, end).equals(tokenLower.substring(0, end))) {
-                tokens.add(header + tokenLower.substring(0, end));
-                if (header.length() > 0) {
-                    tokens.add(tokenLower.substring(0, end));
-                }
-            }
-            if (end > 1 && token.charAt(0) >= 'A' && token.charAt(0) <= 'Z') {
-                tokens.add(header + token.charAt(0) + tokenLower.substring(1, end));
-                if (header.length() > 0) {
-                    tokens.add(token.charAt(0) + tokenLower.substring(1, end));
-                }
-            }
-
-            if (token.charAt(end - 1) != '!') {
-                break;
-            }
-
-            end--;
-
-            tokens.add(header + token.substring(0, end));
-            if (header.length() > 0) {
-                tokens.add(token.substring(0, end));
-            }
-        } while (end > 0);
-
-        return tokens;
-    }
-
-    /**
-     * Compute the spamminess probability of the interesting tokens in the
-     * tokenProbabilities SortedSet.
-     *
-     * @param tokenProbabilityStrengths
-     * @param workCorpus
-     * @return Computed spamminess.
-     */
-    private double computeOverallProbability(SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths, Map<String, Double> workCorpus) {
-        double p = 1.0;
-        double np = 1.0;
-        double tempStrength = 0.5;
-        int count = MAX_INTERESTING_TOKENS;
-        Iterator<TokenProbabilityStrength> iterator = tokenProbabilityStrengths.iterator();
-        while ((iterator.hasNext()) && (count-- > 0 || tempStrength >= INTERESTINGNESS_THRESHOLD)) {
-            TokenProbabilityStrength tps = iterator.next();
-            tempStrength = tps.strength;
-
-            // System.out.println(tps);
-
-            double theDoubleValue = DEFAULT_TOKEN_PROBABILITY; // initialize it
-            // to the default
-            Double theDoubleObject = workCorpus.get(tps.token);
-            // if either the original token or a degeneration was found use the
-            // double value, otherwise use the default
-            if (theDoubleObject != null) {
-                theDoubleValue = theDoubleObject;
-            }
-            p *= theDoubleValue;
-            np *= (1.0 - theDoubleValue);
-            // System.out.println("Token " + tps + ", p=" + theDoubleValue +
-            // ", overall p=" + p / (p + np));
-        }
-
-        return (p / (p + np));
-    }
-
-    private boolean allDigits(String s) {
-        return IntStream.range(0, s.length())
-            .allMatch(i -> Character.isDigit(s.charAt(i)));
-    }
-}
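
The Javadoc above spells out the intended workflow of the removed class: feed training messages through addHam(Reader) and addSpam(Reader), call buildCorpus() to derive the token probabilities, then score new text with computeSpamProbability(Reader). A minimal sketch of that workflow against the removed API; the file names are hypothetical:

    import java.io.FileReader;
    import java.io.IOException;

    import org.apache.james.util.bayesian.BayesianAnalyzer;

    public class BayesianAnalyzerSketch {
        public static void main(String[] args) throws IOException {
            BayesianAnalyzer analyzer = new BayesianAnalyzer();

            // Train on one message per call; each call bumps the matching message counter.
            try (FileReader ham = new FileReader("ham.eml");      // hypothetical paths
                 FileReader spam = new FileReader("spam.eml")) {
                analyzer.addHam(ham);
                analyzer.addSpam(spam);
            }

            // Derive the final token/probability corpus from the accumulated counts.
            analyzer.buildCorpus();

            // Score an unseen message; the Javadoc treats 0.9 or above as a spam indicator.
            try (FileReader candidate = new FileReader("incoming.eml")) {
                System.out.println("Spam probability: " + analyzer.computeSpamProbability(candidate));
            }
        }
    }
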
diff --git a/server/container/util/src/main/java/org/apache/james/util/bayesian/JDBCBayesianAnalyzer.java b/server/container/util/src/main/java/org/apache/james/util/bayesian/JDBCBayesianAnalyzer.java
deleted file mode 100644
index 60dbd9d..0000000
--- a/server/container/util/src/main/java/org/apache/james/util/bayesian/JDBCBayesianAnalyzer.java
+++ /dev/null
@@ -1,381 +0,0 @@
-/****************************************************************
- * Licensed to the Apache Software Foundation (ASF) under one   *
- * or more contributor license agreements.  See the NOTICE file *
- * distributed with this work for additional information        *
- * regarding copyright ownership.  The ASF licenses this file   *
- * to you under the Apache License, Version 2.0 (the            *
- * "License"); you may not use this file except in compliance   *
- * with the License.  You may obtain a copy of the License at   *
- *                                                              *
- *   http://www.apache.org/licenses/LICENSE-2.0                 *
- *                                                              *
- * Unless required by applicable law or agreed to in writing,   *
- * software distributed under the License is distributed on an  *
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
- * KIND, either express or implied.  See the License for the    *
- * specific language governing permissions and limitations      *
- * under the License.                                           *
- ****************************************************************/
-
-package org.apache.james.util.bayesian;
-
-import java.io.File;
-import java.sql.Connection;
-import java.sql.DatabaseMetaData;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.james.util.sql.JDBCUtil;
-import org.apache.james.util.sql.SqlResources;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Manages the persistence of the spam bayesian analysis corpus using a JDBC
- * database.
- * 
- * <p>
- * This class extends BayesianAnalyzer and persists the ham/spam token and
- * message counts through the SQL statements defined in sqlResources.xml.
- * </p>
- * 
- * @since 2.3.0
- */
-
-public class JDBCBayesianAnalyzer extends BayesianAnalyzer {
-
-    private static final Logger LOGGER = LoggerFactory.getLogger(JDBCBayesianAnalyzer.class);
-
-    /** Public object representing a lock on database activity. */
-    public static final String DATABASE_LOCK = "database lock";
-
-    /**
-     * The JDBCUtil helper class
-     */
-    private final JDBCUtil theJDBCUtil = new JDBCUtil();
-
-    /** Contains all of the sql strings for this component. */
-    private final SqlResources sqlQueries = new SqlResources();
-
-    /** Holds value of property sqlFileName. */
-    private String sqlFileName;
-
-    /** Holds value of property sqlParameters. */
-    private Map<String, String> sqlParameters = new HashMap<>();
-
-    /** Holds value of property lastDatabaseUpdateTime. */
-    private static long lastDatabaseUpdateTime;
-
-    /**
-     * Getter for property sqlFileName.
-     * 
-     * @return Value of property sqlFileName.
-     */
-    public String getSqlFileName() {
-
-        return this.sqlFileName;
-    }
-
-    /**
-     * Setter for property sqlFileName.
-     * 
-     * @param sqlFileName
-     *            New value of property sqlFileName.
-     */
-    public void setSqlFileName(String sqlFileName) {
-
-        this.sqlFileName = sqlFileName;
-    }
-
-    /**
-     * Getter for property sqlParameters.
-     * 
-     * @return Value of property sqlParameters.
-     */
-    public Map<String, String> getSqlParameters() {
-
-        return this.sqlParameters;
-    }
-
-    /**
-     * Setter for property sqlParameters.
-     * 
-     * @param sqlParameters
-     *            New value of property sqlParameters.
-     */
-    public void setSqlParameters(Map<String, String> sqlParameters) {
-
-        this.sqlParameters = sqlParameters;
-    }
-
-    /**
-     * Getter for static lastDatabaseUpdateTime.
-     * 
-     * @return Value of property lastDatabaseUpdateTime.
-     */
-    public static long getLastDatabaseUpdateTime() {
-
-        return lastDatabaseUpdateTime;
-    }
-
-    /**
-     * Sets static lastDatabaseUpdateTime to System.currentTimeMillis().
-     */
-    public static void touchLastDatabaseUpdateTime() {
-
-        lastDatabaseUpdateTime = System.currentTimeMillis();
-    }
-
-    /**
-     * Default constructor.
-     */
-    public JDBCBayesianAnalyzer() {
-    }
-
-    /**
-     * Loads the token frequencies from the database.
-     * 
-     * @param conn
-     *            The connection for accessing the database
-     * @throws SQLException
-     *             If a database error occurs
-     */
-    public void loadHamNSpam(Connection conn) throws java.sql.SQLException {
-        PreparedStatement pstmt = null;
-        ResultSet rs = null;
-
-        try {
-            pstmt = conn.prepareStatement(sqlQueries.getSqlString("selectHamTokens", true));
-            rs = pstmt.executeQuery();
-
-            Map<String, Integer> ham = getHamTokenCounts();
-            while (rs.next()) {
-                String token = rs.getString(1);
-                int count = rs.getInt(2);
-                // to reduce memory, use the token only if the count is > 1
-                if (count > 1) {
-                    ham.put(token, count);
-                }
-            }
-            // Verbose.
-            LOGGER.debug("Ham tokens count: {}", ham.size());
-
-            rs.close();
-            pstmt.close();
-
-            // Get the spam tokens/counts.
-            pstmt = conn.prepareStatement(sqlQueries.getSqlString("selectSpamTokens", true));
-            rs = pstmt.executeQuery();
-
-            Map<String, Integer> spam = getSpamTokenCounts();
-            while (rs.next()) {
-                String token = rs.getString(1);
-                int count = rs.getInt(2);
-                // to reduce memory, use the token only if the count is > 1
-                if (count > 1) {
-                    spam.put(token, count);
-                }
-            }
-
-            // Verbose.
-            LOGGER.error("Spam tokens count: {}", spam.size());
-
-            rs.close();
-            pstmt.close();
-
-            // Get the ham/spam message counts.
-            pstmt = conn.prepareStatement(sqlQueries.getSqlString("selectMessageCounts", true));
-            rs = pstmt.executeQuery();
-            if (rs.next()) {
-                setHamMessageCount(rs.getInt(1));
-                setSpamMessageCount(rs.getInt(2));
-            }
-
-            rs.close();
-            pstmt.close();
-
-        } finally {
-            if (rs != null) {
-                try {
-                    rs.close();
-                } catch (java.sql.SQLException se) {
-                    LOGGER.info("Exception ignored", se);
-                }
-
-                rs = null;
-            }
-
-            if (pstmt != null) {
-                try {
-                    pstmt.close();
-                } catch (java.sql.SQLException se) {
-                    LOGGER.info("Exception ignored", se);
-                }
-
-                pstmt = null;
-            }
-        }
-    }
-
-    /**
-     * Updates the database with new "ham" token frequencies.
-     * 
-     * @param conn
-     *            The connection for accessing the database
-     * @throws SQLException
-     *             If a database error occurs
-     */
-    public void updateHamTokens(Connection conn) throws java.sql.SQLException {
-        updateTokens(conn, getHamTokenCounts(), sqlQueries.getSqlString("insertHamToken", true), sqlQueries.getSqlString("updateHamToken", true));
-
-        setMessageCount(conn, sqlQueries.getSqlString("updateHamMessageCounts", true), getHamMessageCount());
-    }
-
-    /**
-     * Updates the database with new "spam" token frequencies.
-     * 
-     * @param conn
-     *            The connection for accessing the database
-     * @throws SQLException
-     *             If a database error occurs
-     */
-    public void updateSpamTokens(Connection conn) throws java.sql.SQLException {
-        updateTokens(conn, getSpamTokenCounts(), sqlQueries.getSqlString("insertSpamToken", true), sqlQueries.getSqlString("updateSpamToken", true));
-
-        setMessageCount(conn, sqlQueries.getSqlString("updateSpamMessageCounts", true), getSpamMessageCount());
-    }
-
-    /**
-     * Reset all trained data
-     * 
-     * @param conn
-     *            The connection for accessing the database
-     * @throws SQLException
-     *             If a database error occurs
-     */
-    public void resetData(Connection conn) throws SQLException {
-        deleteData(conn, sqlQueries.getSqlString("deleteHamTokens", true));
-        deleteData(conn, sqlQueries.getSqlString("deleteSpamTokens", true));
-        deleteData(conn, sqlQueries.getSqlString("deleteMessageCounts", true));
-    }
-
-    private void setMessageCount(Connection conn, String sqlStatement, int count) throws java.sql.SQLException {
-        // set the ham/spam message counts.
-        try (PreparedStatement init = conn.prepareStatement(sqlQueries.getSqlString("initializeMessageCounts", true));
-            PreparedStatement update = conn.prepareStatement(sqlStatement)) {
-            
-            update.setInt(1, count);
-            if (update.executeUpdate() == 0) {
-                init.executeUpdate();
-                update.executeUpdate();
-            }
-        }
-    }
-
-    private void updateTokens(Connection conn, Map<String, Integer> tokens, String insertSqlStatement, String updateSqlStatement) throws java.sql.SQLException {
-        
-        try (
-            // Used to insert new token entries.
-            PreparedStatement insert = conn.prepareStatement(insertSqlStatement);
-            // Used to update existing token entries.
-            PreparedStatement update = conn.prepareStatement(updateSqlStatement)) {
-            
-            for (Map.Entry<String, Integer> entry : tokens.entrySet()) {
-                update.setInt(1, entry.getValue());
-                update.setString(2, entry.getKey());
-
-                // If the update affected 0 (zero) rows, then the token
-                // hasn't been encountered before, and we need to add it
-                // to the corpus.
-                if (update.executeUpdate() == 0) {
-                    insert.setString(1, entry.getKey());
-                    insert.setInt(2, entry.getValue());
-
-                    insert.executeUpdate();
-                }
-            }
-        }
-    }
-
-    /**
-     * Initializes the sql query environment from the SqlResources file. Will
-     * look for conf/sqlResources.xml.
-     * 
-     * @param conn
-     *            The connection for accessing the database
-     * @param sqlFile
-     *            The sqlResources.xml file
-     * @throws Exception
-     *             If any error occurs
-     */
-    public void initSqlQueries(Connection conn, File sqlFile) throws Exception {
-        try {
-            if (conn.getAutoCommit()) {
-                conn.setAutoCommit(false);
-            }
-
-            sqlQueries.init(sqlFile, JDBCBayesianAnalyzer.class.getName(), conn, getSqlParameters());
-
-            checkTables(conn);
-        } finally {
-            theJDBCUtil.closeJDBCConnection(conn);
-        }
-    }
-
-    private void checkTables(Connection conn) throws SQLException {
-        // Need to check for the tables in whatever case the identifiers
-        // are stored; ask the DatabaseMetaData.
-        // Try UPPER, lower, and MixedCase, to see if the table is there.
-
-        boolean dbUpdated;
-
-        dbUpdated = createTable(conn, "hamTableName", "createHamTable");
-
-        dbUpdated = createTable(conn, "spamTableName", "createSpamTable");
-
-        dbUpdated = createTable(conn, "messageCountsTableName", "createMessageCountsTable");
-
-        // Commit our changes if necessary.
-        if (conn != null && dbUpdated && !conn.getAutoCommit()) {
-            conn.commit();
-            dbUpdated = false;
-        }
-
-    }
-
-    private boolean createTable(Connection conn, String tableNameSqlStringName, String createSqlStringName) throws SQLException {
-        String tableName = sqlQueries.getSqlString(tableNameSqlStringName, true);
-
-        DatabaseMetaData dbMetaData = conn.getMetaData();
-
-        // Try UPPER, lower, and MixedCase, to see if the table is there.
-        if (theJDBCUtil.tableExists(dbMetaData, tableName)) {
-            return false;
-        }
-
-        PreparedStatement createStatement = null;
-
-        try {
-            createStatement = conn.prepareStatement(sqlQueries.getSqlString(createSqlStringName, true));
-            createStatement.execute();
-
-            LOGGER.error("Created table '{}' using sqlResources string '{}'.", tableName, createSqlStringName);
-
-        } finally {
-            theJDBCUtil.closeJDBCStatement(createStatement);
-        }
-
-        return true;
-    }
-
-    private void deleteData(Connection conn, String deleteSqlStatement) throws SQLException {
-        try (// Used to delete the given data
-                PreparedStatement delete = conn.prepareStatement(deleteSqlStatement);
-                ) {
-            delete.executeUpdate();
-        }
-    }
-}
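
Finally, a hedged sketch of how the two removed classes and the deleted sqlResources.xml block fit together. The JDBC URL, file paths, and the synchronization on DATABASE_LOCK are illustrative assumptions; only the method names and signatures come from the code above.

    import java.io.File;
    import java.io.FileReader;
    import java.sql.Connection;
    import java.sql.DriverManager;

    import org.apache.james.util.bayesian.JDBCBayesianAnalyzer;

    public class JdbcBayesianSketch {
        public static void main(String[] args) throws Exception {
            JDBCBayesianAnalyzer analyzer = new JDBCBayesianAnalyzer();
            String url = "jdbc:derby:bayesdb;create=true"; // hypothetical database

            // Resolve the named SQL statements and create any missing tables.
            // Note: initSqlQueries closes the connection it is handed.
            analyzer.initSqlQueries(DriverManager.getConnection(url), new File("conf/sqlResources.xml"));

            // Feed one spam message and push the resulting token counts to the database
            // (updateSpamTokens adds the in-memory counts to the stored occurrences).
            try (Connection conn = DriverManager.getConnection(url);
                 FileReader spam = new FileReader("spam.eml")) {  // hypothetical message
                analyzer.addSpam(spam);
                synchronized (JDBCBayesianAnalyzer.DATABASE_LOCK) {
                    analyzer.updateSpamTokens(conn);
                    JDBCBayesianAnalyzer.touchLastDatabaseUpdateTime();
                }
            }

            // A scoring instance would instead call loadHamNSpam(conn), buildCorpus(),
            // and then computeSpamProbability(reader).
        }
    }
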

---------------------------------------------------------------------
To unsubscribe, e-mail: notifications-unsubscribe@james.apache.org
For additional commands, e-mail: notifications-help@james.apache.org