You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@james.apache.org by bt...@apache.org on 2021/07/06 01:27:06 UTC
[james-project] branch master updated: [REFACTORING] Remove unused
BayesianAnalyzer and related class (#526)
This is an automated email from the ASF dual-hosted git repository.
btellier pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/james-project.git
The following commit(s) were added to refs/heads/master by this push:
new 0b1d460 [REFACTORING] Remove unused BayesianAnalyzer and related class (#526)
0b1d460 is described below
commit 0b1d460c5e7a00ebf2eb85a2fcef4f736fc5e8c1
Author: Tellier Benoit <bt...@linagora.com>
AuthorDate: Tue Jul 6 08:26:56 2021 +0700
[REFACTORING] Remove unused BayesianAnalyzer and related class (#526)
---
.../spring-app/src/main/resources/sqlResources.xml | 136 ----
.../james/util/bayesian/BayesianAnalyzer.java | 714 ---------------------
.../james/util/bayesian/JDBCBayesianAnalyzer.java | 381 -----------
3 files changed, 1231 deletions(-)
diff --git a/server/apps/spring-app/src/main/resources/sqlResources.xml b/server/apps/spring-app/src/main/resources/sqlResources.xml
index 286a994..07f7c7b 100644
--- a/server/apps/spring-app/src/main/resources/sqlResources.xml
+++ b/server/apps/spring-app/src/main/resources/sqlResources.xml
@@ -335,142 +335,6 @@
</sql>
</sqlDefs>
-
-<!-- SQL statements to support the BayesianAnalysis and the BayesianAnalysisFeeder mailets -->
-<!-- -->
- <sqlDefs name="org.apache.james.util.bayesian.JDBCBayesianAnalyzer">
-
- <sql name="hamTableName">bayesiananalysis_ham</sql>
- <sql name="spamTableName">bayesiananalysis_spam</sql>
- <sql name="messageCountsTableName">bayesiananalysis_messagecounts</sql>
-
- <!-- Statements used to retrieve the message counts. -->
- <sql name="selectMessageCounts">SELECT HAMCOUNT, SPAMCOUNT FROM bayesiananalysis_messagecounts</sql>
-
- <!-- Statements used to initialize the message counts. -->
- <sql name="initializeMessageCounts">INSERT INTO bayesiananalysis_messagecounts (HAMCOUNT, SPAMCOUNT) VALUES (0,0)</sql>
-
- <!-- Statements used to update the ham message counts. -->
- <sql name="updateHamMessageCounts">UPDATE bayesiananalysis_messagecounts SET HAMCOUNT=(HAMCOUNT + ?)</sql>
-
- <!-- Statements used to update the spam message counts. -->
- <sql name="updateSpamMessageCounts">UPDATE bayesiananalysis_messagecounts SET SPAMCOUNT=(SPAMCOUNT + ?)</sql>
-
- <!-- Statements used to retrieve the ham token counts. -->
- <sql name="selectHamTokens">SELECT TOKEN, OCCURRENCES FROM bayesiananalysis_ham</sql>
-
- <!-- Statements used to retrieve the spam token counts. -->
- <sql name="selectSpamTokens">SELECT TOKEN, OCCURRENCES FROM bayesiananalysis_spam</sql>
-
- <!-- Statements used to insert the ham token counts. -->
- <sql name="insertHamToken">INSERT INTO bayesiananalysis_ham (TOKEN, OCCURRENCES) VALUES (?,?)</sql>
-
- <!-- Statements used to insert the spam token counts. -->
- <sql name="insertSpamToken">INSERT INTO bayesiananalysis_spam (TOKEN, OCCURRENCES) VALUES (?,?)</sql>
-
- <!-- Statements used to update the ham token counts. -->
- <sql name="updateHamToken">UPDATE bayesiananalysis_ham SET OCCURRENCES=(OCCURRENCES + ?) WHERE (TOKEN=?)</sql>
-
- <!-- Statements used to update the spam token counts. -->
- <sql name="updateSpamToken">UPDATE bayesiananalysis_spam SET OCCURRENCES=(OCCURRENCES + ?) WHERE (TOKEN=?)</sql>
-
- <!-- Statements used to delete ham tokens. -->
- <sql name="deleteHamTokens">DELETE FROM bayesiananalysis_ham</sql>
-
- <!-- Statements used to delete spam tokens. -->
- <sql name="deleteSpamTokens">DELETE FROM bayesiananalysis_spam</sql>
-
- <!-- Statements used to delete message counts. -->
- <sql name="deleteMessageCounts">DELETE FROM bayesiananalysis_messagecounts</sql>
-
-
- <!-- Statements used to create the "ham" table (the 'token' field must be case sensitive). -->
- <sql name="createHamTable" db="mysql">
- CREATE TABLE bayesiananalysis_ham (
- token varchar(128) binary NOT NULL default '',
- occurrences int(11) NOT NULL default '0',
- PRIMARY KEY (token)
- ) TYPE=InnoDB
- </sql>
- <sql name="createHamTable" db="mssql">
- CREATE TABLE [bayesiananalysis_ham] (
- [token] [varchar] (128) COLLATE Latin1_General_CS_AS NOT NULL,
- [occurrences] [int] NOT NULL default (0),
- PRIMARY KEY (token)
- )
- </sql>
- <sql name="createHamTable" db="derby">
- CREATE TABLE bayesiananalysis_ham (
- token varchar(128) NOT NULL,
- occurrences INTEGER NOT NULL default 0,
- PRIMARY KEY (token)
- )
- </sql>
- <sql name="createHamTable" db="postgresql">
- CREATE TABLE bayesiananalysis_ham (
- token varchar(128) NOT NULL,
- occurrences int NOT NULL default 0,
- PRIMARY KEY (token)
- )
- </sql>
-
- <!-- Statements used to create the "spam" table (the 'token' field must be case sensitive). -->
- <sql name="createSpamTable" db="mysql">
- CREATE TABLE bayesiananalysis_spam (
- token varchar(128) binary NOT NULL default '',
- occurrences int(11) NOT NULL default '0',
- PRIMARY KEY (token)
- ) TYPE=InnoDB
- </sql>
- <sql name="createSpamTable" db="mssql">
- CREATE TABLE [bayesiananalysis_spam] (
- [token] [varchar] (128) COLLATE Latin1_General_CS_AS NOT NULL,
- [occurrences] [int] NOT NULL default (0),
- PRIMARY KEY (token)
- )
- </sql>
- <sql name="createSpamTable" db="derby">
- CREATE TABLE bayesiananalysis_spam (
- token varchar (128) NOT NULL,
- occurrences INTEGER NOT NULL default 0,
- PRIMARY KEY (token)
- )
- </sql>
- <sql name="createSpamTable" db="postgresql">
- CREATE TABLE bayesiananalysis_spam (
- token varchar (128) NOT NULL,
- occurrences int NOT NULL default 0,
- PRIMARY KEY (token)
- )
- </sql>
-
- <!-- Statements used to create the "message counts" table. -->
- <sql name="createMessageCountsTable" db="mysql">
- CREATE TABLE bayesiananalysis_messagecounts (
- hamcount int(11) NOT NULL default '0',
- spamcount int(11) NOT NULL default '0'
- ) TYPE=InnoDB
- </sql>
- <sql name="createMessageCountsTable" db="mssql">
- CREATE TABLE [bayesiananalysis_messagecounts] (
- [hamcount] [int] NOT NULL default (0),
- [spamcount] [int] NOT NULL default (0)
- )
- </sql>
- <sql name="createMessageCountsTable" db="derby">
- CREATE TABLE bayesiananalysis_messagecounts (
- hamcount INTEGER NOT NULL default 0,
- spamcount INTEGER NOT NULL default 0
- )
- </sql>
- <sql name="createMessageCountsTable" db="postgresql">
- CREATE TABLE bayesiananalysis_messagecounts (
- hamcount int NOT NULL default 0,
- spamcount int NOT NULL default 0
- )
- </sql>
- </sqlDefs>
-
<!-- SQL statements to support the WhiteListManager mailet and the IsInWhiteList matcher -->
<!-- -->
<sqlDefs name="WhiteList">
diff --git a/server/container/util/src/main/java/org/apache/james/util/bayesian/BayesianAnalyzer.java b/server/container/util/src/main/java/org/apache/james/util/bayesian/BayesianAnalyzer.java
deleted file mode 100644
index bf6391d..0000000
--- a/server/container/util/src/main/java/org/apache/james/util/bayesian/BayesianAnalyzer.java
+++ /dev/null
@@ -1,714 +0,0 @@
-/****************************************************************
- * Licensed to the Apache Software Foundation (ASF) under one *
- * or more contributor license agreements. See the NOTICE file *
- * distributed with this work for additional information *
- * regarding copyright ownership. The ASF licenses this file *
- * to you under the Apache License, Version 2.0 (the *
- * "License"); you may not use this file except in compliance *
- * with the License. You may obtain a copy of the License at *
- * *
- * http://www.apache.org/licenses/LICENSE-2.0 *
- * *
- * Unless required by applicable law or agreed to in writing, *
- * software distributed under the License is distributed on an *
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
- * KIND, either express or implied. See the License for the *
- * specific language governing permissions and limitations *
- * under the License. *
- ****************************************************************/
-
-package org.apache.james.util.bayesian;
-
-import java.io.IOException;
-import java.io.Reader;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Set;
-import java.util.SortedSet;
-import java.util.TreeSet;
-import java.util.stream.IntStream;
-
-/**
- * <p>
- * Determines probability that text contains Spam.
- * </p>
- * <p/>
- * <p>
- * Based upon Paul Graham's <a href="http://www.paulgraham.com/spam.html">A Plan
- * for Spam</a>. Extended to Paul Graham's <a
- * href="http://paulgraham.com/better.html">Better Bayesian Filtering</a>.
- * </p>
- * <p/>
- * <p>
- * Sample method usage:
- * </p>
- * <p/>
- * <p>
- * Use: void addHam(Reader) and void addSpam(Reader)
- * <p/>
- * methods to build up the Maps of ham & spam tokens/occurrences. Both addHam
- * and addSpam assume they're reading one message at a time, if you feed more
- * than one message per call, be sure to adjust the appropriate message counter:
- * hamMessageCount or spamMessageCount.
- * <p/>
- * Then...
- * </p>
- * <p/>
- * <p>
- * Use: void buildCorpus()
- * <p/>
- * to build the final token/probabilities Map.
- * <p/>
- * Use your own methods for persistent storage of either the individual ham/spam
- * corpus & message counts, and/or the final corpus.
- * <p/>
- * Then you can...
- * </p>
- * <p/>
- * <p>
- * Use: double computeSpamProbability(Reader)
- * <p/>
- * to determine the probability that a particular text contains spam. A returned
- * result of 0.9 or above is an indicator that the text was spam.
- * </p>
- * <p/>
- * <p>
- * If you use persistent storage, use: void setCorpus(Map)
- * <p/>
- * before calling computeSpamProbability.
- * </p>
- *
- * @since 2.3.0
- */
-
-public class BayesianAnalyzer {
-
- private final Object lock = new Object();
-
- /**
- * Number of "interesting" tokens to use to compute overall spamminess
- * probability.
- */
- private static final int MAX_INTERESTING_TOKENS = 15;
-
- /**
- * Minimum probability distance from 0.5 to consider a token "interesting"
- * to use to compute overall spamminess probability.
- */
- private static final double INTERESTINGNESS_THRESHOLD = 0.46;
-
- /**
- * Default token probability to use when a token has not been encountered
- * before.
- */
- private static final double DEFAULT_TOKEN_PROBABILITY = 0.4;
-
- /**
- * Map of ham tokens and their occurrences.
- */
- private Map<String, Integer> hamTokenCounts = new HashMap<>();
-
- /**
- * Map of spam tokens and their occurrences.
- */
- private Map<String, Integer> spamTokenCounts = new HashMap<>();
-
- /**
- * Number of ham messages analyzed.
- */
- private int hamMessageCount = 0;
-
- /**
- * Number of spam messages analyzed.
- */
- private int spamMessageCount = 0;
-
- /**
- * Final token/probability corpus.
- */
- private Map<String, Double> corpus = new HashMap<>();
-
- /**
- * Inner class for managing Token Probability Strengths during the
- * computeSpamProbability phase.
- * <p/>
- * By probability <i>strength</i> we mean the absolute distance of a
- * probability from the middle value 0.5.
- * <p/>
- * It implements Comparable so that its sorting is automatic.
- */
- private static class TokenProbabilityStrength implements Comparable<TokenProbabilityStrength> {
- /**
- * Message token.
- */
- String token = null;
-
- /**
- * Token's computed probability strength.
- */
- double strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
-
- /**
- * Force the natural sort order for this object to be high-to-low.
- *
- * @param anotherTokenProbabilityStrength
- * A TokenProbabilityStrength instance to compare this
- * instance with.
- * @return The result of the comparison (before, equal, after).
- */
- @Override
- public final int compareTo(TokenProbabilityStrength anotherTokenProbabilityStrength) {
- int result = (int) ((anotherTokenProbabilityStrength.strength - strength) * 1000000);
- if (result == 0) {
- return this.token.compareTo(anotherTokenProbabilityStrength.token);
- } else {
- return result;
- }
- }
-
- /**
- * Simple toString() implementation, mostly for debugging purposes.
- *
- * @return String representation of this object.
- */
- public String toString() {
- StringBuilder sb = new StringBuilder(30);
-
- sb.append(token).append("=").append(strength);
-
- return sb.toString();
- }
- }
-
- /**
- * Basic class constructor.
- */
- public BayesianAnalyzer() {
- }
-
- /**
- * Public setter for the hamTokenCounts Map.
- *
- * @param hamTokenCounts The new ham Token counts Map.
- */
- public void setHamTokenCounts(Map<String, Integer> hamTokenCounts) {
- this.hamTokenCounts = hamTokenCounts;
- }
-
- /**
- * Public getter for the hamTokenCounts Map.
- */
- public Map<String, Integer> getHamTokenCounts() {
- return this.hamTokenCounts;
- }
-
- /**
- * Public setter for the spamTokenCounts Map.
- *
- * @param spamTokenCounts The new spam Token counts Map.
- */
- public void setSpamTokenCounts(Map<String, Integer> spamTokenCounts) {
- this.spamTokenCounts = spamTokenCounts;
- }
-
- /**
- * Public getter for the spamTokenCounts Map.
- */
- public Map<String, Integer> getSpamTokenCounts() {
- return this.spamTokenCounts;
- }
-
- /**
- * Public setter for spamMessageCount.
- *
- * @param spamMessageCount The new spam message count.
- */
- public void setSpamMessageCount(int spamMessageCount) {
- this.spamMessageCount = spamMessageCount;
- }
-
- /**
- * Public getter for spamMessageCount.
- */
- public int getSpamMessageCount() {
- return this.spamMessageCount;
- }
-
- /**
- * Public setter for hamMessageCount.
- *
- * @param hamMessageCount The new ham message count.
- */
- public void setHamMessageCount(int hamMessageCount) {
- this.hamMessageCount = hamMessageCount;
- }
-
- /**
- * Public getter for hamMessageCount.
- */
- public int getHamMessageCount() {
- return this.hamMessageCount;
- }
-
- /**
- * Clears all analysis repositories and counters.
- */
- public void clear() {
- corpus.clear();
-
- tokenCountsClear();
-
- hamMessageCount = 0;
- spamMessageCount = 0;
- }
-
- /**
- * Clears token counters.
- */
- public void tokenCountsClear() {
- hamTokenCounts.clear();
- spamTokenCounts.clear();
- }
-
- /**
- * Public setter for corpus.
- *
- * @param corpus The new corpus.
- */
- public void setCorpus(Map<String, Double> corpus) {
- this.corpus = corpus;
- }
-
- /**
- * Public getter for corpus.
- */
- public Map<String, Double> getCorpus() {
- return this.corpus;
- }
-
- /**
- * Builds the corpus from the existing ham & spam counts.
- */
- public void buildCorpus() {
- // Combine the known ham & spam tokens.
- Set<String> set = new HashSet<>(hamTokenCounts.size() + spamTokenCounts.size());
- set.addAll(hamTokenCounts.keySet());
- set.addAll(spamTokenCounts.keySet());
- Map<String, Double> tempCorpus = new HashMap<>(set.size());
-
- // Iterate through all the tokens and compute their new
- // individual probabilities.
- for (String token : set) {
- tempCorpus.put(token, computeProbability(token));
- }
- setCorpus(tempCorpus);
- }
-
- /**
- * Adds a message to the ham list.
- *
- * @param stream A reader stream on the ham message to analyze
- * @throws IOException If any error occurs
- */
- public void addHam(Reader stream) throws java.io.IOException {
- addTokenOccurrences(stream, hamTokenCounts);
- hamMessageCount++;
- }
-
- /**
- * Adds a message to the spam list.
- *
- * @param stream A reader stream on the spam message to analyze
- * @throws IOException If any error occurs
- */
- public void addSpam(Reader stream) throws java.io.IOException {
- addTokenOccurrences(stream, spamTokenCounts);
- spamMessageCount++;
- }
-
- /**
- * Computes the probability that the stream contains SPAM.
- *
- * @param stream The text to be analyzed for Spamminess.
- * @return A 0.0 - 1.0 probability
- * @throws IOException If any error occurs
- */
- public double computeSpamProbability(Reader stream) throws java.io.IOException {
- // Build a set of the tokens in the Stream.
- Set<String> tokens = parse(stream);
-
- // Get the corpus to use in this run
- // A new corpus may be being built in the meantime
- Map<String, Double> workCorpus = getCorpus();
-
- // Assign their probabilities from the Corpus (using an additional
- // calculation to determine spamminess).
- SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);
-
- // Compute and return the overall probability that the
- // stream is SPAM.
- return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
- }
-
- /**
- * Parses a stream into tokens, and updates the target Map with the
- * token/counts.
- *
- * @param stream
- * @param target
- */
- private void addTokenOccurrences(Reader stream, Map<String, Integer> target) throws java.io.IOException {
- String token;
- String header = "";
-
- // Update target with the tokens/count encountered.
- while ((token = nextToken(stream)) != null) {
- boolean endingLine = false;
- if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
- endingLine = true;
- token = token.substring(0, token.length() - 1);
- }
-
- if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
- if (token.equals("From:") || token.equals("Return-Path:") || token.equals("Subject:") || token.equals("To:")) {
- header = token;
- if (!endingLine) {
- continue;
- }
- }
-
- token = header + token;
-
- Integer value;
-
- if (target.containsKey(token)) {
- value = target.get(token) + 1;
- } else {
- value = 1;
- }
-
- target.put(token, value);
- }
-
- if (endingLine) {
- header = "";
- }
- }
- }
-
- /**
- * Parses a stream into tokens, and returns a Set of the unique tokens
- * encountered.
- *
- * @param stream
- * @return Set
- */
- private Set<String> parse(Reader stream) throws java.io.IOException {
- Set<String> tokens = new HashSet<>();
- String token;
- String header = "";
-
- // Build a Set of tokens encountered.
- while ((token = nextToken(stream)) != null) {
- boolean endingLine = false;
- if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
- endingLine = true;
- token = token.substring(0, token.length() - 1);
- }
-
- if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
- if (token.equals("From:") || token.equals("Return-Path:") || token.equals("Subject:") || token.equals("To:")) {
- header = token;
- if (!endingLine) {
- continue;
- }
- }
-
- token = header + token;
-
- tokens.add(token);
- }
-
- if (endingLine) {
- header = "";
- }
- }
-
- // Return the unique set of tokens encountered.
- return tokens;
- }
-
- private String nextToken(Reader reader) throws java.io.IOException {
- StringBuilder token = new StringBuilder();
- int i;
- char ch;
- char ch2;
- boolean previousWasDigit = false;
- boolean tokenCharFound = false;
-
- if (!reader.ready()) {
- return null;
- }
-
- while ((i = reader.read()) != -1) {
-
- ch = (char) i;
-
- if (ch == ':') {
- String tokenString = token.toString() + ':';
- if (tokenString.equals("From:") || tokenString.equals("Return-Path:") || tokenString.equals("Subject:") || tokenString.equals("To:")) {
- return tokenString;
- }
- }
-
- if (Character.isLetter(ch) || ch == '-' || ch == '$' || ch == '€'
- || ch == '!' || ch == '\'') {
- tokenCharFound = true;
- previousWasDigit = false;
- token.append(ch);
- } else if (Character.isDigit(ch)) {
- tokenCharFound = true;
- previousWasDigit = true;
- token.append(ch);
- } else if (previousWasDigit && (ch == '.' || ch == ',')) {
- reader.mark(1);
- previousWasDigit = false;
- i = reader.read();
- if (i == -1) {
- break;
- }
- ch2 = (char) i;
- if (Character.isDigit(ch2)) {
- tokenCharFound = true;
- previousWasDigit = true;
- token.append(ch);
- token.append(ch2);
- } else {
- reader.reset();
- break;
- }
- } else if (ch == '\r') {
- // cr found, ignore
- } else if (ch == '\n') {
- // eol found
- tokenCharFound = true;
- previousWasDigit = false;
- token.append(ch);
- break;
- } else if (tokenCharFound) {
- break;
- }
- }
-
- if (tokenCharFound) {
- // System.out.println("Token read: " + token);
- return token.toString();
- } else {
- return null;
- }
- }
-
- /**
- * Compute the probability that "token" is SPAM.
- *
- * @param token
- * @return The probability that the token occurs within spam.
- */
- private double computeProbability(String token) {
- double hamFactor = 0;
- double spamFactor = 0;
-
- boolean foundInHam = false;
- boolean foundInSpam = false;
-
- double minThreshold = 0.01;
- double maxThreshold = 0.99;
-
- if (hamTokenCounts.containsKey(token)) {
- foundInHam = true;
- }
-
- if (spamTokenCounts.containsKey(token)) {
- foundInSpam = true;
- }
-
- if (foundInHam) {
- hamFactor = 2 * hamTokenCounts.get(token).doubleValue();
- if (!foundInSpam) {
- minThreshold = (hamFactor > 20) ? 0.0001 : 0.0002;
- }
- }
-
- if (foundInSpam) {
- spamFactor = spamTokenCounts.get(token).doubleValue();
- if (!foundInHam) {
- maxThreshold = (spamFactor > 10) ? 0.9999 : 0.9998;
- }
- }
-
- if ((hamFactor + spamFactor) < 5) {
- // This token hasn't been seen enough.
- return 0.4;
- }
-
- double spamFreq = Math.min(1.0, spamFactor / spamMessageCount);
- double hamFreq = Math.min(1.0, hamFactor / hamMessageCount);
-
- return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));
- }
-
- /**
- * Returns a SortedSet of TokenProbabilityStrength built from the Corpus and
- * the tokens passed in the "tokens" Set. The ordering is from the highest
- * strength to the lowest strength.
- *
- * @param tokens
- * @param workCorpus
- * @return SortedSet of TokenProbabilityStrength objects.
- */
- private SortedSet<TokenProbabilityStrength> getTokenProbabilityStrengths(Set<String> tokens, Map<String, Double> workCorpus) {
- // Convert to a SortedSet of token probability strengths.
- SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths = new TreeSet<>();
-
- for (String token : tokens) {
- TokenProbabilityStrength tps = new TokenProbabilityStrength();
-
- tps.token = token;
-
- if (workCorpus.containsKey(tps.token)) {
- tps.strength = Math.abs(0.5 - workCorpus.get(tps.token));
- } else {
- // This token has never been seen before,
- // we'll give it initially the default probability.
- Double corpusProbability = DEFAULT_TOKEN_PROBABILITY;
- tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
- boolean isTokenDegeneratedFound = false;
-
- Collection<String> degeneratedTokens = buildDegenerated(tps.token);
- Iterator<String> iDegenerated = degeneratedTokens.iterator();
- String tokenDegenerated;
- double strengthDegenerated;
- while (iDegenerated.hasNext()) {
- tokenDegenerated = iDegenerated.next();
- if (workCorpus.containsKey(tokenDegenerated)) {
- Double probabilityTemp = workCorpus.get(tokenDegenerated);
- strengthDegenerated = Math.abs(0.5 - probabilityTemp);
- if (strengthDegenerated > tps.strength) {
- isTokenDegeneratedFound = true;
- tps.strength = strengthDegenerated;
- corpusProbability = probabilityTemp;
- }
- }
- }
- // to reduce memory usage, put in the corpus only if the
- // probability is different from (stronger than) the default
- if (isTokenDegeneratedFound) {
- synchronized (lock) {
- workCorpus.put(tps.token, corpusProbability);
- }
- }
- }
-
- tokenProbabilityStrengths.add(tps);
- }
-
- return tokenProbabilityStrengths;
- }
-
- private Collection<String> buildDegenerated(String fullToken) {
- ArrayList<String> tokens = new ArrayList<>();
- String header;
- String token;
- String tokenLower;
-
- // look for a header string termination
- int headerEnd = fullToken.indexOf(':');
- if (headerEnd >= 0) {
- header = fullToken.substring(0, headerEnd);
- token = fullToken.substring(headerEnd);
- } else {
- header = "";
- token = fullToken;
- }
-
- // prepare a version of the token containing all lower case (for
- // performance reasons)
- tokenLower = token.toLowerCase(Locale.US);
-
- int end = token.length();
- do {
- if (!token.substring(0, end).equals(tokenLower.substring(0, end))) {
- tokens.add(header + tokenLower.substring(0, end));
- if (header.length() > 0) {
- tokens.add(tokenLower.substring(0, end));
- }
- }
- if (end > 1 && token.charAt(0) >= 'A' && token.charAt(0) <= 'Z') {
- tokens.add(header + token.charAt(0) + tokenLower.substring(1, end));
- if (header.length() > 0) {
- tokens.add(token.charAt(0) + tokenLower.substring(1, end));
- }
- }
-
- if (token.charAt(end - 1) != '!') {
- break;
- }
-
- end--;
-
- tokens.add(header + token.substring(0, end));
- if (header.length() > 0) {
- tokens.add(token.substring(0, end));
- }
- } while (end > 0);
-
- return tokens;
- }
-
- /**
- * Compute the spamminess probability of the interesting tokens in the
- * tokenProbabilityStrengths SortedSet.
- *
- * @param tokenProbabilityStrengths
- * @param workCorpus
- * @return Computed spamminess.
- */
- private double computeOverallProbability(SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths, Map<String, Double> workCorpus) {
- double p = 1.0;
- double np = 1.0;
- double tempStrength = 0.5;
- int count = MAX_INTERESTING_TOKENS;
- Iterator<TokenProbabilityStrength> iterator = tokenProbabilityStrengths.iterator();
- while ((iterator.hasNext()) && (count-- > 0 || tempStrength >= INTERESTINGNESS_THRESHOLD)) {
- TokenProbabilityStrength tps = iterator.next();
- tempStrength = tps.strength;
-
- // System.out.println(tps);
-
- double theDoubleValue = DEFAULT_TOKEN_PROBABILITY; // initialize it
- // to the default
- Double theDoubleObject = workCorpus.get(tps.token);
- // if either the original token or a degeneration was found use the
- // double value, otherwise use the default
- if (theDoubleObject != null) {
- theDoubleValue = theDoubleObject;
- }
- p *= theDoubleValue;
- np *= (1.0 - theDoubleValue);
- // System.out.println("Token " + tps + ", p=" + theDoubleValue +
- // ", overall p=" + p / (p + np));
- }
-
- return (p / (p + np));
- }
-
- private boolean allDigits(String s) {
- return IntStream.range(0, s.length())
- .allMatch(i -> Character.isDigit(s.charAt(i)));
- }
-}
diff --git a/server/container/util/src/main/java/org/apache/james/util/bayesian/JDBCBayesianAnalyzer.java b/server/container/util/src/main/java/org/apache/james/util/bayesian/JDBCBayesianAnalyzer.java
deleted file mode 100644
index 60dbd9d..0000000
--- a/server/container/util/src/main/java/org/apache/james/util/bayesian/JDBCBayesianAnalyzer.java
+++ /dev/null
@@ -1,381 +0,0 @@
-/****************************************************************
- * Licensed to the Apache Software Foundation (ASF) under one *
- * or more contributor license agreements. See the NOTICE file *
- * distributed with this work for additional information *
- * regarding copyright ownership. The ASF licenses this file *
- * to you under the Apache License, Version 2.0 (the *
- * "License"); you may not use this file except in compliance *
- * with the License. You may obtain a copy of the License at *
- * *
- * http://www.apache.org/licenses/LICENSE-2.0 *
- * *
- * Unless required by applicable law or agreed to in writing, *
- * software distributed under the License is distributed on an *
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
- * KIND, either express or implied. See the License for the *
- * specific language governing permissions and limitations *
- * under the License. *
- ****************************************************************/
-
-package org.apache.james.util.bayesian;
-
-import java.io.File;
-import java.sql.Connection;
-import java.sql.DatabaseMetaData;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.james.util.sql.JDBCUtil;
-import org.apache.james.util.sql.SqlResources;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Manages the persistence of the spam bayesian analysis corpus using a JDBC
- * database.
- *
- * <p>
- * This class is abstract to allow implementations to take advantage of
- * different logging capabilities/interfaces in different parts of the code.
- * </p>
- *
- * @since 2.3.0
- */
-
-public class JDBCBayesianAnalyzer extends BayesianAnalyzer {
-
- private static final Logger LOGGER = LoggerFactory.getLogger(JDBCBayesianAnalyzer.class);
-
- /** Public object representing a lock on database activity. */
- public static final String DATABASE_LOCK = "database lock";
-
- /**
- * The JDBCUtil helper class
- */
- private final JDBCUtil theJDBCUtil = new JDBCUtil();
-
- /** Contains all of the sql strings for this component. */
- private final SqlResources sqlQueries = new SqlResources();
-
- /** Holds value of property sqlFileName. */
- private String sqlFileName;
-
- /** Holds value of property sqlParameters. */
- private Map<String, String> sqlParameters = new HashMap<>();
-
- /** Holds value of property lastDatabaseUpdateTime. */
- private static long lastDatabaseUpdateTime;
-
- /**
- * Getter for property sqlFileName.
- *
- * @return Value of property sqlFileName.
- */
- public String getSqlFileName() {
-
- return this.sqlFileName;
- }
-
- /**
- * Setter for property sqlFileName.
- *
- * @param sqlFileName
- * New value of property sqlFileName.
- */
- public void setSqlFileName(String sqlFileName) {
-
- this.sqlFileName = sqlFileName;
- }
-
- /**
- * Getter for property sqlParameters.
- *
- * @return Value of property sqlParameters.
- */
- public Map<String, String> getSqlParameters() {
-
- return this.sqlParameters;
- }
-
- /**
- * Setter for property sqlParameters.
- *
- * @param sqlParameters
- * New value of property sqlParameters.
- */
- public void setSqlParameters(Map<String, String> sqlParameters) {
-
- this.sqlParameters = sqlParameters;
- }
-
- /**
- * Getter for static lastDatabaseUpdateTime.
- *
- * @return Value of property lastDatabaseUpdateTime.
- */
- public static long getLastDatabaseUpdateTime() {
-
- return lastDatabaseUpdateTime;
- }
-
- /**
- * Sets static lastDatabaseUpdateTime to System.currentTimeMillis().
- */
- public static void touchLastDatabaseUpdateTime() {
-
- lastDatabaseUpdateTime = System.currentTimeMillis();
- }
-
- /**
- * Default constructor.
- */
- public JDBCBayesianAnalyzer() {
- }
-
- /**
- * Loads the token frequencies from the database.
- *
- * @param conn
- * The connection for accessing the database
- * @throws SQLException
- * If a database error occurs
- */
- public void loadHamNSpam(Connection conn) throws java.sql.SQLException {
- PreparedStatement pstmt = null;
- ResultSet rs = null;
-
- try {
- pstmt = conn.prepareStatement(sqlQueries.getSqlString("selectHamTokens", true));
- rs = pstmt.executeQuery();
-
- Map<String, Integer> ham = getHamTokenCounts();
- while (rs.next()) {
- String token = rs.getString(1);
- int count = rs.getInt(2);
- // to reduce memory, use the token only if the count is > 1
- if (count > 1) {
- ham.put(token, count);
- }
- }
- // Verbose.
- LOGGER.debug("Ham tokens count: {}", ham.size());
-
- rs.close();
- pstmt.close();
-
- // Get the spam tokens/counts.
- pstmt = conn.prepareStatement(sqlQueries.getSqlString("selectSpamTokens", true));
- rs = pstmt.executeQuery();
-
- Map<String, Integer> spam = getSpamTokenCounts();
- while (rs.next()) {
- String token = rs.getString(1);
- int count = rs.getInt(2);
- // to reduce memory, use the token only if the count is > 1
- if (count > 1) {
- spam.put(token, count);
- }
- }
-
- // Verbose.
- LOGGER.error("Spam tokens count: {}", spam.size());
-
- rs.close();
- pstmt.close();
-
- // Get the ham/spam message counts.
- pstmt = conn.prepareStatement(sqlQueries.getSqlString("selectMessageCounts", true));
- rs = pstmt.executeQuery();
- if (rs.next()) {
- setHamMessageCount(rs.getInt(1));
- setSpamMessageCount(rs.getInt(2));
- }
-
- rs.close();
- pstmt.close();
-
- } finally {
- if (rs != null) {
- try {
- rs.close();
- } catch (java.sql.SQLException se) {
- LOGGER.info("Exception ignored", se);
- }
-
- rs = null;
- }
-
- if (pstmt != null) {
- try {
- pstmt.close();
- } catch (java.sql.SQLException se) {
- LOGGER.info("Exception ignored", se);
- }
-
- pstmt = null;
- }
- }
- }
-
- /**
- * Updates the database with new "ham" token frequencies.
- *
- * @param conn
- * The connection for accessing the database
- * @throws SQLException
- * If a database error occurs
- */
- public void updateHamTokens(Connection conn) throws java.sql.SQLException {
- updateTokens(conn, getHamTokenCounts(), sqlQueries.getSqlString("insertHamToken", true), sqlQueries.getSqlString("updateHamToken", true));
-
- setMessageCount(conn, sqlQueries.getSqlString("updateHamMessageCounts", true), getHamMessageCount());
- }
-
- /**
- * Updates the database with new "spam" token frequencies.
- *
- * @param conn
- * The connection for accessing the database
- * @throws SQLException
- * If a database error occurs
- */
- public void updateSpamTokens(Connection conn) throws java.sql.SQLException {
- updateTokens(conn, getSpamTokenCounts(), sqlQueries.getSqlString("insertSpamToken", true), sqlQueries.getSqlString("updateSpamToken", true));
-
- setMessageCount(conn, sqlQueries.getSqlString("updateSpamMessageCounts", true), getSpamMessageCount());
- }
-
- /**
- * Reset all trained data
- *
- * @param conn
- * The connection for accessing the database
- * @throws SQLException
- * If a database error occours
- */
- public void resetData(Connection conn) throws SQLException {
- deleteData(conn, sqlQueries.getSqlString("deleteHamTokens", true));
- deleteData(conn, sqlQueries.getSqlString("deleteSpamTokens", true));
- deleteData(conn, sqlQueries.getSqlString("deleteMessageCounts", true));
- }
-
- private void setMessageCount(Connection conn, String sqlStatement, int count) throws java.sql.SQLException {
- // set the ham/spam message counts.
- try (PreparedStatement init = conn.prepareStatement(sqlQueries.getSqlString("initializeMessageCounts", true));
- PreparedStatement update = conn.prepareStatement(sqlStatement)) {
-
- update.setInt(1, count);
- if (update.executeUpdate() == 0) {
- init.executeUpdate();
- update.executeUpdate();
- }
- }
- }
-
- private void updateTokens(Connection conn, Map<String, Integer> tokens, String insertSqlStatement, String updateSqlStatement) throws java.sql.SQLException {
-
- try (
- // Used to insert new token entries.
- PreparedStatement insert = conn.prepareStatement(insertSqlStatement);
- // Used to update existing token entries.
- PreparedStatement update = conn.prepareStatement(updateSqlStatement)) {
-
- for (Map.Entry<String, Integer> entry : tokens.entrySet()) {
- update.setInt(1, entry.getValue());
- update.setString(2, entry.getKey());
-
- // If the update affected 0 (zero) rows, then the token hasn't
- // been
- // encountered before, and we need to add it to the corpus.
- if (update.executeUpdate() == 0) {
- insert.setString(1, entry.getKey());
- insert.setInt(2, entry.getValue());
-
- insert.executeUpdate();
- }
- }
- }
- }
-
- /**
- * Initializes the sql query environment from the SqlResources file. Will
- * look for conf/sqlResources.xml.
- *
- * @param conn
- * The connection for accessing the database
- * @param sqlFile
- * The sqlResources.xml file
- * @throws Exception
- * If any error occurs
- */
- public void initSqlQueries(Connection conn, File sqlFile) throws Exception {
- try {
- if (conn.getAutoCommit()) {
- conn.setAutoCommit(false);
- }
-
- sqlQueries.init(sqlFile, JDBCBayesianAnalyzer.class.getName(), conn, getSqlParameters());
-
- checkTables(conn);
- } finally {
- theJDBCUtil.closeJDBCConnection(conn);
- }
- }
-
- private void checkTables(Connection conn) throws SQLException {
- // Need to ask in the case that identifiers are stored, ask the
- // DatabaseMetaInfo.
- // Try UPPER, lower, and MixedCase, to see if the table is there.
-
- boolean dbUpdated;
-
- dbUpdated = createTable(conn, "hamTableName", "createHamTable");
-
- dbUpdated = createTable(conn, "spamTableName", "createSpamTable");
-
- dbUpdated = createTable(conn, "messageCountsTableName", "createMessageCountsTable");
-
- // Commit our changes if necessary.
- if (conn != null && dbUpdated && !conn.getAutoCommit()) {
- conn.commit();
- dbUpdated = false;
- }
-
- }
-
- private boolean createTable(Connection conn, String tableNameSqlStringName, String createSqlStringName) throws SQLException {
- String tableName = sqlQueries.getSqlString(tableNameSqlStringName, true);
-
- DatabaseMetaData dbMetaData = conn.getMetaData();
-
- // Try UPPER, lower, and MixedCase, to see if the table is there.
- if (theJDBCUtil.tableExists(dbMetaData, tableName)) {
- return false;
- }
-
- PreparedStatement createStatement = null;
-
- try {
- createStatement = conn.prepareStatement(sqlQueries.getSqlString(createSqlStringName, true));
- createStatement.execute();
-
- LOGGER.error("Created table '{}' using sqlResources string '{}'.", tableName, createSqlStringName);
-
- } finally {
- theJDBCUtil.closeJDBCStatement(createStatement);
- }
-
- return true;
- }
-
- private void deleteData(Connection conn, String deleteSqlStatement) throws SQLException {
- try (// Used to delete ham tokens
- PreparedStatement delete = conn.prepareStatement(deleteSqlStatement);
- ) {
- delete.executeUpdate();
- }
- }
-}
---------------------------------------------------------------------
To unsubscribe, e-mail: notifications-unsubscribe@james.apache.org
For additional commands, e-mail: notifications-help@james.apache.org