You are viewing a plain text version of this content. The canonical link for it is here.
Posted to site-dev@james.apache.org by rd...@apache.org on 2011/05/08 18:41:14 UTC

svn commit: r1100766 [6/7] - in /james/site/trunk/www/mailet/ai: ./ apidocs/ apidocs/org/ apidocs/org/apache/ apidocs/org/apache/james/ apidocs/org/apache/james/ai/ apidocs/org/apache/james/ai/classic/ apidocs/org/apache/james/ai/classic/class-use/ api...

Added: james/site/trunk/www/mailet/ai/apidocs/src-html/org/apache/james/ai/classic/BayesianAnalyzer.html
URL: http://svn.apache.org/viewvc/james/site/trunk/www/mailet/ai/apidocs/src-html/org/apache/james/ai/classic/BayesianAnalyzer.html?rev=1100766&view=auto
==============================================================================
--- james/site/trunk/www/mailet/ai/apidocs/src-html/org/apache/james/ai/classic/BayesianAnalyzer.html (added)
+++ james/site/trunk/www/mailet/ai/apidocs/src-html/org/apache/james/ai/classic/BayesianAnalyzer.html Sun May  8 16:41:13 2011
@@ -0,0 +1,785 @@
+<HTML>
+<BODY BGCOLOR="white">
+<PRE>
+<FONT color="green">001</FONT>    /****************************************************************<a name="line.1"></a>
+<FONT color="green">002</FONT>     * Licensed to the Apache Software Foundation (ASF) under one   *<a name="line.2"></a>
+<FONT color="green">003</FONT>     * or more contributor license agreements.  See the NOTICE file *<a name="line.3"></a>
+<FONT color="green">004</FONT>     * distributed with this work for additional information        *<a name="line.4"></a>
+<FONT color="green">005</FONT>     * regarding copyright ownership.  The ASF licenses this file   *<a name="line.5"></a>
+<FONT color="green">006</FONT>     * to you under the Apache License, Version 2.0 (the            *<a name="line.6"></a>
+<FONT color="green">007</FONT>     * "License"); you may not use this file except in compliance   *<a name="line.7"></a>
+<FONT color="green">008</FONT>     * with the License.  You may obtain a copy of the License at   *<a name="line.8"></a>
+<FONT color="green">009</FONT>     *                                                              *<a name="line.9"></a>
+<FONT color="green">010</FONT>     *   http://www.apache.org/licenses/LICENSE-2.0                 *<a name="line.10"></a>
+<FONT color="green">011</FONT>     *                                                              *<a name="line.11"></a>
+<FONT color="green">012</FONT>     * Unless required by applicable law or agreed to in writing,   *<a name="line.12"></a>
+<FONT color="green">013</FONT>     * software distributed under the License is distributed on an  *<a name="line.13"></a>
+<FONT color="green">014</FONT>     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *<a name="line.14"></a>
+<FONT color="green">015</FONT>     * KIND, either express or implied.  See the License for the    *<a name="line.15"></a>
+<FONT color="green">016</FONT>     * specific language governing permissions and limitations      *<a name="line.16"></a>
+<FONT color="green">017</FONT>     * under the License.                                           *<a name="line.17"></a>
+<FONT color="green">018</FONT>     ****************************************************************/<a name="line.18"></a>
+<FONT color="green">019</FONT>    <a name="line.19"></a>
+<FONT color="green">020</FONT>    package org.apache.james.ai.classic;<a name="line.20"></a>
+<FONT color="green">021</FONT>    <a name="line.21"></a>
+<FONT color="green">022</FONT>    import java.io.IOException;<a name="line.22"></a>
+<FONT color="green">023</FONT>    import java.io.Reader;<a name="line.23"></a>
+<FONT color="green">024</FONT>    import java.util.ArrayList;<a name="line.24"></a>
+<FONT color="green">025</FONT>    import java.util.Collection;<a name="line.25"></a>
+<FONT color="green">026</FONT>    import java.util.HashMap;<a name="line.26"></a>
+<FONT color="green">027</FONT>    import java.util.HashSet;<a name="line.27"></a>
+<FONT color="green">028</FONT>    import java.util.Iterator;<a name="line.28"></a>
+<FONT color="green">029</FONT>    import java.util.Map;<a name="line.29"></a>
+<FONT color="green">030</FONT>    import java.util.Set;<a name="line.30"></a>
+<FONT color="green">031</FONT>    import java.util.SortedSet;<a name="line.31"></a>
+<FONT color="green">032</FONT>    import java.util.TreeSet;<a name="line.32"></a>
+<FONT color="green">033</FONT>    <a name="line.33"></a>
+<FONT color="green">034</FONT>    /**<a name="line.34"></a>
+<FONT color="green">035</FONT>     * &lt;p&gt;<a name="line.35"></a>
+<FONT color="green">036</FONT>     * Determines probability that text contains Spam.<a name="line.36"></a>
+<FONT color="green">037</FONT>     * &lt;/p&gt;<a name="line.37"></a>
+<FONT color="green">038</FONT>     * <a name="line.38"></a>
+<FONT color="green">039</FONT>     * &lt;p&gt;<a name="line.39"></a>
+<FONT color="green">040</FONT>     * Based upon Paul Graham's &lt;a href="http://www.paulgraham.com/spam.html"&gt;A Plan<a name="line.40"></a>
+<FONT color="green">041</FONT>     * for Spam&lt;/a&gt;. Extended to Paul Graham's &lt;a<a name="line.41"></a>
+<FONT color="green">042</FONT>     * href="http://paulgraham.com/better.html"&gt;Better Bayesian Filtering&lt;/a&gt;.<a name="line.42"></a>
+<FONT color="green">043</FONT>     * &lt;/p&gt;<a name="line.43"></a>
+<FONT color="green">044</FONT>     * <a name="line.44"></a>
+<FONT color="green">045</FONT>     * &lt;p&gt;<a name="line.45"></a>
+<FONT color="green">046</FONT>     * Sample method usage:<a name="line.46"></a>
+<FONT color="green">047</FONT>     * &lt;/p&gt;<a name="line.47"></a>
+<FONT color="green">048</FONT>     * <a name="line.48"></a>
+<FONT color="green">049</FONT>     * &lt;p&gt;<a name="line.49"></a>
+<FONT color="green">050</FONT>     * Use: void addHam(Reader) and void addSpam(Reader)<a name="line.50"></a>
+<FONT color="green">051</FONT>     * <a name="line.51"></a>
+<FONT color="green">052</FONT>     * methods to build up the Maps of ham &amp; spam tokens/occurrences. Both addHam<a name="line.52"></a>
+<FONT color="green">053</FONT>     * and addSpam assume they're reading one message at a time; if you feed more<a name="line.53"></a>
+<FONT color="green">054</FONT>     * than one message per call, be sure to adjust the appropriate message counter:<a name="line.54"></a>
+<FONT color="green">055</FONT>     * hamMessageCount or spamMessageCount.<a name="line.55"></a>
+<FONT color="green">056</FONT>     * <a name="line.56"></a>
+<FONT color="green">057</FONT>     * Then...<a name="line.57"></a>
+<FONT color="green">058</FONT>     * &lt;/p&gt;<a name="line.58"></a>
+<FONT color="green">059</FONT>     * <a name="line.59"></a>
+<FONT color="green">060</FONT>     * &lt;p&gt;<a name="line.60"></a>
+<FONT color="green">061</FONT>     * Use: void buildCorpus()<a name="line.61"></a>
+<FONT color="green">062</FONT>     * <a name="line.62"></a>
+<FONT color="green">063</FONT>     * to build the final token/probabilities Map.<a name="line.63"></a>
+<FONT color="green">064</FONT>     * <a name="line.64"></a>
+<FONT color="green">065</FONT>     * Use your own methods for persistent storage of either the individual ham/spam<a name="line.65"></a>
+<FONT color="green">066</FONT>     * corpus &amp; message counts, and/or the final corpus.<a name="line.66"></a>
+<FONT color="green">067</FONT>     * <a name="line.67"></a>
+<FONT color="green">068</FONT>     * Then you can...<a name="line.68"></a>
+<FONT color="green">069</FONT>     * &lt;/p&gt;<a name="line.69"></a>
+<FONT color="green">070</FONT>     * <a name="line.70"></a>
+<FONT color="green">071</FONT>     * &lt;p&gt;<a name="line.71"></a>
+<FONT color="green">072</FONT>     * Use: double computeSpamProbability(Reader)<a name="line.72"></a>
+<FONT color="green">073</FONT>     * <a name="line.73"></a>
+<FONT color="green">074</FONT>     * to determine the probability that a particular text contains spam. A returned<a name="line.74"></a>
+<FONT color="green">075</FONT>     * result of 0.9 or above is an indicator that the text was spam.<a name="line.75"></a>
+<FONT color="green">076</FONT>     * &lt;/p&gt;<a name="line.76"></a>
+<FONT color="green">077</FONT>     * <a name="line.77"></a>
+<FONT color="green">078</FONT>     * &lt;p&gt;<a name="line.78"></a>
+<FONT color="green">079</FONT>     * If you use persistent storage, use: void setCorpus(Map)<a name="line.79"></a>
+<FONT color="green">080</FONT>     * <a name="line.80"></a>
+<FONT color="green">081</FONT>     * before calling computeSpamProbability.<a name="line.81"></a>
+<FONT color="green">082</FONT>     * &lt;/p&gt;<a name="line.82"></a>
+<FONT color="green">083</FONT>     * <a name="line.83"></a>
+<FONT color="green">084</FONT>     * @since 2.3.0<a name="line.84"></a>
+<FONT color="green">085</FONT>     */<a name="line.85"></a>
+<FONT color="green">086</FONT>    <a name="line.86"></a>
+<FONT color="green">087</FONT>    public class BayesianAnalyzer {<a name="line.87"></a>
+<FONT color="green">088</FONT>    <a name="line.88"></a>
+<FONT color="green">089</FONT>        /**<a name="line.89"></a>
+<FONT color="green">090</FONT>         * Number of "interesting" tokens to use to compute overall spamminess<a name="line.90"></a>
+<FONT color="green">091</FONT>         * probability.<a name="line.91"></a>
+<FONT color="green">092</FONT>         */<a name="line.92"></a>
+<FONT color="green">093</FONT>        private final static int MAX_INTERESTING_TOKENS = 15;<a name="line.93"></a>
+<FONT color="green">094</FONT>    <a name="line.94"></a>
+<FONT color="green">095</FONT>        /**<a name="line.95"></a>
+<FONT color="green">096</FONT>         * Minimum probability distance from 0.5 to consider a token "interesting"<a name="line.96"></a>
+<FONT color="green">097</FONT>         * to use to compute overall spamminess probability.<a name="line.97"></a>
+<FONT color="green">098</FONT>         */<a name="line.98"></a>
+<FONT color="green">099</FONT>        private final static double INTERESTINGNESS_THRESHOLD = 0.46;<a name="line.99"></a>
+<FONT color="green">100</FONT>    <a name="line.100"></a>
+<FONT color="green">101</FONT>        /**<a name="line.101"></a>
+<FONT color="green">102</FONT>         * Default token probability to use when a token has not been encountered<a name="line.102"></a>
+<FONT color="green">103</FONT>         * before.<a name="line.103"></a>
+<FONT color="green">104</FONT>         */<a name="line.104"></a>
+<FONT color="green">105</FONT>        private final static double DEFAULT_TOKEN_PROBABILITY = 0.4;<a name="line.105"></a>
+<FONT color="green">106</FONT>    <a name="line.106"></a>
+<FONT color="green">107</FONT>        /** Map of ham tokens and their occurrences. */<a name="line.107"></a>
+<FONT color="green">108</FONT>        private Map&lt;String, Integer&gt; hamTokenCounts = new HashMap&lt;String, Integer&gt;();<a name="line.108"></a>
+<FONT color="green">109</FONT>    <a name="line.109"></a>
+<FONT color="green">110</FONT>        /** Map of spam tokens and their occurrences. */<a name="line.110"></a>
+<FONT color="green">111</FONT>        private Map&lt;String, Integer&gt; spamTokenCounts = new HashMap&lt;String, Integer&gt;();<a name="line.111"></a>
+<FONT color="green">112</FONT>    <a name="line.112"></a>
+<FONT color="green">113</FONT>        /** Number of ham messages analyzed. */<a name="line.113"></a>
+<FONT color="green">114</FONT>        private int hamMessageCount = 0;<a name="line.114"></a>
+<FONT color="green">115</FONT>    <a name="line.115"></a>
+<FONT color="green">116</FONT>        /** Number of spam messages analyzed. */<a name="line.116"></a>
+<FONT color="green">117</FONT>        private int spamMessageCount = 0;<a name="line.117"></a>
+<FONT color="green">118</FONT>    <a name="line.118"></a>
+<FONT color="green">119</FONT>        /** Final token/probability corpus. */<a name="line.119"></a>
+<FONT color="green">120</FONT>        private Map&lt;String, Double&gt; corpus = new HashMap&lt;String, Double&gt;();<a name="line.120"></a>
+<FONT color="green">121</FONT>    <a name="line.121"></a>
+<FONT color="green">122</FONT>        /**<a name="line.122"></a>
+<FONT color="green">123</FONT>         * Inner class for managing Token Probability Strengths during the<a name="line.123"></a>
+<FONT color="green">124</FONT>         * computeSpamProbability phase.<a name="line.124"></a>
+<FONT color="green">125</FONT>         * <a name="line.125"></a>
+<FONT color="green">126</FONT>         * By probability &lt;i&gt;strength&lt;/i&gt; we mean the absolute distance of a<a name="line.126"></a>
+<FONT color="green">127</FONT>         * probability from the middle value 0.5.<a name="line.127"></a>
+<FONT color="green">128</FONT>         * <a name="line.128"></a>
+<FONT color="green">129</FONT>         * It implements Comparable so that its sorting is automatic.<a name="line.129"></a>
+<FONT color="green">130</FONT>         */<a name="line.130"></a>
+<FONT color="green">131</FONT>        private class TokenProbabilityStrength implements Comparable&lt;TokenProbabilityStrength&gt; {<a name="line.131"></a>
+<FONT color="green">132</FONT>            /**<a name="line.132"></a>
+<FONT color="green">133</FONT>             * Message token.<a name="line.133"></a>
+<FONT color="green">134</FONT>             */<a name="line.134"></a>
+<FONT color="green">135</FONT>            String token = null;<a name="line.135"></a>
+<FONT color="green">136</FONT>    <a name="line.136"></a>
+<FONT color="green">137</FONT>            /**<a name="line.137"></a>
+<FONT color="green">138</FONT>             * Token's computed probability strength.<a name="line.138"></a>
+<FONT color="green">139</FONT>             */<a name="line.139"></a>
+<FONT color="green">140</FONT>            double strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);<a name="line.140"></a>
+<FONT color="green">141</FONT>    <a name="line.141"></a>
+<FONT color="green">142</FONT>            /**<a name="line.142"></a>
+<FONT color="green">143</FONT>             * Force the natural sort order for this object to be high-to-low.<a name="line.143"></a>
+<FONT color="green">144</FONT>             * <a name="line.144"></a>
+<FONT color="green">145</FONT>             * @param anotherTokenProbabilityStrength<a name="line.145"></a>
+<FONT color="green">146</FONT>             *            A TokenProbabilityStrength instance to compare this<a name="line.146"></a>
+<FONT color="green">147</FONT>             *            instance with.<a name="line.147"></a>
+<FONT color="green">148</FONT>             * <a name="line.148"></a>
+<FONT color="green">149</FONT>             * @return The result of the comparison (before, equal, after).<a name="line.149"></a>
+<FONT color="green">150</FONT>             */<a name="line.150"></a>
+<FONT color="green">151</FONT>            public final int compareTo(TokenProbabilityStrength anotherTokenProbabilityStrength) {<a name="line.151"></a>
+<FONT color="green">152</FONT>                int result = (int) ((((TokenProbabilityStrength) anotherTokenProbabilityStrength).strength - strength) * 1000000);<a name="line.152"></a>
+<FONT color="green">153</FONT>                if (result == 0) {<a name="line.153"></a>
+<FONT color="green">154</FONT>                    return this.token.compareTo(((TokenProbabilityStrength) anotherTokenProbabilityStrength).token);<a name="line.154"></a>
+<FONT color="green">155</FONT>                } else {<a name="line.155"></a>
+<FONT color="green">156</FONT>                    return result;<a name="line.156"></a>
+<FONT color="green">157</FONT>                }<a name="line.157"></a>
+<FONT color="green">158</FONT>            }<a name="line.158"></a>
+<FONT color="green">159</FONT>    <a name="line.159"></a>
+<FONT color="green">160</FONT>            /**<a name="line.160"></a>
+<FONT color="green">161</FONT>             * Simple toString() implementation, mostly for debugging purposes.<a name="line.161"></a>
+<FONT color="green">162</FONT>             * <a name="line.162"></a>
+<FONT color="green">163</FONT>             * @return String representation of this object.<a name="line.163"></a>
+<FONT color="green">164</FONT>             */<a name="line.164"></a>
+<FONT color="green">165</FONT>            public String toString() {<a name="line.165"></a>
+<FONT color="green">166</FONT>                StringBuffer sb = new StringBuffer(30);<a name="line.166"></a>
+<FONT color="green">167</FONT>    <a name="line.167"></a>
+<FONT color="green">168</FONT>                sb.append(token).append("=").append(strength);<a name="line.168"></a>
+<FONT color="green">169</FONT>    <a name="line.169"></a>
+<FONT color="green">170</FONT>                return sb.toString();<a name="line.170"></a>
+<FONT color="green">171</FONT>            }<a name="line.171"></a>
+<FONT color="green">172</FONT>        }<a name="line.172"></a>
+<FONT color="green">173</FONT>    <a name="line.173"></a>
+<FONT color="green">174</FONT>        /**<a name="line.174"></a>
+<FONT color="green">175</FONT>         * Basic class constructor.<a name="line.175"></a>
+<FONT color="green">176</FONT>         */<a name="line.176"></a>
+<FONT color="green">177</FONT>        public BayesianAnalyzer() {<a name="line.177"></a>
+<FONT color="green">178</FONT>        }<a name="line.178"></a>
+<FONT color="green">179</FONT>    <a name="line.179"></a>
+<FONT color="green">180</FONT>        /**<a name="line.180"></a>
+<FONT color="green">181</FONT>         * Public setter for the hamTokenCounts Map.<a name="line.181"></a>
+<FONT color="green">182</FONT>         * <a name="line.182"></a>
+<FONT color="green">183</FONT>         * @param hamTokenCounts<a name="line.183"></a>
+<FONT color="green">184</FONT>         *            The new ham Token counts Map.<a name="line.184"></a>
+<FONT color="green">185</FONT>         */<a name="line.185"></a>
+<FONT color="green">186</FONT>        public void setHamTokenCounts(Map&lt;String, Integer&gt; hamTokenCounts) {<a name="line.186"></a>
+<FONT color="green">187</FONT>            this.hamTokenCounts = hamTokenCounts;<a name="line.187"></a>
+<FONT color="green">188</FONT>        }<a name="line.188"></a>
+<FONT color="green">189</FONT>    <a name="line.189"></a>
+<FONT color="green">190</FONT>        /**<a name="line.190"></a>
+<FONT color="green">191</FONT>         * Public getter for the hamTokenCounts Map.<a name="line.191"></a>
+<FONT color="green">192</FONT>         */<a name="line.192"></a>
+<FONT color="green">193</FONT>        public Map&lt;String, Integer&gt; getHamTokenCounts() {<a name="line.193"></a>
+<FONT color="green">194</FONT>            return this.hamTokenCounts;<a name="line.194"></a>
+<FONT color="green">195</FONT>        }<a name="line.195"></a>
+<FONT color="green">196</FONT>    <a name="line.196"></a>
+<FONT color="green">197</FONT>        /**<a name="line.197"></a>
+<FONT color="green">198</FONT>         * Public setter for the spamTokenCounts Map.<a name="line.198"></a>
+<FONT color="green">199</FONT>         * <a name="line.199"></a>
+<FONT color="green">200</FONT>         * @param spamTokenCounts<a name="line.200"></a>
+<FONT color="green">201</FONT>         *            The new spam Token counts Map.<a name="line.201"></a>
+<FONT color="green">202</FONT>         */<a name="line.202"></a>
+<FONT color="green">203</FONT>        public void setSpamTokenCounts(Map&lt;String, Integer&gt; spamTokenCounts) {<a name="line.203"></a>
+<FONT color="green">204</FONT>            this.spamTokenCounts = spamTokenCounts;<a name="line.204"></a>
+<FONT color="green">205</FONT>        }<a name="line.205"></a>
+<FONT color="green">206</FONT>    <a name="line.206"></a>
+<FONT color="green">207</FONT>        /**<a name="line.207"></a>
+<FONT color="green">208</FONT>         * Public getter for the spamTokenCounts Map.<a name="line.208"></a>
+<FONT color="green">209</FONT>         */<a name="line.209"></a>
+<FONT color="green">210</FONT>        public Map&lt;String, Integer&gt; getSpamTokenCounts() {<a name="line.210"></a>
+<FONT color="green">211</FONT>            return this.spamTokenCounts;<a name="line.211"></a>
+<FONT color="green">212</FONT>        }<a name="line.212"></a>
+<FONT color="green">213</FONT>    <a name="line.213"></a>
+<FONT color="green">214</FONT>        /**<a name="line.214"></a>
+<FONT color="green">215</FONT>         * Public setter for spamMessageCount.<a name="line.215"></a>
+<FONT color="green">216</FONT>         * <a name="line.216"></a>
+<FONT color="green">217</FONT>         * @param spamMessageCount<a name="line.217"></a>
+<FONT color="green">218</FONT>         *            The new spam message count.<a name="line.218"></a>
+<FONT color="green">219</FONT>         */<a name="line.219"></a>
+<FONT color="green">220</FONT>        public void setSpamMessageCount(int spamMessageCount) {<a name="line.220"></a>
+<FONT color="green">221</FONT>            this.spamMessageCount = spamMessageCount;<a name="line.221"></a>
+<FONT color="green">222</FONT>        }<a name="line.222"></a>
+<FONT color="green">223</FONT>    <a name="line.223"></a>
+<FONT color="green">224</FONT>        /**<a name="line.224"></a>
+<FONT color="green">225</FONT>         * Public getter for spamMessageCount.<a name="line.225"></a>
+<FONT color="green">226</FONT>         */<a name="line.226"></a>
+<FONT color="green">227</FONT>        public int getSpamMessageCount() {<a name="line.227"></a>
+<FONT color="green">228</FONT>            return this.spamMessageCount;<a name="line.228"></a>
+<FONT color="green">229</FONT>        }<a name="line.229"></a>
+<FONT color="green">230</FONT>    <a name="line.230"></a>
+<FONT color="green">231</FONT>        /**<a name="line.231"></a>
+<FONT color="green">232</FONT>         * Public setter for hamMessageCount.<a name="line.232"></a>
+<FONT color="green">233</FONT>         * <a name="line.233"></a>
+<FONT color="green">234</FONT>         * @param hamMessageCount<a name="line.234"></a>
+<FONT color="green">235</FONT>         *            The new ham message count.<a name="line.235"></a>
+<FONT color="green">236</FONT>         */<a name="line.236"></a>
+<FONT color="green">237</FONT>        public void setHamMessageCount(int hamMessageCount) {<a name="line.237"></a>
+<FONT color="green">238</FONT>            this.hamMessageCount = hamMessageCount;<a name="line.238"></a>
+<FONT color="green">239</FONT>        }<a name="line.239"></a>
+<FONT color="green">240</FONT>    <a name="line.240"></a>
+<FONT color="green">241</FONT>        /**<a name="line.241"></a>
+<FONT color="green">242</FONT>         * Public getter for hamMessageCount.<a name="line.242"></a>
+<FONT color="green">243</FONT>         */<a name="line.243"></a>
+<FONT color="green">244</FONT>        public int getHamMessageCount() {<a name="line.244"></a>
+<FONT color="green">245</FONT>            return this.hamMessageCount;<a name="line.245"></a>
+<FONT color="green">246</FONT>        }<a name="line.246"></a>
+<FONT color="green">247</FONT>    <a name="line.247"></a>
+<FONT color="green">248</FONT>        /**<a name="line.248"></a>
+<FONT color="green">249</FONT>         * Clears all analysis repositories and counters.<a name="line.249"></a>
+<FONT color="green">250</FONT>         */<a name="line.250"></a>
+<FONT color="green">251</FONT>        public void clear() {<a name="line.251"></a>
+<FONT color="green">252</FONT>            corpus.clear();<a name="line.252"></a>
+<FONT color="green">253</FONT>    <a name="line.253"></a>
+<FONT color="green">254</FONT>            tokenCountsClear();<a name="line.254"></a>
+<FONT color="green">255</FONT>    <a name="line.255"></a>
+<FONT color="green">256</FONT>            hamMessageCount = 0;<a name="line.256"></a>
+<FONT color="green">257</FONT>            spamMessageCount = 0;<a name="line.257"></a>
+<FONT color="green">258</FONT>        }<a name="line.258"></a>
+<FONT color="green">259</FONT>    <a name="line.259"></a>
+<FONT color="green">260</FONT>        /**<a name="line.260"></a>
+<FONT color="green">261</FONT>         * Clears token counters.<a name="line.261"></a>
+<FONT color="green">262</FONT>         */<a name="line.262"></a>
+<FONT color="green">263</FONT>        public void tokenCountsClear() {<a name="line.263"></a>
+<FONT color="green">264</FONT>            hamTokenCounts.clear();<a name="line.264"></a>
+<FONT color="green">265</FONT>            spamTokenCounts.clear();<a name="line.265"></a>
+<FONT color="green">266</FONT>        }<a name="line.266"></a>
+<FONT color="green">267</FONT>    <a name="line.267"></a>
+<FONT color="green">268</FONT>        /**<a name="line.268"></a>
+<FONT color="green">269</FONT>         * Public setter for corpus.<a name="line.269"></a>
+<FONT color="green">270</FONT>         * <a name="line.270"></a>
+<FONT color="green">271</FONT>         * @param corpus<a name="line.271"></a>
+<FONT color="green">272</FONT>         *            The new corpus.<a name="line.272"></a>
+<FONT color="green">273</FONT>         */<a name="line.273"></a>
+<FONT color="green">274</FONT>        public void setCorpus(Map&lt;String, Double&gt; corpus) {<a name="line.274"></a>
+<FONT color="green">275</FONT>            this.corpus = corpus;<a name="line.275"></a>
+<FONT color="green">276</FONT>        }<a name="line.276"></a>
+<FONT color="green">277</FONT>    <a name="line.277"></a>
+<FONT color="green">278</FONT>        /**<a name="line.278"></a>
+<FONT color="green">279</FONT>         * Public getter for corpus.<a name="line.279"></a>
+<FONT color="green">280</FONT>         */<a name="line.280"></a>
+<FONT color="green">281</FONT>        public Map&lt;String, Double&gt; getCorpus() {<a name="line.281"></a>
+<FONT color="green">282</FONT>            return this.corpus;<a name="line.282"></a>
+<FONT color="green">283</FONT>        }<a name="line.283"></a>
+<FONT color="green">284</FONT>    <a name="line.284"></a>
+<FONT color="green">285</FONT>        /**<a name="line.285"></a>
+<FONT color="green">286</FONT>         * Builds the corpus from the existing ham &amp; spam counts.<a name="line.286"></a>
+<FONT color="green">287</FONT>         */<a name="line.287"></a>
+<FONT color="green">288</FONT>        public void buildCorpus() {<a name="line.288"></a>
+<FONT color="green">289</FONT>            // Combine the known ham &amp; spam tokens.<a name="line.289"></a>
+<FONT color="green">290</FONT>            Set&lt;String&gt; set = new HashSet&lt;String&gt;(hamTokenCounts.size() + spamTokenCounts.size());<a name="line.290"></a>
+<FONT color="green">291</FONT>            set.addAll(hamTokenCounts.keySet());<a name="line.291"></a>
+<FONT color="green">292</FONT>            set.addAll(spamTokenCounts.keySet());<a name="line.292"></a>
+<FONT color="green">293</FONT>            Map&lt;String, Double&gt; tempCorpus = new HashMap&lt;String, Double&gt;(set.size());<a name="line.293"></a>
+<FONT color="green">294</FONT>    <a name="line.294"></a>
+<FONT color="green">295</FONT>            // Iterate through all the tokens and compute their new<a name="line.295"></a>
+<FONT color="green">296</FONT>            // individual probabilities.<a name="line.296"></a>
+<FONT color="green">297</FONT>            Iterator&lt;String&gt; i = set.iterator();<a name="line.297"></a>
+<FONT color="green">298</FONT>            while (i.hasNext()) {<a name="line.298"></a>
+<FONT color="green">299</FONT>                String token = i.next();<a name="line.299"></a>
+<FONT color="green">300</FONT>                tempCorpus.put(token, new Double(computeProbability(token)));<a name="line.300"></a>
+<FONT color="green">301</FONT>            }<a name="line.301"></a>
+<FONT color="green">302</FONT>            setCorpus(tempCorpus);<a name="line.302"></a>
+<FONT color="green">303</FONT>        }<a name="line.303"></a>
+<FONT color="green">304</FONT>    <a name="line.304"></a>
+<FONT color="green">305</FONT>        /**<a name="line.305"></a>
+<FONT color="green">306</FONT>         * Adds a message to the ham list.<a name="line.306"></a>
+<FONT color="green">307</FONT>         * <a name="line.307"></a>
+<FONT color="green">308</FONT>         * @param stream<a name="line.308"></a>
+<FONT color="green">309</FONT>         *            A reader stream on the ham message to analyze<a name="line.309"></a>
+<FONT color="green">310</FONT>         * @throws IOException<a name="line.310"></a>
+<FONT color="green">311</FONT>         *             If any error occurs<a name="line.311"></a>
+<FONT color="green">312</FONT>         */<a name="line.312"></a>
+<FONT color="green">313</FONT>        public void addHam(Reader stream) throws java.io.IOException {<a name="line.313"></a>
+<FONT color="green">314</FONT>            addTokenOccurrences(stream, hamTokenCounts);<a name="line.314"></a>
+<FONT color="green">315</FONT>            hamMessageCount++;<a name="line.315"></a>
+<FONT color="green">316</FONT>        }<a name="line.316"></a>
+<FONT color="green">317</FONT>    <a name="line.317"></a>
+<FONT color="green">318</FONT>        /**<a name="line.318"></a>
+<FONT color="green">319</FONT>         * Adds a message to the spam list.<a name="line.319"></a>
+<FONT color="green">320</FONT>         * <a name="line.320"></a>
+<FONT color="green">321</FONT>         * @param stream<a name="line.321"></a>
+<FONT color="green">322</FONT>         *            A reader stream on the spam message to analyze<a name="line.322"></a>
+<FONT color="green">323</FONT>         * @throws IOException<a name="line.323"></a>
+<FONT color="green">324</FONT>         *             If any error occurs<a name="line.324"></a>
+<FONT color="green">325</FONT>         */<a name="line.325"></a>
+<FONT color="green">326</FONT>        public void addSpam(Reader stream) throws java.io.IOException {<a name="line.326"></a>
+<FONT color="green">327</FONT>            addTokenOccurrences(stream, spamTokenCounts);<a name="line.327"></a>
+<FONT color="green">328</FONT>            spamMessageCount++;<a name="line.328"></a>
+<FONT color="green">329</FONT>        }<a name="line.329"></a>
+<FONT color="green">330</FONT>    <a name="line.330"></a>
+<FONT color="green">331</FONT>        /**<a name="line.331"></a>
+<FONT color="green">332</FONT>         * Computes the probability that the stream contains SPAM.<a name="line.332"></a>
+<FONT color="green">333</FONT>         * <a name="line.333"></a>
+<FONT color="green">334</FONT>         * @param stream<a name="line.334"></a>
+<FONT color="green">335</FONT>         *            The text to be analyzed for Spamminess.<a name="line.335"></a>
+<FONT color="green">336</FONT>         * @return A 0.0 - 1.0 probability<a name="line.336"></a>
+<FONT color="green">337</FONT>         * @throws IOException<a name="line.337"></a>
+<FONT color="green">338</FONT>         *             If any error occurs<a name="line.338"></a>
+<FONT color="green">339</FONT>         */<a name="line.339"></a>
+<FONT color="green">340</FONT>        public double computeSpamProbability(Reader stream) throws java.io.IOException {<a name="line.340"></a>
+<FONT color="green">341</FONT>            // Build a set of the tokens in the Stream.<a name="line.341"></a>
+<FONT color="green">342</FONT>            Set&lt;String&gt; tokens = parse(stream);<a name="line.342"></a>
+<FONT color="green">343</FONT>    <a name="line.343"></a>
+<FONT color="green">344</FONT>            // Get the corpus to use in this run<a name="line.344"></a>
+<FONT color="green">345</FONT>            // A new corpus may be being built in the meantime<a name="line.345"></a>
+<FONT color="green">346</FONT>            Map&lt;String, Double&gt; workCorpus = getCorpus();<a name="line.346"></a>
+<FONT color="green">347</FONT>    <a name="line.347"></a>
+<FONT color="green">348</FONT>            // Assign their probabilities from the Corpus (using an additional<a name="line.348"></a>
+<FONT color="green">349</FONT>            // calculation to determine spamminess).<a name="line.349"></a>
+<FONT color="green">350</FONT>            SortedSet&lt;TokenProbabilityStrength&gt; tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);<a name="line.350"></a>
+<FONT color="green">351</FONT>    <a name="line.351"></a>
+<FONT color="green">352</FONT>            // Compute and return the overall probability that the<a name="line.352"></a>
+<FONT color="green">353</FONT>            // stream is SPAM.<a name="line.353"></a>
+<FONT color="green">354</FONT>            return computeOverallProbability(tokenProbabilityStrengths, workCorpus);<a name="line.354"></a>
+<FONT color="green">355</FONT>        }<a name="line.355"></a>
+<FONT color="green">356</FONT>    <a name="line.356"></a>
+<FONT color="green">357</FONT>        /**<a name="line.357"></a>
+<FONT color="green">358</FONT>         * Parses a stream into tokens, and updates the target Map with the<a name="line.358"></a>
+<FONT color="green">359</FONT>         * token/counts.<a name="line.359"></a>
+<FONT color="green">360</FONT>         * <a name="line.360"></a>
+<FONT color="green">361</FONT>         * @param stream The text to be split into tokens.<a name="line.361"></a>
+<FONT color="green">362</FONT>         * @param target Map of token to occurrence count, updated in place.<a name="line.362"></a>
+<FONT color="green">363</FONT>         */<a name="line.363"></a>
+<FONT color="green">364</FONT>        private void addTokenOccurrences(Reader stream, Map&lt;String, Integer&gt; target) throws java.io.IOException {<a name="line.364"></a>
+<FONT color="green">365</FONT>            String token;<a name="line.365"></a>
+<FONT color="green">366</FONT>            String header = "";<a name="line.366"></a>
+<FONT color="green">367</FONT>    <a name="line.367"></a>
+<FONT color="green">368</FONT>            // Update target with the tokens/count encountered.<a name="line.368"></a>
+<FONT color="green">369</FONT>            while ((token = nextToken(stream)) != null) {<a name="line.369"></a>
+<FONT color="green">370</FONT>                boolean endingLine = false;<a name="line.370"></a>
+<FONT color="green">371</FONT>                if (token.length() &gt; 0 &amp;&amp; token.charAt(token.length() - 1) == '\n') {<a name="line.371"></a>
+<FONT color="green">372</FONT>                    endingLine = true;<a name="line.372"></a>
+<FONT color="green">373</FONT>                    token = token.substring(0, token.length() - 1);<a name="line.373"></a>
+<FONT color="green">374</FONT>                }<a name="line.374"></a>
+<FONT color="green">375</FONT>    <a name="line.375"></a>
+<FONT color="green">376</FONT>                if (token.length() &gt; 0 &amp;&amp; header.length() + token.length() &lt; 90 &amp;&amp; !allDigits(token)) {<a name="line.376"></a>
+<FONT color="green">377</FONT>                    if (token.equals("From:") || token.equals("Return-Path:") || token.equals("Subject:") || token.equals("To:")) {<a name="line.377"></a>
+<FONT color="green">378</FONT>                        header = token;<a name="line.378"></a>
+<FONT color="green">379</FONT>                        if (!endingLine) {<a name="line.379"></a>
+<FONT color="green">380</FONT>                            continue;<a name="line.380"></a>
+<FONT color="green">381</FONT>                        }<a name="line.381"></a>
+<FONT color="green">382</FONT>                    }<a name="line.382"></a>
+<FONT color="green">383</FONT>    <a name="line.383"></a>
+<FONT color="green">384</FONT>                    token = header + token;<a name="line.384"></a>
+<FONT color="green">385</FONT>    <a name="line.385"></a>
+<FONT color="green">386</FONT>                    Integer value = null;<a name="line.386"></a>
+<FONT color="green">387</FONT>    <a name="line.387"></a>
+<FONT color="green">388</FONT>                    if (target.containsKey(token)) {<a name="line.388"></a>
+<FONT color="green">389</FONT>                        value = Integer.valueOf(((Integer) target.get(token)).intValue() + 1);<a name="line.389"></a>
+<FONT color="green">390</FONT>                    } else {<a name="line.390"></a>
+<FONT color="green">391</FONT>                        value = Integer.valueOf(1);<a name="line.391"></a>
+<FONT color="green">392</FONT>                    }<a name="line.392"></a>
+<FONT color="green">393</FONT>    <a name="line.393"></a>
+<FONT color="green">394</FONT>                    target.put(token, value);<a name="line.394"></a>
+<FONT color="green">395</FONT>                }<a name="line.395"></a>
+<FONT color="green">396</FONT>    <a name="line.396"></a>
+<FONT color="green">397</FONT>                if (endingLine) {<a name="line.397"></a>
+<FONT color="green">398</FONT>                    header = "";<a name="line.398"></a>
+<FONT color="green">399</FONT>                }<a name="line.399"></a>
+<FONT color="green">400</FONT>            }<a name="line.400"></a>
+<FONT color="green">401</FONT>        }<a name="line.401"></a>
+<FONT color="green">402</FONT>    <a name="line.402"></a>
+<FONT color="green">403</FONT>        /**<a name="line.403"></a>
+<FONT color="green">404</FONT>         * Parses a stream into tokens, and returns a Set of the unique tokens<a name="line.404"></a>
+<FONT color="green">405</FONT>         * encountered.<a name="line.405"></a>
+<FONT color="green">406</FONT>         * <a name="line.406"></a>
+<FONT color="green">407</FONT>         * @param stream<a name="line.407"></a>
+<FONT color="green">408</FONT>         * @return Set of the unique tokens encountered in the stream.<a name="line.408"></a>
+<FONT color="green">409</FONT>         */<a name="line.409"></a>
+<FONT color="green">410</FONT>        private Set&lt;String&gt; parse(Reader stream) throws java.io.IOException {<a name="line.410"></a>
+<FONT color="green">411</FONT>            Set&lt;String&gt; tokens = new HashSet&lt;String&gt;();<a name="line.411"></a>
+<FONT color="green">412</FONT>            String token;<a name="line.412"></a>
+<FONT color="green">413</FONT>            String header = "";<a name="line.413"></a>
+<FONT color="green">414</FONT>    <a name="line.414"></a>
+<FONT color="green">415</FONT>            // Build a Set of tokens encountered.<a name="line.415"></a>
+<FONT color="green">416</FONT>            while ((token = nextToken(stream)) != null) {<a name="line.416"></a>
+<FONT color="green">417</FONT>                boolean endingLine = false;<a name="line.417"></a>
+<FONT color="green">418</FONT>                if (token.length() &gt; 0 &amp;&amp; token.charAt(token.length() - 1) == '\n') {<a name="line.418"></a>
+<FONT color="green">419</FONT>                    endingLine = true;<a name="line.419"></a>
+<FONT color="green">420</FONT>                    token = token.substring(0, token.length() - 1);<a name="line.420"></a>
+<FONT color="green">421</FONT>                }<a name="line.421"></a>
+<FONT color="green">422</FONT>    <a name="line.422"></a>
+<FONT color="green">423</FONT>                if (token.length() &gt; 0 &amp;&amp; header.length() + token.length() &lt; 90 &amp;&amp; !allDigits(token)) {<a name="line.423"></a>
+<FONT color="green">424</FONT>                    if (token.equals("From:") || token.equals("Return-Path:") || token.equals("Subject:") || token.equals("To:")) {<a name="line.424"></a>
+<FONT color="green">425</FONT>                        header = token;<a name="line.425"></a>
+<FONT color="green">426</FONT>                        if (!endingLine) {<a name="line.426"></a>
+<FONT color="green">427</FONT>                            continue;<a name="line.427"></a>
+<FONT color="green">428</FONT>                        }<a name="line.428"></a>
+<FONT color="green">429</FONT>                    }<a name="line.429"></a>
+<FONT color="green">430</FONT>    <a name="line.430"></a>
+<FONT color="green">431</FONT>                    token = header + token;<a name="line.431"></a>
+<FONT color="green">432</FONT>    <a name="line.432"></a>
+<FONT color="green">433</FONT>                    tokens.add(token);<a name="line.433"></a>
+<FONT color="green">434</FONT>                }<a name="line.434"></a>
+<FONT color="green">435</FONT>    <a name="line.435"></a>
+<FONT color="green">436</FONT>                if (endingLine) {<a name="line.436"></a>
+<FONT color="green">437</FONT>                    header = "";<a name="line.437"></a>
+<FONT color="green">438</FONT>                }<a name="line.438"></a>
+<FONT color="green">439</FONT>            }<a name="line.439"></a>
+<FONT color="green">440</FONT>    <a name="line.440"></a>
+<FONT color="green">441</FONT>            // Return the unique set of tokens encountered.<a name="line.441"></a>
+<FONT color="green">442</FONT>            return tokens;<a name="line.442"></a>
+<FONT color="green">443</FONT>        }<a name="line.443"></a>
+<FONT color="green">444</FONT>    <a name="line.444"></a>
+<FONT color="green">445</FONT>        private String nextToken(Reader reader) throws java.io.IOException {<a name="line.445"></a>
+<FONT color="green">446</FONT>            StringBuffer token = new StringBuffer();<a name="line.446"></a>
+<FONT color="green">447</FONT>            int i;<a name="line.447"></a>
+<FONT color="green">448</FONT>            char ch, ch2;<a name="line.448"></a>
+<FONT color="green">449</FONT>            boolean previousWasDigit = false;<a name="line.449"></a>
+<FONT color="green">450</FONT>            boolean tokenCharFound = false;<a name="line.450"></a>
+<FONT color="green">451</FONT>    <a name="line.451"></a>
+<FONT color="green">452</FONT>            if (!reader.ready()) {<a name="line.452"></a>
+<FONT color="green">453</FONT>                return null;<a name="line.453"></a>
+<FONT color="green">454</FONT>            }<a name="line.454"></a>
+<FONT color="green">455</FONT>    <a name="line.455"></a>
+<FONT color="green">456</FONT>            while ((i = reader.read()) != -1) {<a name="line.456"></a>
+<FONT color="green">457</FONT>    <a name="line.457"></a>
+<FONT color="green">458</FONT>                ch = (char) i;<a name="line.458"></a>
+<FONT color="green">459</FONT>    <a name="line.459"></a>
+<FONT color="green">460</FONT>                if (ch == ':') {<a name="line.460"></a>
+<FONT color="green">461</FONT>                    String tokenString = token.toString() + ':';<a name="line.461"></a>
+<FONT color="green">462</FONT>                    if (tokenString.equals("From:") || tokenString.equals("Return-Path:") || tokenString.equals("Subject:") || tokenString.equals("To:")) {<a name="line.462"></a>
+<FONT color="green">463</FONT>                        return tokenString;<a name="line.463"></a>
+<FONT color="green">464</FONT>                    }<a name="line.464"></a>
+<FONT color="green">465</FONT>                }<a name="line.465"></a>
+<FONT color="green">466</FONT>    <a name="line.466"></a>
+<FONT color="green">467</FONT>                if (Character.isLetter(ch) || ch == '-' || ch == '$' || ch == '\u20AC' // the<a name="line.467"></a>
+<FONT color="green">468</FONT>                                                                                       // EURO<a name="line.468"></a>
+<FONT color="green">469</FONT>                                                                                       // symbol<a name="line.469"></a>
+<FONT color="green">470</FONT>                        || ch == '!' || ch == '\'') {<a name="line.470"></a>
+<FONT color="green">471</FONT>                    tokenCharFound = true;<a name="line.471"></a>
+<FONT color="green">472</FONT>                    previousWasDigit = false;<a name="line.472"></a>
+<FONT color="green">473</FONT>                    token.append(ch);<a name="line.473"></a>
+<FONT color="green">474</FONT>                } else if (Character.isDigit(ch)) {<a name="line.474"></a>
+<FONT color="green">475</FONT>                    tokenCharFound = true;<a name="line.475"></a>
+<FONT color="green">476</FONT>                    previousWasDigit = true;<a name="line.476"></a>
+<FONT color="green">477</FONT>                    token.append(ch);<a name="line.477"></a>
+<FONT color="green">478</FONT>                } else if (previousWasDigit &amp;&amp; (ch == '.' || ch == ',')) {<a name="line.478"></a>
+<FONT color="green">479</FONT>                    reader.mark(1);<a name="line.479"></a>
+<FONT color="green">480</FONT>                    previousWasDigit = false;<a name="line.480"></a>
+<FONT color="green">481</FONT>                    i = reader.read();<a name="line.481"></a>
+<FONT color="green">482</FONT>                    if (i == -1) {<a name="line.482"></a>
+<FONT color="green">483</FONT>                        break;<a name="line.483"></a>
+<FONT color="green">484</FONT>                    }<a name="line.484"></a>
+<FONT color="green">485</FONT>                    ch2 = (char) i;<a name="line.485"></a>
+<FONT color="green">486</FONT>                    if (Character.isDigit(ch2)) {<a name="line.486"></a>
+<FONT color="green">487</FONT>                        tokenCharFound = true;<a name="line.487"></a>
+<FONT color="green">488</FONT>                        previousWasDigit = true;<a name="line.488"></a>
+<FONT color="green">489</FONT>                        token.append(ch);<a name="line.489"></a>
+<FONT color="green">490</FONT>                        token.append(ch2);<a name="line.490"></a>
+<FONT color="green">491</FONT>                    } else {<a name="line.491"></a>
+<FONT color="green">492</FONT>                        reader.reset();<a name="line.492"></a>
+<FONT color="green">493</FONT>                        break;<a name="line.493"></a>
+<FONT color="green">494</FONT>                    }<a name="line.494"></a>
+<FONT color="green">495</FONT>                } else if (ch == '\r') {<a name="line.495"></a>
+<FONT color="green">496</FONT>                    // cr found, ignore<a name="line.496"></a>
+<FONT color="green">497</FONT>                } else if (ch == '\n') {<a name="line.497"></a>
+<FONT color="green">498</FONT>                    // eol found<a name="line.498"></a>
+<FONT color="green">499</FONT>                    tokenCharFound = true;<a name="line.499"></a>
+<FONT color="green">500</FONT>                    previousWasDigit = false;<a name="line.500"></a>
+<FONT color="green">501</FONT>                    token.append(ch);<a name="line.501"></a>
+<FONT color="green">502</FONT>                    break;<a name="line.502"></a>
+<FONT color="green">503</FONT>                } else if (tokenCharFound) {<a name="line.503"></a>
+<FONT color="green">504</FONT>                    break;<a name="line.504"></a>
+<FONT color="green">505</FONT>                }<a name="line.505"></a>
+<FONT color="green">506</FONT>            }<a name="line.506"></a>
+<FONT color="green">507</FONT>    <a name="line.507"></a>
+<FONT color="green">508</FONT>            if (tokenCharFound) {<a name="line.508"></a>
+<FONT color="green">509</FONT>                // System.out.println("Token read: " + token);<a name="line.509"></a>
+<FONT color="green">510</FONT>                return token.toString();<a name="line.510"></a>
+<FONT color="green">511</FONT>            } else {<a name="line.511"></a>
+<FONT color="green">512</FONT>                return null;<a name="line.512"></a>
+<FONT color="green">513</FONT>            }<a name="line.513"></a>
+<FONT color="green">514</FONT>        }<a name="line.514"></a>
+<FONT color="green">515</FONT>    <a name="line.515"></a>
+<FONT color="green">516</FONT>        /**<a name="line.516"></a>
+<FONT color="green">517</FONT>         * Compute the probability that "token" is SPAM.<a name="line.517"></a>
+<FONT color="green">518</FONT>         * <a name="line.518"></a>
+<FONT color="green">519</FONT>         * @param token<a name="line.519"></a>
+<FONT color="green">520</FONT>         * @return The probability that the token occurs within spam.<a name="line.520"></a>
+<FONT color="green">521</FONT>         */<a name="line.521"></a>
+<FONT color="green">522</FONT>        private double computeProbability(String token) {<a name="line.522"></a>
+<FONT color="green">523</FONT>            double hamFactor = 0;<a name="line.523"></a>
+<FONT color="green">524</FONT>            double spamFactor = 0;<a name="line.524"></a>
+<FONT color="green">525</FONT>    <a name="line.525"></a>
+<FONT color="green">526</FONT>            boolean foundInHam = false;<a name="line.526"></a>
+<FONT color="green">527</FONT>            boolean foundInSpam = false;<a name="line.527"></a>
+<FONT color="green">528</FONT>    <a name="line.528"></a>
+<FONT color="green">529</FONT>            double minThreshold = 0.01;<a name="line.529"></a>
+<FONT color="green">530</FONT>            double maxThreshold = 0.99;<a name="line.530"></a>
+<FONT color="green">531</FONT>    <a name="line.531"></a>
+<FONT color="green">532</FONT>            if (hamTokenCounts.containsKey(token)) {<a name="line.532"></a>
+<FONT color="green">533</FONT>                foundInHam = true;<a name="line.533"></a>
+<FONT color="green">534</FONT>            }<a name="line.534"></a>
+<FONT color="green">535</FONT>    <a name="line.535"></a>
+<FONT color="green">536</FONT>            if (spamTokenCounts.containsKey(token)) {<a name="line.536"></a>
+<FONT color="green">537</FONT>                foundInSpam = true;<a name="line.537"></a>
+<FONT color="green">538</FONT>            }<a name="line.538"></a>
+<FONT color="green">539</FONT>    <a name="line.539"></a>
+<FONT color="green">540</FONT>            if (foundInHam) {<a name="line.540"></a>
+<FONT color="green">541</FONT>                hamFactor = 2 * ((Integer) hamTokenCounts.get(token)).doubleValue();<a name="line.541"></a>
+<FONT color="green">542</FONT>                if (!foundInSpam) {<a name="line.542"></a>
+<FONT color="green">543</FONT>                    minThreshold = (hamFactor &gt; 20) ? 0.0001 : 0.0002;<a name="line.543"></a>
+<FONT color="green">544</FONT>                }<a name="line.544"></a>
+<FONT color="green">545</FONT>            }<a name="line.545"></a>
+<FONT color="green">546</FONT>    <a name="line.546"></a>
+<FONT color="green">547</FONT>            if (foundInSpam) {<a name="line.547"></a>
+<FONT color="green">548</FONT>                spamFactor = ((Integer) spamTokenCounts.get(token)).doubleValue();<a name="line.548"></a>
+<FONT color="green">549</FONT>                if (!foundInHam) {<a name="line.549"></a>
+<FONT color="green">550</FONT>                    maxThreshold = (spamFactor &gt; 10) ? 0.9999 : 0.9998;<a name="line.550"></a>
+<FONT color="green">551</FONT>                }<a name="line.551"></a>
+<FONT color="green">552</FONT>            }<a name="line.552"></a>
+<FONT color="green">553</FONT>    <a name="line.553"></a>
+<FONT color="green">554</FONT>            if ((hamFactor + spamFactor) &lt; 5) {<a name="line.554"></a>
+<FONT color="green">555</FONT>                // This token hasn't been seen enough.<a name="line.555"></a>
+<FONT color="green">556</FONT>                return 0.4;<a name="line.556"></a>
+<FONT color="green">557</FONT>            }<a name="line.557"></a>
+<FONT color="green">558</FONT>    <a name="line.558"></a>
+<FONT color="green">559</FONT>            double spamFreq = Math.min(1.0, spamFactor / spamMessageCount);<a name="line.559"></a>
+<FONT color="green">560</FONT>            double hamFreq = Math.min(1.0, hamFactor / hamMessageCount);<a name="line.560"></a>
+<FONT color="green">561</FONT>    <a name="line.561"></a>
+<FONT color="green">562</FONT>            return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));<a name="line.562"></a>
+<FONT color="green">563</FONT>        }<a name="line.563"></a>
+<FONT color="green">564</FONT>    <a name="line.564"></a>
+<FONT color="green">565</FONT>        /**<a name="line.565"></a>
+<FONT color="green">566</FONT>         * Returns a SortedSet of TokenProbabilityStrength built from the Corpus and<a name="line.566"></a>
+<FONT color="green">567</FONT>         * the tokens passed in the "tokens" Set. The ordering is from the highest<a name="line.567"></a>
+<FONT color="green">568</FONT>         * strength to the lowest strength.<a name="line.568"></a>
+<FONT color="green">569</FONT>         * <a name="line.569"></a>
+<FONT color="green">570</FONT>         * @param tokens<a name="line.570"></a>
+<FONT color="green">571</FONT>         * @param workCorpus<a name="line.571"></a>
+<FONT color="green">572</FONT>         * @return SortedSet of TokenProbabilityStrength objects.<a name="line.572"></a>
+<FONT color="green">573</FONT>         */<a name="line.573"></a>
+<FONT color="green">574</FONT>        private SortedSet&lt;TokenProbabilityStrength&gt; getTokenProbabilityStrengths(Set&lt;String&gt; tokens, Map&lt;String, Double&gt; workCorpus) {<a name="line.574"></a>
+<FONT color="green">575</FONT>            // Convert to a SortedSet of token probability strengths.<a name="line.575"></a>
+<FONT color="green">576</FONT>            SortedSet&lt;TokenProbabilityStrength&gt; tokenProbabilityStrengths = new TreeSet&lt;TokenProbabilityStrength&gt;();<a name="line.576"></a>
+<FONT color="green">577</FONT>    <a name="line.577"></a>
+<FONT color="green">578</FONT>            Iterator&lt;String&gt; i = tokens.iterator();<a name="line.578"></a>
+<FONT color="green">579</FONT>            while (i.hasNext()) {<a name="line.579"></a>
+<FONT color="green">580</FONT>                TokenProbabilityStrength tps = new TokenProbabilityStrength();<a name="line.580"></a>
+<FONT color="green">581</FONT>    <a name="line.581"></a>
+<FONT color="green">582</FONT>                tps.token = (String) i.next();<a name="line.582"></a>
+<FONT color="green">583</FONT>    <a name="line.583"></a>
+<FONT color="green">584</FONT>                if (workCorpus.containsKey(tps.token)) {<a name="line.584"></a>
+<FONT color="green">585</FONT>                    tps.strength = Math.abs(0.5 - ((Double) workCorpus.get(tps.token)).doubleValue());<a name="line.585"></a>
+<FONT color="green">586</FONT>                } else {<a name="line.586"></a>
+<FONT color="green">587</FONT>                    // This token has never been seen before,<a name="line.587"></a>
+<FONT color="green">588</FONT>                    // so we initially give it the default probability.<a name="line.588"></a>
+<FONT color="green">589</FONT>                    Double corpusProbability = new Double(DEFAULT_TOKEN_PROBABILITY);<a name="line.589"></a>
+<FONT color="green">590</FONT>                    tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);<a name="line.590"></a>
+<FONT color="green">591</FONT>                    boolean isTokenDegeneratedFound = false;<a name="line.591"></a>
+<FONT color="green">592</FONT>    <a name="line.592"></a>
+<FONT color="green">593</FONT>                    Collection&lt;String&gt; degeneratedTokens = buildDegenerated(tps.token);<a name="line.593"></a>
+<FONT color="green">594</FONT>                    Iterator&lt;String&gt; iDegenerated = degeneratedTokens.iterator();<a name="line.594"></a>
+<FONT color="green">595</FONT>                    String tokenDegenerated = null;<a name="line.595"></a>
+<FONT color="green">596</FONT>                    double strengthDegenerated;<a name="line.596"></a>
+<FONT color="green">597</FONT>                    while (iDegenerated.hasNext()) {<a name="line.597"></a>
+<FONT color="green">598</FONT>                        tokenDegenerated = (String) iDegenerated.next();<a name="line.598"></a>
+<FONT color="green">599</FONT>                        if (workCorpus.containsKey(tokenDegenerated)) {<a name="line.599"></a>
+<FONT color="green">600</FONT>                            Double probabilityTemp = (Double) workCorpus.get(tokenDegenerated);<a name="line.600"></a>
+<FONT color="green">601</FONT>                            strengthDegenerated = Math.abs(0.5 - probabilityTemp.doubleValue());<a name="line.601"></a>
+<FONT color="green">602</FONT>                            if (strengthDegenerated &gt; tps.strength) {<a name="line.602"></a>
+<FONT color="green">603</FONT>                                isTokenDegeneratedFound = true;<a name="line.603"></a>
+<FONT color="green">604</FONT>                                tps.strength = strengthDegenerated;<a name="line.604"></a>
+<FONT color="green">605</FONT>                                corpusProbability = probabilityTemp;<a name="line.605"></a>
+<FONT color="green">606</FONT>                            }<a name="line.606"></a>
+<FONT color="green">607</FONT>                        }<a name="line.607"></a>
+<FONT color="green">608</FONT>                    }<a name="line.608"></a>
+<FONT color="green">609</FONT>                    // to reduce memory usage, put in the corpus only if the<a name="line.609"></a>
+<FONT color="green">610</FONT>                    // probability is different from (stronger than) the default<a name="line.610"></a>
+<FONT color="green">611</FONT>                    if (isTokenDegeneratedFound) {<a name="line.611"></a>
+<FONT color="green">612</FONT>                        synchronized (workCorpus) {<a name="line.612"></a>
+<FONT color="green">613</FONT>                            workCorpus.put(tps.token, corpusProbability);<a name="line.613"></a>
+<FONT color="green">614</FONT>                        }<a name="line.614"></a>
+<FONT color="green">615</FONT>                    }<a name="line.615"></a>
+<FONT color="green">616</FONT>                }<a name="line.616"></a>
+<FONT color="green">617</FONT>    <a name="line.617"></a>
+<FONT color="green">618</FONT>                tokenProbabilityStrengths.add(tps);<a name="line.618"></a>
+<FONT color="green">619</FONT>            }<a name="line.619"></a>
+<FONT color="green">620</FONT>    <a name="line.620"></a>
+<FONT color="green">621</FONT>            return tokenProbabilityStrengths;<a name="line.621"></a>
+<FONT color="green">622</FONT>        }<a name="line.622"></a>
+<FONT color="green">623</FONT>    <a name="line.623"></a>
+<FONT color="green">624</FONT>        private Collection&lt;String&gt; buildDegenerated(String fullToken) {<a name="line.624"></a>
+<FONT color="green">625</FONT>            ArrayList&lt;String&gt; tokens = new ArrayList&lt;String&gt;();<a name="line.625"></a>
+<FONT color="green">626</FONT>            String header;<a name="line.626"></a>
+<FONT color="green">627</FONT>            String token;<a name="line.627"></a>
+<FONT color="green">628</FONT>            String tokenLower;<a name="line.628"></a>
+<FONT color="green">629</FONT>    <a name="line.629"></a>
+<FONT color="green">630</FONT>            // look for a header string termination<a name="line.630"></a>
+<FONT color="green">631</FONT>            int headerEnd = fullToken.indexOf(':');<a name="line.631"></a>
+<FONT color="green">632</FONT>            if (headerEnd &gt;= 0) {<a name="line.632"></a>
+<FONT color="green">633</FONT>                header = fullToken.substring(0, headerEnd);<a name="line.633"></a>
+<FONT color="green">634</FONT>                token = fullToken.substring(headerEnd);<a name="line.634"></a>
+<FONT color="green">635</FONT>            } else {<a name="line.635"></a>
+<FONT color="green">636</FONT>                header = "";<a name="line.636"></a>
+<FONT color="green">637</FONT>                token = fullToken;<a name="line.637"></a>
+<FONT color="green">638</FONT>            }<a name="line.638"></a>
+<FONT color="green">639</FONT>    <a name="line.639"></a>
+<FONT color="green">640</FONT>            // prepare an all-lower-case version of the token (for<a name="line.640"></a>
+<FONT color="green">641</FONT>            // performance reasons)<a name="line.641"></a>
+<FONT color="green">642</FONT>            tokenLower = token.toLowerCase();<a name="line.642"></a>
+<FONT color="green">643</FONT>    <a name="line.643"></a>
+<FONT color="green">644</FONT>            int end = token.length();<a name="line.644"></a>
+<FONT color="green">645</FONT>            do {<a name="line.645"></a>
+<FONT color="green">646</FONT>                if (!token.substring(0, end).equals(tokenLower.substring(0, end))) {<a name="line.646"></a>
+<FONT color="green">647</FONT>                    tokens.add(header + tokenLower.substring(0, end));<a name="line.647"></a>
+<FONT color="green">648</FONT>                    if (header.length() &gt; 0) {<a name="line.648"></a>
+<FONT color="green">649</FONT>                        tokens.add(tokenLower.substring(0, end));<a name="line.649"></a>
+<FONT color="green">650</FONT>                    }<a name="line.650"></a>
+<FONT color="green">651</FONT>                }<a name="line.651"></a>
+<FONT color="green">652</FONT>                if (end &gt; 1 &amp;&amp; token.charAt(0) &gt;= 'A' &amp;&amp; token.charAt(0) &lt;= 'Z') {<a name="line.652"></a>
+<FONT color="green">653</FONT>                    tokens.add(header + token.charAt(0) + tokenLower.substring(1, end));<a name="line.653"></a>
+<FONT color="green">654</FONT>                    if (header.length() &gt; 0) {<a name="line.654"></a>
+<FONT color="green">655</FONT>                        tokens.add(token.charAt(0) + tokenLower.substring(1, end));<a name="line.655"></a>
+<FONT color="green">656</FONT>                    }<a name="line.656"></a>
+<FONT color="green">657</FONT>                }<a name="line.657"></a>
+<FONT color="green">658</FONT>    <a name="line.658"></a>
+<FONT color="green">659</FONT>                if (token.charAt(end - 1) != '!') {<a name="line.659"></a>
+<FONT color="green">660</FONT>                    break;<a name="line.660"></a>
+<FONT color="green">661</FONT>                }<a name="line.661"></a>
+<FONT color="green">662</FONT>    <a name="line.662"></a>
+<FONT color="green">663</FONT>                end--;<a name="line.663"></a>
+<FONT color="green">664</FONT>    <a name="line.664"></a>
+<FONT color="green">665</FONT>                tokens.add(header + token.substring(0, end));<a name="line.665"></a>
+<FONT color="green">666</FONT>                if (header.length() &gt; 0) {<a name="line.666"></a>
+<FONT color="green">667</FONT>                    tokens.add(token.substring(0, end));<a name="line.667"></a>
+<FONT color="green">668</FONT>                }<a name="line.668"></a>
+<FONT color="green">669</FONT>            } while (end &gt; 0);<a name="line.669"></a>
+<FONT color="green">670</FONT>    <a name="line.670"></a>
+<FONT color="green">671</FONT>            return tokens;<a name="line.671"></a>
+<FONT color="green">672</FONT>        }<a name="line.672"></a>
+<FONT color="green">673</FONT>    <a name="line.673"></a>
+<FONT color="green">674</FONT>        /**<a name="line.674"></a>
+<FONT color="green">675</FONT>         * Compute the spamminess probability of the interesting tokens in the<a name="line.675"></a>
+<FONT color="green">676</FONT>         * tokenProbabilityStrengths SortedSet.<a name="line.676"></a>
+<FONT color="green">677</FONT>         * <a name="line.677"></a>
+<FONT color="green">678</FONT>         * @param tokenProbabilityStrengths<a name="line.678"></a>
+<FONT color="green">679</FONT>         * @param workCorpus<a name="line.679"></a>
+<FONT color="green">680</FONT>         * @return Computed spamminess.<a name="line.680"></a>
+<FONT color="green">681</FONT>         */<a name="line.681"></a>
+<FONT color="green">682</FONT>        private double computeOverallProbability(SortedSet&lt;TokenProbabilityStrength&gt; tokenProbabilityStrengths, Map&lt;String, Double&gt; workCorpus) {<a name="line.682"></a>
+<FONT color="green">683</FONT>            double p = 1.0;<a name="line.683"></a>
+<FONT color="green">684</FONT>            double np = 1.0;<a name="line.684"></a>
+<FONT color="green">685</FONT>            double tempStrength = 0.5;<a name="line.685"></a>
+<FONT color="green">686</FONT>            int count = MAX_INTERESTING_TOKENS;<a name="line.686"></a>
+<FONT color="green">687</FONT>            Iterator&lt;TokenProbabilityStrength&gt; iterator = tokenProbabilityStrengths.iterator();<a name="line.687"></a>
+<FONT color="green">688</FONT>            while ((iterator.hasNext()) &amp;&amp; (count-- &gt; 0 || tempStrength &gt;= INTERESTINGNESS_THRESHOLD)) {<a name="line.688"></a>
+<FONT color="green">689</FONT>                TokenProbabilityStrength tps = iterator.next();<a name="line.689"></a>
+<FONT color="green">690</FONT>                tempStrength = tps.strength;<a name="line.690"></a>
+<FONT color="green">691</FONT>    <a name="line.691"></a>
+<FONT color="green">692</FONT>                // System.out.println(tps);<a name="line.692"></a>
+<FONT color="green">693</FONT>    <a name="line.693"></a>
+<FONT color="green">694</FONT>                double theDoubleValue = DEFAULT_TOKEN_PROBABILITY; // initialize it<a name="line.694"></a>
+<FONT color="green">695</FONT>                                                                   // to the default<a name="line.695"></a>
+<FONT color="green">696</FONT>                Double theDoubleObject = (Double) workCorpus.get(tps.token);<a name="line.696"></a>
+<FONT color="green">697</FONT>                // if either the original token or a degeneration was found use the<a name="line.697"></a>
+<FONT color="green">698</FONT>                // double value, otherwise use the default<a name="line.698"></a>
+<FONT color="green">699</FONT>                if (theDoubleObject != null) {<a name="line.699"></a>
+<FONT color="green">700</FONT>                    theDoubleValue = theDoubleObject.doubleValue();<a name="line.700"></a>
+<FONT color="green">701</FONT>                }<a name="line.701"></a>
+<FONT color="green">702</FONT>                p *= theDoubleValue;<a name="line.702"></a>
+<FONT color="green">703</FONT>                np *= (1.0 - theDoubleValue);<a name="line.703"></a>
+<FONT color="green">704</FONT>                // System.out.println("Token " + tps + ", p=" + theDoubleValue +<a name="line.704"></a>
+<FONT color="green">705</FONT>                // ", overall p=" + p / (p + np));<a name="line.705"></a>
+<FONT color="green">706</FONT>            }<a name="line.706"></a>
+<FONT color="green">707</FONT>    <a name="line.707"></a>
+<FONT color="green">708</FONT>            return (p / (p + np));<a name="line.708"></a>
+<FONT color="green">709</FONT>        }<a name="line.709"></a>
+<FONT color="green">710</FONT>        /** Returns true when every character of s satisfies Character.isDigit; an empty string yields true because the loop never runs. */<a name="line.710"></a>
+<FONT color="green">711</FONT>        private boolean allDigits(String s) {<a name="line.711"></a>
+<FONT color="green">712</FONT>            for (int i = 0; i &lt; s.length(); i++) {<a name="line.712"></a>
+<FONT color="green">713</FONT>                if (!Character.isDigit(s.charAt(i))) {<a name="line.713"></a>
+<FONT color="green">714</FONT>                    return false;<a name="line.714"></a>
+<FONT color="green">715</FONT>                }<a name="line.715"></a>
+<FONT color="green">716</FONT>            }<a name="line.716"></a>
+<FONT color="green">717</FONT>            return true;<a name="line.717"></a>
+<FONT color="green">718</FONT>        }<a name="line.718"></a>
+<FONT color="green">719</FONT>    }<a name="line.719"></a>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+</PRE>
+</BODY>
+</HTML>

Propchange: james/site/trunk/www/mailet/ai/apidocs/src-html/org/apache/james/ai/classic/BayesianAnalyzer.html
------------------------------------------------------------------------------
    svn:eol-style = native