<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="ru">
		<id>http://www.jexp.ru/index.php?action=history&amp;feed=atom&amp;title=Java%2FEmail%2FSpam</id>
		<title>Java/Email/Spam - История изменений</title>
		<link rel="self" type="application/atom+xml" href="http://www.jexp.ru/index.php?action=history&amp;feed=atom&amp;title=Java%2FEmail%2FSpam"/>
		<link rel="alternate" type="text/html" href="http://www.jexp.ru/index.php?title=Java/Email/Spam&amp;action=history"/>
		<updated>2026-04-21T20:27:05Z</updated>
		<subtitle>История изменений этой страницы в вики</subtitle>
		<generator>MediaWiki 1.30.0</generator>

	<entry>
		<id>http://www.jexp.ru/index.php?title=Java/Email/Spam&amp;diff=6673&amp;oldid=prev</id>
		<title>Admin: 1 версия</title>
		<link rel="alternate" type="text/html" href="http://www.jexp.ru/index.php?title=Java/Email/Spam&amp;diff=6673&amp;oldid=prev"/>
				<updated>2010-06-01T06:17:10Z</updated>
		
		<summary type="html">&lt;p&gt;1 версия&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;tr style=&quot;vertical-align: top;&quot; lang=&quot;ru&quot;&gt;
				&lt;td colspan=&quot;1&quot; style=&quot;background-color: white; color:black; text-align: center;&quot;&gt;← Предыдущая&lt;/td&gt;
				&lt;td colspan=&quot;1&quot; style=&quot;background-color: white; color:black; text-align: center;&quot;&gt;Версия 06:17, 1 июня 2010&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; style=&quot;text-align: center;&quot; lang=&quot;ru&quot;&gt;&lt;div class=&quot;mw-diff-empty&quot;&gt;(нет различий)&lt;/div&gt;
&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;</summary>
		<author><name>Admin</name></author>	</entry>

	<entry>
		<id>http://www.jexp.ru/index.php?title=Java/Email/Spam&amp;diff=6672&amp;oldid=prev</id>
		<title> в 18:01, 31 мая 2010</title>
		<link rel="alternate" type="text/html" href="http://www.jexp.ru/index.php?title=Java/Email/Spam&amp;diff=6672&amp;oldid=prev"/>
				<updated>2010-05-31T18:01:44Z</updated>
		
		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;p&gt;&lt;b&gt;Новая страница&lt;/b&gt;&lt;/p&gt;&lt;div&gt;== Determines probability that text contains Spam ==&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
   &lt;br /&gt;
  &amp;lt;!-- start source code --&amp;gt;&lt;br /&gt;
   &lt;br /&gt;
    &amp;lt;source lang=&amp;quot;java&amp;quot;&amp;gt;&lt;br /&gt;
   &lt;br /&gt;
/****************************************************************&lt;br /&gt;
 * Licensed to the Apache Software Foundation (ASF) under one   *&lt;br /&gt;
 * or more contributor license agreements.  See the NOTICE file *&lt;br /&gt;
 * distributed with this work for additional information        *&lt;br /&gt;
 * regarding copyright ownership.  The ASF licenses this file   *&lt;br /&gt;
 * to you under the Apache License, Version 2.0 (the            *&lt;br /&gt;
 * &amp;quot;License&amp;quot;); you may not use this file except in compliance   *&lt;br /&gt;
 * with the License.  You may obtain a copy of the License at   *&lt;br /&gt;
 *                                                              *&lt;br /&gt;
 *   http://www.apache.org/licenses/LICENSE-2.0                 *&lt;br /&gt;
 *                                                              *&lt;br /&gt;
 * Unless required by applicable law or agreed to in writing,   *&lt;br /&gt;
 * software distributed under the License is distributed on an  *&lt;br /&gt;
 * &amp;quot;AS IS&amp;quot; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *&lt;br /&gt;
 * KIND, either express or implied.  See the License for the    *&lt;br /&gt;
 * specific language governing permissions and limitations      *&lt;br /&gt;
 * under the License.                                           *&lt;br /&gt;
 ****************************************************************/&lt;br /&gt;
// Revised from apache james&lt;br /&gt;
import java.util.Map;&lt;br /&gt;
import java.util.Set;&lt;br /&gt;
import java.util.SortedSet;&lt;br /&gt;
import java.util.TreeSet;&lt;br /&gt;
import java.util.HashMap;&lt;br /&gt;
import java.util.HashSet;&lt;br /&gt;
import java.util.Iterator;&lt;br /&gt;
import java.util.Collection;&lt;br /&gt;
import java.util.ArrayList;&lt;br /&gt;
import java.io.Reader;&lt;br /&gt;
import java.io.StreamTokenizer;&lt;br /&gt;
import java.io.StringReader;&lt;br /&gt;
/**&lt;br /&gt;
 * &amp;lt;P&amp;gt;Determines probability that text contains Spam.&amp;lt;/P&amp;gt;&lt;br /&gt;
 *&lt;br /&gt;
 * &amp;lt;P&amp;gt;Based upon Paul Graham's essay &amp;quot;A Plan for Spam&amp;quot;.&amp;lt;/P&amp;gt;&lt;br /&gt;
 *&lt;br /&gt;
 * &amp;lt;P&amp;gt;Sample method usage:&amp;lt;/P&amp;gt;&lt;br /&gt;
 *&lt;br /&gt;
 * &amp;lt;P&amp;gt;Use:&lt;br /&gt;
 *   void addHam(Reader)&lt;br /&gt;
 *   and&lt;br /&gt;
 *   void addSpam(Reader)&lt;br /&gt;
 *&lt;br /&gt;
 *   methods to build up the Maps of ham &amp;amp; spam tokens/occurrences.&lt;br /&gt;
 *   Both addHam and addSpam assume they're reading one message at a time,&lt;br /&gt;
 *   if you feed more than one message per call, be sure to adjust the&lt;br /&gt;
 *   appropriate message counter:  hamMessageCount or spamMessageCount.&lt;br /&gt;
 *&lt;br /&gt;
 *   Then...&amp;lt;/P&amp;gt;&lt;br /&gt;
 *&lt;br /&gt;
 * &amp;lt;P&amp;gt;Use:&lt;br /&gt;
 *   void buildCorpus()&lt;br /&gt;
 *&lt;br /&gt;
 *   to build the final token/probabilities Map.&lt;br /&gt;
 *&lt;br /&gt;
 *   Use your own methods for persistent storage of either the individual&lt;br /&gt;
 *   ham/spam corpus &amp;amp; message counts, and/or the final corpus.&lt;br /&gt;
 *&lt;br /&gt;
 *   Then you can...&amp;lt;/P&amp;gt;&lt;br /&gt;
 *&lt;br /&gt;
 * &amp;lt;P&amp;gt;Use:&lt;br /&gt;
 *   double computeSpamProbability(Reader)&lt;br /&gt;
 *&lt;br /&gt;
 *   to determine the probability that a particular text contains spam.&lt;br /&gt;
 *   A returned result of 0.9 or above is an indicator that the text was&lt;br /&gt;
 *   spam.&amp;lt;/P&amp;gt;&lt;br /&gt;
 *&lt;br /&gt;
 * &amp;lt;P&amp;gt;If you use persistent storage, use:&lt;br /&gt;
 *   void setCorpus(Map)&lt;br /&gt;
 *&lt;br /&gt;
 * before calling computeSpamProbability.&amp;lt;/P&amp;gt;&lt;br /&gt;
 *&lt;br /&gt;
 * @version CVS $Revision: $ $Date: $&lt;br /&gt;
 * @since 2.3.0&lt;br /&gt;
 */&lt;br /&gt;
public class BayesianAnalyzer {&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Number of &amp;quot;interesting&amp;quot; tokens to use to compute overall&lt;br /&gt;
     * spamminess probability.&lt;br /&gt;
     */&lt;br /&gt;
    private final static int MAX_INTERESTING_TOKENS = 15;&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Minimum probability distance from 0.5 to consider a token &amp;quot;interesting&amp;quot; to use to compute overall&lt;br /&gt;
     * spamminess probability.&lt;br /&gt;
     */&lt;br /&gt;
    private final static double INTERESTINGNESS_THRESHOLD = 0.46;&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Default token probability to use when a token has not been&lt;br /&gt;
     * encountered before.&lt;br /&gt;
     */&lt;br /&gt;
    private final static double DEFAULT_TOKEN_PROBABILITY = 0.4;&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Map of ham tokens and their occurrences.&lt;br /&gt;
     *&lt;br /&gt;
     * String key&lt;br /&gt;
     * Integer value&lt;br /&gt;
     */&lt;br /&gt;
    private Map hamTokenCounts = new HashMap();&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Map of spam tokens and their occurrences.&lt;br /&gt;
     *&lt;br /&gt;
     * String key&lt;br /&gt;
     * Integer value&lt;br /&gt;
     */&lt;br /&gt;
    private Map spamTokenCounts = new HashMap();&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Number of ham messages analyzed.&lt;br /&gt;
     */&lt;br /&gt;
    private int hamMessageCount = 0;&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Number of spam messages analyzed.&lt;br /&gt;
     */&lt;br /&gt;
    private int spamMessageCount = 0;&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Final token/probability corpus.&lt;br /&gt;
     *&lt;br /&gt;
     * String key&lt;br /&gt;
     * Double value&lt;br /&gt;
     */&lt;br /&gt;
    private Map corpus = new HashMap();&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Inner class for managing Token Probability Strengths during the&lt;br /&gt;
     * computeSpamProbability phase.&lt;br /&gt;
     *&lt;br /&gt;
     * By probability &amp;lt;i&amp;gt;strength&amp;lt;/i&amp;gt; we mean the absolute distance of a&lt;br /&gt;
     * probability from the middle value 0.5.&lt;br /&gt;
     *&lt;br /&gt;
     * It implements Comparable so that its sorting is automatic.&lt;br /&gt;
     */&lt;br /&gt;
    private class TokenProbabilityStrength&lt;br /&gt;
    implements Comparable {&lt;br /&gt;
        /**&lt;br /&gt;
         * Message token.&lt;br /&gt;
         */&lt;br /&gt;
        String token = null;&lt;br /&gt;
        &lt;br /&gt;
        /**&lt;br /&gt;
         * Token's computed probability strength.&lt;br /&gt;
         */&lt;br /&gt;
        double strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);&lt;br /&gt;
        &lt;br /&gt;
        /**&lt;br /&gt;
         * Force the natural sort order for this object to be high-to-low.&lt;br /&gt;
         *&lt;br /&gt;
         * @param anotherTokenProbabilityStrength A TokenProbabilityStrength instance to compare&lt;br /&gt;
         *                                this instance with.&lt;br /&gt;
         *&lt;br /&gt;
         * @return The result of the comparison (before, equal, after).&lt;br /&gt;
         */&lt;br /&gt;
        public final int compareTo(Object anotherTokenProbabilityStrength) {&lt;br /&gt;
            int result = (int) ((((TokenProbabilityStrength) anotherTokenProbabilityStrength).strength - strength) * 1000000);&lt;br /&gt;
            if (result == 0) {&lt;br /&gt;
                return this.token.compareTo(((TokenProbabilityStrength) anotherTokenProbabilityStrength).token);&lt;br /&gt;
            } else {&lt;br /&gt;
                return result;&lt;br /&gt;
            }&lt;br /&gt;
        }&lt;br /&gt;
        &lt;br /&gt;
        /**&lt;br /&gt;
         * Simple toString () implementation mostly for debugging purposes.&lt;br /&gt;
         *&lt;br /&gt;
         * @return String representation of this object.&lt;br /&gt;
         */&lt;br /&gt;
        public String toString() {&lt;br /&gt;
            StringBuffer sb = new StringBuffer(30);&lt;br /&gt;
            &lt;br /&gt;
            sb.append(token)&lt;br /&gt;
            .append(&amp;quot;=&amp;quot;)&lt;br /&gt;
            .append(strength);&lt;br /&gt;
            &lt;br /&gt;
            return sb.toString();&lt;br /&gt;
        }&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Basic class constructor.&lt;br /&gt;
     */&lt;br /&gt;
    public BayesianAnalyzer() {&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Public setter for the hamTokenCounts Map.&lt;br /&gt;
     *&lt;br /&gt;
     * @param hamTokenCounts The new ham Token counts Map.&lt;br /&gt;
     */&lt;br /&gt;
    public void setHamTokenCounts(Map hamTokenCounts) {&lt;br /&gt;
        this.hamTokenCounts = hamTokenCounts;&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Public getter for the hamTokenCounts Map.&lt;br /&gt;
     */&lt;br /&gt;
    public Map getHamTokenCounts() {&lt;br /&gt;
        return this.hamTokenCounts;&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Public setter for the spamTokenCounts Map.&lt;br /&gt;
     *&lt;br /&gt;
     * @param spamTokenCounts The new spam Token counts Map.&lt;br /&gt;
     */&lt;br /&gt;
    public void setSpamTokenCounts(Map spamTokenCounts) {&lt;br /&gt;
        this.spamTokenCounts = spamTokenCounts;&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Public getter for the spamTokenCounts Map.&lt;br /&gt;
     */&lt;br /&gt;
    public Map getSpamTokenCounts() {&lt;br /&gt;
        return this.spamTokenCounts;&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Public setter for spamMessageCount.&lt;br /&gt;
     *&lt;br /&gt;
     * @param spamMessageCount The new spam message count.&lt;br /&gt;
     */&lt;br /&gt;
    public void setSpamMessageCount(int spamMessageCount) {&lt;br /&gt;
        this.spamMessageCount = spamMessageCount;&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Public getter for spamMessageCount.&lt;br /&gt;
     */&lt;br /&gt;
    public int getSpamMessageCount() {&lt;br /&gt;
        return this.spamMessageCount;&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Public setter for hamMessageCount.&lt;br /&gt;
     *&lt;br /&gt;
     * @param hamMessageCount The new ham message count.&lt;br /&gt;
     */&lt;br /&gt;
    public void setHamMessageCount(int hamMessageCount) {&lt;br /&gt;
        this.hamMessageCount = hamMessageCount;&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Public getter for hamMessageCount.&lt;br /&gt;
     */&lt;br /&gt;
    public int getHamMessageCount() {&lt;br /&gt;
        return this.hamMessageCount;&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Clears all analysis repositories and counters.&lt;br /&gt;
     */&lt;br /&gt;
    public void clear() {&lt;br /&gt;
        corpus.clear();&lt;br /&gt;
        &lt;br /&gt;
        tokenCountsClear();&lt;br /&gt;
        &lt;br /&gt;
        hamMessageCount = 0;&lt;br /&gt;
        spamMessageCount = 0;&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Clears token counters.&lt;br /&gt;
     */&lt;br /&gt;
    public void tokenCountsClear() {&lt;br /&gt;
        hamTokenCounts.clear();&lt;br /&gt;
        spamTokenCounts.clear();&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Public setter for corpus.&lt;br /&gt;
     *&lt;br /&gt;
     * @param corpus The new corpus.&lt;br /&gt;
     */&lt;br /&gt;
    public void setCorpus(Map corpus) {&lt;br /&gt;
        this.corpus = corpus;&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Public getter for corpus.&lt;br /&gt;
     */&lt;br /&gt;
    public Map getCorpus() {&lt;br /&gt;
        return this.corpus;&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Builds the corpus from the existing ham &amp;amp; spam counts.&lt;br /&gt;
     */&lt;br /&gt;
    public void buildCorpus() {&lt;br /&gt;
        //Combine the known ham &amp;amp; spam tokens.&lt;br /&gt;
        Set set = new HashSet(hamTokenCounts.size() + spamTokenCounts.size());&lt;br /&gt;
        set.addAll(hamTokenCounts.keySet());&lt;br /&gt;
        set.addAll(spamTokenCounts.keySet());&lt;br /&gt;
        Map tempCorpus = new HashMap(set.size());&lt;br /&gt;
        &lt;br /&gt;
        //Iterate through all the tokens and compute their new&lt;br /&gt;
        //individual probabilities.&lt;br /&gt;
        Iterator i = set.iterator();&lt;br /&gt;
        while (i.hasNext()) {&lt;br /&gt;
            String token = (String) i.next();&lt;br /&gt;
            tempCorpus.put(token, new Double(computeProbability(token)));&lt;br /&gt;
        }&lt;br /&gt;
        setCorpus(tempCorpus);&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Adds a message to the ham list.&lt;br /&gt;
     * @param stream A reader stream on the ham message to analyze&lt;br /&gt;
     * @throws IOException If any error occurs&lt;br /&gt;
     */&lt;br /&gt;
    public void addHam(Reader stream)&lt;br /&gt;
    throws java.io.IOException {&lt;br /&gt;
        addTokenOccurrences(stream, hamTokenCounts);&lt;br /&gt;
        hamMessageCount++;&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Adds a message to the spam list.&lt;br /&gt;
     * @param stream A reader stream on the spam message to analyze&lt;br /&gt;
     * @throws IOException If any error occurs&lt;br /&gt;
     */&lt;br /&gt;
    public void addSpam(Reader stream)&lt;br /&gt;
    throws java.io.IOException {&lt;br /&gt;
        addTokenOccurrences(stream, spamTokenCounts);&lt;br /&gt;
        spamMessageCount++;&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Computes the probability that the stream contains SPAM.&lt;br /&gt;
     * @param stream The text to be analyzed for Spamminess.&lt;br /&gt;
     * @return A 0.0 - 1.0 probability&lt;br /&gt;
     * @throws IOException If any error occurs&lt;br /&gt;
     */&lt;br /&gt;
    public double computeSpamProbability(Reader stream)&lt;br /&gt;
    throws java.io.IOException {&lt;br /&gt;
        //Build a set of the tokens in the Stream.&lt;br /&gt;
        Set tokens = parse(stream);&lt;br /&gt;
        &lt;br /&gt;
        // Get the corpus to use in this run&lt;br /&gt;
        // A new corpus may be being built in the meantime&lt;br /&gt;
        Map workCorpus = getCorpus();&lt;br /&gt;
        &lt;br /&gt;
        //Assign their probabilities from the Corpus (using an additional&lt;br /&gt;
        //calculation to determine spamminess).&lt;br /&gt;
        SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);&lt;br /&gt;
        &lt;br /&gt;
        //Compute and return the overall probability that the&lt;br /&gt;
        //stream is SPAM.&lt;br /&gt;
        return computeOverallProbability(tokenProbabilityStrengths, workCorpus);&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Parses a stream into tokens, and updates the target Map&lt;br /&gt;
     * with the token/counts.&lt;br /&gt;
     *&lt;br /&gt;
     * @param stream&lt;br /&gt;
     * @param target&lt;br /&gt;
     */&lt;br /&gt;
    private void addTokenOccurrences(Reader stream, Map target)&lt;br /&gt;
    throws java.io.IOException {&lt;br /&gt;
        String token;&lt;br /&gt;
        String header = &amp;quot;&amp;quot;;&lt;br /&gt;
        &lt;br /&gt;
        //Update target with the tokens/count encountered.&lt;br /&gt;
        while ((token = nextToken(stream)) != null) {&lt;br /&gt;
            boolean endingLine = false;&lt;br /&gt;
            if (token.length() &amp;gt; 0 &amp;amp;&amp;amp; token.charAt(token.length() - 1) == '\n') {&lt;br /&gt;
                endingLine = true;&lt;br /&gt;
                token = token.substring(0, token.length() - 1);&lt;br /&gt;
            }&lt;br /&gt;
            &lt;br /&gt;
            if (token.length() &amp;gt; 0 &amp;amp;&amp;amp; header.length() + token.length() &amp;lt; 90 &amp;amp;&amp;amp; !allDigits(token)) {&lt;br /&gt;
                if (token.equals(&amp;quot;From:&amp;quot;)&lt;br /&gt;
                || token.equals(&amp;quot;Return-Path:&amp;quot;)&lt;br /&gt;
                || token.equals(&amp;quot;Subject:&amp;quot;)&lt;br /&gt;
                || token.equals(&amp;quot;To:&amp;quot;)&lt;br /&gt;
                ) {&lt;br /&gt;
                    header = token;&lt;br /&gt;
                    if (!endingLine) {&lt;br /&gt;
                        continue;&lt;br /&gt;
                    }&lt;br /&gt;
                }&lt;br /&gt;
                &lt;br /&gt;
                token = header + token;&lt;br /&gt;
                &lt;br /&gt;
                Integer value = null;&lt;br /&gt;
                &lt;br /&gt;
                if (target.containsKey(token)) {&lt;br /&gt;
                    value = new Integer(((Integer) target.get(token)).intValue() + 1);&lt;br /&gt;
                } else {&lt;br /&gt;
                    value = new Integer(1);&lt;br /&gt;
                }&lt;br /&gt;
                &lt;br /&gt;
                target.put(token, value);&lt;br /&gt;
            }&lt;br /&gt;
            &lt;br /&gt;
            if (endingLine) {&lt;br /&gt;
                header = &amp;quot;&amp;quot;;&lt;br /&gt;
            }&lt;br /&gt;
        }&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Parses a stream into tokens, and returns a Set of&lt;br /&gt;
     * the unique tokens encountered.&lt;br /&gt;
     *&lt;br /&gt;
     * @param stream&lt;br /&gt;
     * @return Set&lt;br /&gt;
     */&lt;br /&gt;
    private Set parse(Reader stream)&lt;br /&gt;
    throws java.io.IOException {&lt;br /&gt;
        Set tokens = new HashSet();&lt;br /&gt;
        String token;&lt;br /&gt;
        String header = &amp;quot;&amp;quot;;&lt;br /&gt;
        &lt;br /&gt;
        //Build a Map of tokens encountered.&lt;br /&gt;
        while ((token = nextToken(stream)) != null) {&lt;br /&gt;
            boolean endingLine = false;&lt;br /&gt;
            if (token.length() &amp;gt; 0 &amp;amp;&amp;amp; token.charAt(token.length() - 1) == '\n') {&lt;br /&gt;
                endingLine = true;&lt;br /&gt;
                token = token.substring(0, token.length() - 1);&lt;br /&gt;
            }&lt;br /&gt;
            &lt;br /&gt;
            if (token.length() &amp;gt; 0 &amp;amp;&amp;amp; header.length() + token.length() &amp;lt; 90 &amp;amp;&amp;amp; !allDigits(token)) {&lt;br /&gt;
                if (token.equals(&amp;quot;From:&amp;quot;)&lt;br /&gt;
                || token.equals(&amp;quot;Return-Path:&amp;quot;)&lt;br /&gt;
                || token.equals(&amp;quot;Subject:&amp;quot;)&lt;br /&gt;
                || token.equals(&amp;quot;To:&amp;quot;)&lt;br /&gt;
                ) {&lt;br /&gt;
                    header = token;&lt;br /&gt;
                    if (!endingLine) {&lt;br /&gt;
                        continue;&lt;br /&gt;
                    }&lt;br /&gt;
                }&lt;br /&gt;
                &lt;br /&gt;
                token = header + token;&lt;br /&gt;
                &lt;br /&gt;
                tokens.add(token);&lt;br /&gt;
            }&lt;br /&gt;
            &lt;br /&gt;
            if (endingLine) {&lt;br /&gt;
                header = &amp;quot;&amp;quot;;&lt;br /&gt;
            }&lt;br /&gt;
        }&lt;br /&gt;
        &lt;br /&gt;
        //Return the unique set of tokens encountered.&lt;br /&gt;
        return tokens;&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    private String nextToken(Reader reader) throws java.io.IOException {&lt;br /&gt;
        StringBuffer token = new StringBuffer();&lt;br /&gt;
        int i;&lt;br /&gt;
        char ch, ch2;&lt;br /&gt;
        boolean previousWasDigit = false;&lt;br /&gt;
        boolean tokenCharFound = false;&lt;br /&gt;
        &lt;br /&gt;
        if (!reader.ready()) {&lt;br /&gt;
            return null;&lt;br /&gt;
        }&lt;br /&gt;
        &lt;br /&gt;
        while ((i = reader.read()) != -1) {&lt;br /&gt;
            &lt;br /&gt;
            ch = (char) i;&lt;br /&gt;
            &lt;br /&gt;
            if (ch == ':') {&lt;br /&gt;
                String tokenString = token.toString() + &amp;quot;:&amp;quot;;&lt;br /&gt;
                if (tokenString.equals(&amp;quot;From:&amp;quot;)&lt;br /&gt;
                || tokenString.equals(&amp;quot;Return-Path:&amp;quot;)&lt;br /&gt;
                || tokenString.equals(&amp;quot;Subject:&amp;quot;)&lt;br /&gt;
                || tokenString.equals(&amp;quot;To:&amp;quot;)&lt;br /&gt;
                ) {&lt;br /&gt;
                    return tokenString;&lt;br /&gt;
                }&lt;br /&gt;
            }&lt;br /&gt;
            &lt;br /&gt;
            if (Character.isLetter(ch)&lt;br /&gt;
            || ch == '-'&lt;br /&gt;
            || ch == '$'&lt;br /&gt;
            || ch == '\u20AC' // the EURO symbol&lt;br /&gt;
            || ch == '!'&lt;br /&gt;
            || ch == '\''&lt;br /&gt;
            ) {&lt;br /&gt;
                tokenCharFound = true;&lt;br /&gt;
                previousWasDigit = false;&lt;br /&gt;
                token.append(ch);&lt;br /&gt;
            } else if (Character.isDigit(ch)) {&lt;br /&gt;
                tokenCharFound = true;&lt;br /&gt;
                previousWasDigit = true;&lt;br /&gt;
                token.append(ch);&lt;br /&gt;
            } else if (previousWasDigit &amp;amp;&amp;amp; (ch == '.' || ch == ',')) {&lt;br /&gt;
                reader.mark(1);&lt;br /&gt;
                previousWasDigit = false;&lt;br /&gt;
                i = reader.read();&lt;br /&gt;
                if (i == -1) {&lt;br /&gt;
                    break;&lt;br /&gt;
                }&lt;br /&gt;
                ch2 = (char) i;&lt;br /&gt;
                if (Character.isDigit(ch2)) {&lt;br /&gt;
                    tokenCharFound = true;&lt;br /&gt;
                    previousWasDigit = true;&lt;br /&gt;
                    token.append(ch);&lt;br /&gt;
                    token.append(ch2);&lt;br /&gt;
                } else {&lt;br /&gt;
                    reader.reset();&lt;br /&gt;
                    break;&lt;br /&gt;
                }&lt;br /&gt;
            } else if (ch == '\r') {&lt;br /&gt;
                // cr found, ignore&lt;br /&gt;
            } else if (ch == '\n') {&lt;br /&gt;
                // eol found&lt;br /&gt;
                tokenCharFound = true;&lt;br /&gt;
                previousWasDigit = false;&lt;br /&gt;
                token.append(ch);&lt;br /&gt;
                break;&lt;br /&gt;
            } else if (tokenCharFound) {&lt;br /&gt;
                break;&lt;br /&gt;
            }&lt;br /&gt;
        }&lt;br /&gt;
        &lt;br /&gt;
        if (tokenCharFound) {&lt;br /&gt;
            //          System.out.println(&amp;quot;Token read: &amp;quot; + token);&lt;br /&gt;
            return token.toString();&lt;br /&gt;
        } else {&lt;br /&gt;
            return null;&lt;br /&gt;
        }&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Compute the probability that &amp;quot;token&amp;quot; is SPAM.&lt;br /&gt;
     *&lt;br /&gt;
     * @param token&lt;br /&gt;
     * @return  The probability that the token occurs within spam.&lt;br /&gt;
     */&lt;br /&gt;
    private double computeProbability(String token) {&lt;br /&gt;
        double hamFactor  = 0;&lt;br /&gt;
        double spamFactor = 0;&lt;br /&gt;
        &lt;br /&gt;
        boolean foundInHam = false;&lt;br /&gt;
        boolean foundInSpam = false;&lt;br /&gt;
        &lt;br /&gt;
        double minThreshold = 0.01;&lt;br /&gt;
        double maxThreshold = 0.99;&lt;br /&gt;
        &lt;br /&gt;
        if (hamTokenCounts.containsKey(token)) {&lt;br /&gt;
            foundInHam = true;&lt;br /&gt;
        }&lt;br /&gt;
        &lt;br /&gt;
        if (spamTokenCounts.containsKey(token)) {&lt;br /&gt;
            foundInSpam = true;&lt;br /&gt;
        }&lt;br /&gt;
        &lt;br /&gt;
        if (foundInHam) {&lt;br /&gt;
            hamFactor = 2 *((Integer) hamTokenCounts.get(token)).doubleValue();&lt;br /&gt;
            if (!foundInSpam) {&lt;br /&gt;
                minThreshold = (hamFactor &amp;gt; 20) ? 0.0001 : 0.0002;&lt;br /&gt;
            }&lt;br /&gt;
        }&lt;br /&gt;
        &lt;br /&gt;
        if (foundInSpam) {&lt;br /&gt;
            spamFactor = ((Integer) spamTokenCounts.get(token)).doubleValue();&lt;br /&gt;
            if (!foundInHam) {&lt;br /&gt;
                maxThreshold = (spamFactor &amp;gt; 10) ? 0.9999 : 0.9998;&lt;br /&gt;
            }&lt;br /&gt;
        }&lt;br /&gt;
        &lt;br /&gt;
        if ((hamFactor + spamFactor) &amp;lt; 5) {&lt;br /&gt;
            //This token hasn't been seen enough.&lt;br /&gt;
            return 0.4;&lt;br /&gt;
        }&lt;br /&gt;
        &lt;br /&gt;
        double spamFreq = Math.min(1.0, spamFactor / spamMessageCount);&lt;br /&gt;
        double hamFreq = Math.min(1.0, hamFactor / hamMessageCount);&lt;br /&gt;
        &lt;br /&gt;
        return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Returns a SortedSet of TokenProbabilityStrength built from the&lt;br /&gt;
     * Corpus and the tokens passed in the &amp;quot;tokens&amp;quot; Set.&lt;br /&gt;
     * The ordering is from the highest strength to the lowest strength.&lt;br /&gt;
     *&lt;br /&gt;
     * @param tokens&lt;br /&gt;
     * @param workCorpus&lt;br /&gt;
     * @return  SortedSet of TokenProbabilityStrength objects.&lt;br /&gt;
     */&lt;br /&gt;
    private SortedSet getTokenProbabilityStrengths(Set tokens, Map workCorpus) {&lt;br /&gt;
        //Convert to a SortedSet of token probability strengths.&lt;br /&gt;
        SortedSet tokenProbabilityStrengths = new TreeSet();&lt;br /&gt;
        &lt;br /&gt;
        Iterator i = tokens.iterator();&lt;br /&gt;
        while (i.hasNext()) {&lt;br /&gt;
            TokenProbabilityStrength tps = new TokenProbabilityStrength();&lt;br /&gt;
            &lt;br /&gt;
            tps.token = (String) i.next();&lt;br /&gt;
            &lt;br /&gt;
            if (workCorpus.containsKey(tps.token)) {&lt;br /&gt;
                tps.strength = Math.abs(0.5 - ((Double) workCorpus.get(tps.token)).doubleValue());&lt;br /&gt;
            }&lt;br /&gt;
            else {&lt;br /&gt;
                //This token has never been seen before,&lt;br /&gt;
                //we'll give it initially the default probability.&lt;br /&gt;
                Double corpusProbability = new Double(DEFAULT_TOKEN_PROBABILITY);&lt;br /&gt;
                tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);&lt;br /&gt;
                boolean isTokenDegeneratedFound = false;&lt;br /&gt;
                &lt;br /&gt;
                Collection degeneratedTokens = buildDegenerated(tps.token);&lt;br /&gt;
                Iterator iDegenerated = degeneratedTokens.iterator();&lt;br /&gt;
                String tokenDegenerated = null;&lt;br /&gt;
                double strengthDegenerated;&lt;br /&gt;
                while (iDegenerated.hasNext()) {&lt;br /&gt;
                    tokenDegenerated = (String) iDegenerated.next();&lt;br /&gt;
                    if (workCorpus.containsKey(tokenDegenerated)) {&lt;br /&gt;
                        Double probabilityTemp = (Double) workCorpus.get(tokenDegenerated);&lt;br /&gt;
                        strengthDegenerated = Math.abs(0.5 - probabilityTemp.doubleValue());&lt;br /&gt;
                        if (strengthDegenerated &amp;gt; tps.strength) {&lt;br /&gt;
                            isTokenDegeneratedFound = true;&lt;br /&gt;
                            tps.strength = strengthDegenerated;&lt;br /&gt;
                            corpusProbability = probabilityTemp;&lt;br /&gt;
                        }&lt;br /&gt;
                    }&lt;br /&gt;
                }&lt;br /&gt;
                // to reduce memory usage, put in the corpus only if the probability is different from (stronger than) the default&lt;br /&gt;
                if (isTokenDegeneratedFound) {&lt;br /&gt;
                    synchronized(workCorpus) {&lt;br /&gt;
                        workCorpus.put(tps.token, corpusProbability);&lt;br /&gt;
                    }&lt;br /&gt;
                }&lt;br /&gt;
            }&lt;br /&gt;
            &lt;br /&gt;
            tokenProbabilityStrengths.add(tps);&lt;br /&gt;
        }&lt;br /&gt;
        &lt;br /&gt;
        return tokenProbabilityStrengths;&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    private Collection buildDegenerated(String fullToken) {&lt;br /&gt;
        ArrayList tokens = new ArrayList();&lt;br /&gt;
        String header;&lt;br /&gt;
        String token;&lt;br /&gt;
        &lt;br /&gt;
        // look for a header string termination&lt;br /&gt;
        int headerEnd = fullToken.indexOf(&amp;quot;:&amp;quot;);&lt;br /&gt;
        if (headerEnd &amp;gt;= 0) {&lt;br /&gt;
            header = fullToken.substring(0, headerEnd);&lt;br /&gt;
            token = fullToken.substring(headerEnd);&lt;br /&gt;
        } else {&lt;br /&gt;
            header = &amp;quot;&amp;quot;;&lt;br /&gt;
            token = fullToken;&lt;br /&gt;
        }&lt;br /&gt;
        &lt;br /&gt;
        int end = token.length();&lt;br /&gt;
        do {&lt;br /&gt;
            if (!token.substring(0, end).equals(token.substring(0, end).toLowerCase())) {&lt;br /&gt;
                tokens.add(header + token.substring(0, end).toLowerCase());&lt;br /&gt;
                if (header.length() &amp;gt; 0) {&lt;br /&gt;
                    tokens.add(token.substring(0, end).toLowerCase());&lt;br /&gt;
                }&lt;br /&gt;
            }&lt;br /&gt;
            if (end &amp;gt; 1 &amp;amp;&amp;amp; token.charAt(0) &amp;gt;= &amp;quot;A&amp;quot; &amp;amp;&amp;amp; token.charAt(0) &amp;lt;= &amp;quot;Z&amp;quot;) {&lt;br /&gt;
                tokens.add(header + token.charAt(0) + token.substring(1, end).toLowerCase());&lt;br /&gt;
                if (header.length() &amp;gt; 0) {&lt;br /&gt;
                    tokens.add(token.charAt(0) + token.substring(1, end).toLowerCase());&lt;br /&gt;
                }&lt;br /&gt;
            }&lt;br /&gt;
            &lt;br /&gt;
            if (token.charAt(end - 1) != &amp;quot;!&amp;quot;) {&lt;br /&gt;
                break;&lt;br /&gt;
            }&lt;br /&gt;
            &lt;br /&gt;
            end--;&lt;br /&gt;
            &lt;br /&gt;
            tokens.add(header + token.substring(0, end));&lt;br /&gt;
            if (header.length() &amp;gt; 0) {&lt;br /&gt;
                tokens.add(token.substring(0, end));&lt;br /&gt;
            }&lt;br /&gt;
        } while (end &amp;gt; 0);&lt;br /&gt;
        &lt;br /&gt;
        return tokens;&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Combines the probabilities of the most interesting tokens into a single&lt;br /&gt;
     * spamminess value.&lt;br /&gt;
     *&lt;br /&gt;
     * Tokens are read in the iteration order of the set. Up to MAX_INTERESTING_TOKENS&lt;br /&gt;
     * tokens are always read; after that, reading continues only while the strength&lt;br /&gt;
     * of the token read last is at least INTERESTINGNESS_THRESHOLD. A token with no&lt;br /&gt;
     * entry in the corpus contributes DEFAULT_TOKEN_PROBABILITY.&lt;br /&gt;
     *&lt;br /&gt;
     * @param tokenProbabilityStrengths the tokens with their strengths, read in set order&lt;br /&gt;
     * @param workCorpus map from token to its Double probability&lt;br /&gt;
     * @return  p / (p + np), p being the product of the probabilities and np the product of their complements&lt;br /&gt;
     */&lt;br /&gt;
    private double computeOverallProbability(SortedSet tokenProbabilityStrengths, Map workCorpus) {&lt;br /&gt;
        double spamProduct = 1.0;&lt;br /&gt;
        double hamProduct = 1.0;&lt;br /&gt;
        double lastStrength = 0.5;&lt;br /&gt;
        int remaining = MAX_INTERESTING_TOKENS;&lt;br /&gt;
        &lt;br /&gt;
        Iterator cursor = tokenProbabilityStrengths.iterator();&lt;br /&gt;
        while (cursor.hasNext()) {&lt;br /&gt;
            // the countdown is consumed on every pass; the threshold only matters once it has run out&lt;br /&gt;
            boolean withinQuota = remaining-- &amp;gt; 0;&lt;br /&gt;
            if (!(withinQuota || lastStrength &amp;gt;= INTERESTINGNESS_THRESHOLD)) {&lt;br /&gt;
                break;&lt;br /&gt;
            }&lt;br /&gt;
            &lt;br /&gt;
            TokenProbabilityStrength current = (TokenProbabilityStrength) cursor.next();&lt;br /&gt;
            lastStrength = current.strength;&lt;br /&gt;
            &lt;br /&gt;
            // use the stored value when the token is in the corpus, the default otherwise&lt;br /&gt;
            Double stored = (Double) workCorpus.get(current.token);&lt;br /&gt;
            double probability = (stored == null) ? DEFAULT_TOKEN_PROBABILITY : stored.doubleValue();&lt;br /&gt;
            &lt;br /&gt;
            spamProduct *= probability;&lt;br /&gt;
            hamProduct *= (1.0 - probability);&lt;br /&gt;
        }&lt;br /&gt;
        &lt;br /&gt;
        return spamProduct / (spamProduct + hamProduct);&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    /**&lt;br /&gt;
     * Tells whether a string is one character repeated.&lt;br /&gt;
     *&lt;br /&gt;
     * @param s the string to inspect&lt;br /&gt;
     * @return true when s has at least two characters and all of them are equal; false otherwise&lt;br /&gt;
     */&lt;br /&gt;
    private boolean allSameChar(String s) {&lt;br /&gt;
        int length = s.length();&lt;br /&gt;
        // strings shorter than two characters never qualify&lt;br /&gt;
        if (length &amp;lt; 2) {&lt;br /&gt;
            return false;&lt;br /&gt;
        }&lt;br /&gt;
        &lt;br /&gt;
        char first = s.charAt(0);&lt;br /&gt;
        int index = 1;&lt;br /&gt;
        // advance while the characters keep matching the first one&lt;br /&gt;
        while (index &amp;lt; length &amp;amp;&amp;amp; s.charAt(index) == first) {&lt;br /&gt;
            index++;&lt;br /&gt;
        }&lt;br /&gt;
        // reaching the end means no mismatch was found&lt;br /&gt;
        return index == length;&lt;br /&gt;
    }&lt;br /&gt;
    &lt;br /&gt;
    private boolean allDigits(String s) {&lt;br /&gt;
        for (int i = 0; i &amp;lt; s.length(); i++) {&lt;br /&gt;
            if (!Character.isDigit(s.charAt(i))) {&lt;br /&gt;
                return false;&lt;br /&gt;
            }&lt;br /&gt;
        }&lt;br /&gt;
        return true;&lt;br /&gt;
    }&lt;br /&gt;
}&lt;br /&gt;
   &lt;br /&gt;
    &lt;br /&gt;
    &lt;br /&gt;
  &amp;lt;/source&amp;gt;&lt;br /&gt;
    &lt;br /&gt;
   &lt;br /&gt;
  &amp;lt;!-- end source code --&amp;gt;&lt;/div&gt;</summary>
			</entry>

	</feed>