You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2005/07/02 21:32:07 UTC
svn commit: r208869 [1/12] - in /lucene/nutch/trunk: conf/ src/plugin/languageidentifier/ src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/ src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/

Author: ab
Date: Sat Jul  2 12:32:05 2005
New Revision: 208869

URL: http://svn.apache.org/viewcvs?rev=208869&view=rev
Log:
Improvements and fixes in NUTCH-60. Submitted by Jerome Charron.

Modified:
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/plugin/languageidentifier/build.xml
    lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml
    lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
    lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java
    lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/da.ngp
    lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/de.ngp
    lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/el.ngp
    lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/en.ngp
    lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/es.ngp
    lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/fi.ngp
    lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/fr.ngp
    lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/hu.ngp
    lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/it.ngp
    lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/nl.ngp
    lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/pl.ngp
    lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/pt.ngp
    lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/sv.ngp
    lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestNGramProfile.java

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=208869&r1=208868&r2=208869&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Sat Jul  2 12:32:05 2005
@@ -727,4 +727,38 @@
   </description>
 </property>
 
+<!-- language-identifier plugin properties -->
+
+<property>
+  <name>lang.ngram.min.length</name>
+  <value>1</value>
+  <description> The minimum size of ngrams to use to identify
+  language (must be between 1 and lang.ngram.max.length).
+  The larger the range between lang.ngram.min.length and
+  lang.ngram.max.length, the better the identification, but
+  the slower it is.
+  </description>
+</property>
+
+<property>
+  <name>lang.ngram.max.length</name>
+  <value>4</value>
+  <description> The maximum size of ngrams to use to identify
+  language (must be between lang.ngram.min.length and 4).
+  The larger the range between lang.ngram.min.length and
+  lang.ngram.max.length, the better the identification, but
+  the slower it is.
+  </description>
+</property>
+
+<property>
+  <name>lang.analyze.max.length</name>
+  <value>2048</value>
+  <description> The maximum bytes of data to use to identify
+  the language (0 means full content analysis).
+  The larger this value, the better the analysis, but the
+  slower it is.
+  </description>
+</property>
+
 </nutch-conf>

Modified: lucene/nutch/trunk/src/plugin/languageidentifier/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/build.xml?rev=208869&r1=208868&r2=208869&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/build.xml Sat Jul  2 12:32:05 2005
@@ -9,6 +9,10 @@
     <copy todir="${build.classes}">
       <fileset dir="${src.dir}" includes="**/*.ngp, **/*.properties"/>
     </copy>
+    <echo>Copying test files</echo>
+    <copy todir="${build.test}">
+      <fileset dir="${src.test}" includes="**/*.test, **/*.txt"/>
+    </copy>
   </target>
 	
 </project>

Modified: lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml?rev=208869&r1=208868&r2=208869&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml Sat Jul  2 12:32:05 2005
@@ -34,7 +34,7 @@
               name="Nutch language identifier filter"
               point="org.apache.nutch.indexer.IndexingFilter">
       <implementation id="LanguageIdentifier"
-                      class="org.apache.nutch.analysis.lang.LanguageIdentifier"/>
+                      class="org.apache.nutch.analysis.lang.LanguageIndexingFilter"/>
    </extension>
 
 

Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=208869&r1=208868&r2=208869&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Sat Jul  2 12:32:05 2005
@@ -15,83 +15,153 @@
  */
 package org.apache.nutch.analysis.lang;
 
-import java.io.BufferedReader;
+// JDK imports
 import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
 import java.io.InputStream;
+import java.io.IOException;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
 import java.io.InputStreamReader;
-import java.util.Iterator;
+import java.util.List;
 import java.util.Vector;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.ArrayList;
+import java.util.Properties;
+import java.util.Enumeration;
 import java.util.logging.Logger;
 
-import org.apache.nutch.fetcher.FetcherOutput;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
+// Nutch imports
+import org.apache.nutch.analysis.lang.NGramProfile.NGramEntry;
 import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.ParserNotFound;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolException;
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.ProtocolNotFound;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.util.NutchConf;
 import org.apache.nutch.util.LogFormatter;
 
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import java.util.Properties;
-import java.util.Enumeration;
 
 /**
  * 
  * @author Sami Siren
- *  
+ * @author Jerome Charron
  */
-public class LanguageIdentifier implements IndexingFilter {
-  public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.analysis.lang.LanguageIdentifier");
+public class LanguageIdentifier {
+  
+ 
+  private final static int DEFAULT_ANALYSIS_LENGTH = 0;    // 0 means full content
+  
+  private final static float SCORE_THRESOLD = 0.00F;
+
+  public final static Logger LOG = LogFormatter.getLogger(LanguageIdentifier.class.getName());
+
+  
+  private ArrayList languages = new ArrayList();
+
+  private ArrayList supportedLanguages = new ArrayList();
+
+  /** Minimum size of NGrams */
+  private int minLength = NGramProfile.DEFAULT_MIN_NGRAM_LENGTH;
+  
+  /** Maximum size of NGrams */
+  private int maxLength = NGramProfile.DEFAULT_MAX_NGRAM_LENGTH;
+  
+  /** The maximum amount of data to analyze */
+  private int analyzeLength = DEFAULT_ANALYSIS_LENGTH;
+  
+  /** A global index of ngrams of all supported languages */
+  private HashMap ngramsIdx = new HashMap();
 
-  private Vector languages = new Vector();
+  /** The NGramProfile used for identification */
+  private NGramProfile suspect = null;
 
-  private Vector supportedLanguages = new Vector();
+  /** My singleton instance */
+  private static LanguageIdentifier identifier = null;
 
-  private static LanguageIdentifier identifier = new LanguageIdentifier(true);
 
-  private static float SCORE_THRESOLD = 0.00F;
-
-  //public constructor needed for extension mechanism
-  public LanguageIdentifier() {}
+  /**
+   * Constructs a new Language Identifier.
+   */
+  private LanguageIdentifier() {
 
-  private LanguageIdentifier(boolean fake) {
+    // Gets ngram sizes to take into account from the Nutch Config
+    minLength = NutchConf.get().getInt("lang.ngram.min.length",
+                                       NGramProfile.DEFAULT_MIN_NGRAM_LENGTH);
+    maxLength = NutchConf.get().getInt("lang.ngram.max.length",
+                                       NGramProfile.DEFAULT_MAX_NGRAM_LENGTH);
+    // Ensure the min and max values are in an acceptable range
+    // (ie min >= ABSOLUTE_MIN_NGRAM_LENGTH and max <= ABSOLUTE_MAX_NGRAM_LENGTH)
+    maxLength = Math.min(maxLength, NGramProfile.ABSOLUTE_MAX_NGRAM_LENGTH);
+    maxLength = Math.max(maxLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
+    minLength = Math.max(minLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
+    minLength = Math.min(minLength, maxLength);
+
+    // Gets the value of the maximum size of data to analyze
+    analyzeLength = NutchConf.get().getInt("lang.analyze.max.length",
+                                           DEFAULT_ANALYSIS_LENGTH);
+    
     Properties p = new Properties();
     try {
       p.load(this.getClass().getResourceAsStream("langmappings.properties"));
 
       Enumeration alllanguages = p.keys();
+      
+      LOG.info(new StringBuffer()
+                .append("Language identifier configuration [")
+                .append(minLength).append("-").append(maxLength)
+                .append("/").append(analyzeLength).append("]").toString());
 
       StringBuffer list = new StringBuffer("Language identifier plugin supports:");
+      HashMap tmpIdx = new HashMap();
       while (alllanguages.hasMoreElements()) {
         String lang = (String) (alllanguages.nextElement());
 
         InputStream is = this.getClass().getClassLoader().getResourceAsStream(
-                "org/apache/nutch/analysis/lang/" + lang + "." + NGramProfile.NGRAM_FILE_EXTENSION);
+                "org/apache/nutch/analysis/lang/" + lang + "." + NGramProfile.FILE_EXTENSION);
 
         if (is != null) {
-          NGramProfile profile = new NGramProfile(lang);
+          NGramProfile profile = new NGramProfile(lang, minLength, maxLength);
           try {
             profile.load(is);
             languages.add(profile);
             supportedLanguages.add(lang);
-            list.append(" " + lang);
+            List ngrams = profile.getSorted();
+            for (int i=0; i<ngrams.size(); i++) {
+                NGramEntry entry = (NGramEntry) ngrams.get(i);
+                List registered = (List) tmpIdx.get(entry);
+                if (registered == null) {
+                    registered = new ArrayList();
+                    tmpIdx.put(entry, registered);
+                }
+                registered.add(entry);
+                entry.setProfile(profile);
+            }
+            list.append(" " + lang + "(" + ngrams.size() + ")");
             is.close();
           } catch (IOException e1) {
             LOG.severe(e1.toString());
           }
         }
       }
+      // transform all ngrams lists to arrays for performances
+      Iterator keys = tmpIdx.keySet().iterator();
+      while (keys.hasNext()) {
+        NGramEntry entry = (NGramEntry) keys.next();
+        List l = (List) tmpIdx.get(entry);
+        if (l != null) {
+          NGramEntry[] array = (NGramEntry[]) l.toArray(new NGramEntry[l.size()]);
+          ngramsIdx.put(entry.getSeq(), array);
+        }
+      }
       LOG.info(list.toString());
+      // Create the suspect profile
+      suspect = new NGramProfile("suspect", minLength, maxLength);
     } catch (Exception e) {
       LOG.severe(e.toString());
     }
@@ -101,6 +171,13 @@
    * return handle to singleton instance
    */
   public static LanguageIdentifier getInstance() {
+    if (identifier == null) {
+        synchronized(LanguageIdentifier.class) {
+            if (identifier == null) {
+                identifier = new LanguageIdentifier();
+            }
+        }
+    }
     return identifier;
   }
 
@@ -157,15 +234,24 @@
       if (args[i].equals("-identifyfileset")) {
         command = IDFILESET;
         for (i++; i < args.length; i++) {
-          fileset.add(args[i]);
-          System.out.println(args[i]);
+          File[] files = null;
+          File f = new File(args[i]);
+          if (f.isDirectory()) {
+              files = f.listFiles();
+          } else {
+              files = new File[] { f };
+          }
+          for (int j=0; j<files.length; j++) {
+            fileset.add(files[j].getAbsolutePath());
+          }
         }
       }
 
     }
 
     String lang = null;
-    LanguageIdentifier idfr = LanguageIdentifier.getInstance();
+    //LanguageIdentifier idfr = LanguageIdentifier.getInstance();
+    LanguageIdentifier idfr = new LanguageIdentifier();
     File f;
     FileInputStream fis;
     try {
@@ -205,9 +291,12 @@
           break;
 
         case IDFILESET:
+          /* used for benchs
+          for (int j=128; j<=524288; j*=2) {
+            long start = System.currentTimeMillis();
+            idfr.analyzeLength = j; */
           System.out.println("FILESET");
           Iterator i = fileset.iterator();
-
           while (i.hasNext()) {
             try {
               filename = (String) i.next();
@@ -218,12 +307,13 @@
             } catch (Exception e) {
               System.out.println(e);
             }
-
             System.out.println(filename + " was identified as " + lang);
           }
+          /* used for benchs
+            System.out.println(j + "/" + (System.currentTimeMillis()-start));
+          } */
           System.exit(0);
           break;
-
       }
     } catch (Exception e) {
       System.out.println(e);
@@ -261,46 +351,57 @@
   /**
    * Identify language based on submitted content
    * 
-   * @param text text of doc
+   * @param text to analyze
    * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if
    *         unknown
    */
   public String identify(String text) {
-
     return identify(new StringBuffer(text));
   }
 
-  public String identify(StringBuffer text) {
+  /**
+   * Identify language based on submitted content
+   * 
+   * @param content to analyze
+   * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if
+   *         unknown
+   */
+  public String identify(StringBuffer content) {
 
-    NGramProfile p = new NGramProfile("suspect");
-    p.analyze(text);
+    StringBuffer text = content;
+    if ((analyzeLength > 0) && (content.length() > analyzeLength)) {
+        text = new StringBuffer().append(content);
+        text.setLength(analyzeLength);
+    }
 
-    float topscore = Float.MAX_VALUE;
+    suspect.analyze(text);
+    Iterator iter = suspect.getSorted().iterator();
+    float topscore = Float.MIN_VALUE;
     String lang = "";
-
-    Iterator i = languages.iterator();
-    while (i.hasNext()) {
-
-      NGramProfile profile = (NGramProfile) i.next();
-      float score = profile.getSimilarity(p);
-
-      //LOG.fine(profile.getName() + ":" + score);
-
-      if (score < topscore) {
-        topscore = score;
-        lang = profile.getName();
-      }
+    HashMap scores = new HashMap();
+    NGramEntry searched = null;
+    
+    while (iter.hasNext()) {
+        searched = (NGramEntry) iter.next();
+        NGramEntry[] ngrams = (NGramEntry[]) ngramsIdx.get(searched.getSeq());
+        if (ngrams != null) {
+            for (int j=0; j<ngrams.length; j++) {
+                NGramProfile profile = ngrams[j].getProfile();
+                Float pScore = (Float) scores.get(profile);
+                if (pScore == null) {
+                    pScore = new Float(0);
+                }
+                float plScore = pScore.floatValue();
+                plScore += ngrams[j].getFrequency() + searched.getFrequency();
+                scores.put(profile, new Float(plScore));
+                if (plScore > topscore) {
+                    topscore = plScore;
+                    lang = profile.getName();
+                }
+            }
+        }
     }
-
-    p.ngrams.clear();
-    p = null;
-
-    LOG.finest("TOPSCORE: " + lang + " with " + topscore);
-
-    if (topscore > SCORE_THRESOLD)
-      return lang;
-
-    else return null;
+    return lang;
   }
 
   /**
@@ -313,42 +414,17 @@
   public String identify(InputStream is) throws IOException {
 
     StringBuffer text = new StringBuffer();
-    byte buffer[] = new byte[2000];
+    byte[] buffer = new byte[2048];
     int len = 0;
 
-    while ((len = is.read(buffer)) != -1) {
+    while (((len = is.read(buffer)) != -1) &&
+           ((analyzeLength == 0) || (text.length() < analyzeLength))) {
+      if (analyzeLength != 0) {
+          len = Math.min(len, analyzeLength - text.length());
+      }
       text.append(new String(buffer, 0, len));
     }
-
-    return identify(text.toString());
-  }
-
-  public Document filter(Document doc, Parse parse, FetcherOutput fo) throws IndexingException {
-
-    //check if X-meta-lang found, possibly put there by HTMLLanguageParser
-    String lang = parse.getData().get(HTMLLanguageParser.META_LANG_NAME);
-
-    //check if HTTP-header tels us the language
-    if (lang == null) lang = parse.getData().get("Content-Language");
-
-    if (lang == null) {
-      StringBuffer text = new StringBuffer();
-      /*
-       * String[] anchors = fo.getAnchors(); for (int i = 0; i < anchors.length;
-       * i++) { text+=anchors[i] + " "; }
-       */
-      text.append(parse.getData().getTitle()).append(" ");
-      text.append(parse.getText());
-      lang = LanguageIdentifier.getInstance().identify(text);
-    }
-
-    if (lang == null) {
-      lang = "unknown";
-    }
-
-    doc.add(Field.Keyword("lang", lang));
-
-    return doc;
+    return identify(text);
   }
 
 }

Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java?rev=208869&r1=208868&r2=208869&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java Sat Jul  2 12:32:05 2005
@@ -13,29 +13,34 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.analysis.lang;
 
-import java.io.BufferedInputStream;
-import java.io.BufferedReader;
+// JDK imports
 import java.io.File;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.BufferedReader;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
 import java.io.InputStreamReader;
-import java.io.OutputStream;
+import java.io.BufferedInputStream;
 import java.util.Date;
-import java.util.Collections;
-import java.util.Hashtable;
+import java.util.List;
 import java.util.Iterator;
-import java.util.Vector;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
 import java.util.logging.Logger;
 
+// Nutch imports
 import org.apache.nutch.util.LogFormatter;
 
+// Lucene imports
 import org.apache.lucene.analysis.Token;
 
+
 /**
  * This class runs a ngram analysis over submitted text, results might be used
  * for automatic language identifiaction.
@@ -45,257 +50,235 @@
  * Methods are provided to build new NGramProfiles profiles.
  * 
  * @author Sami Siren
+ * @author Jerome Charron - http://frutch.free.fr/
  */
 public class NGramProfile {
 
   public static final Logger LOG = LogFormatter
       .getLogger("org.apache.nutch.analysis.lang.NGramProfile");
 
-  private String name;
+  /** The minimum length allowed for a ngram. */
+  final static int ABSOLUTE_MIN_NGRAM_LENGTH = 1;
 
-  private Vector sorted = null;
+  /** The maximum length allowed for a ngram. */
+  final static int ABSOLUTE_MAX_NGRAM_LENGTH = 4;
+    
+  /** The default min length of ngram */
+  final static int DEFAULT_MIN_NGRAM_LENGTH = 3;
 
-  private StringBuffer tokensb = new StringBuffer();
+  /** The default max length of ngram */
+  final static int DEFAULT_MAX_NGRAM_LENGTH = 3;
 
-  private int min_ngram_length = DEFAULT_MIN_NGRAM_LENGTH;
+  /** The ngram profile file extension */
+  static final String FILE_EXTENSION = "ngp";
 
-  private int max_ngram_length = DEFAULT_MAX_NGRAM_LENGTH;
+  /** The profile max size (number of ngrams of the same size) */
+  static final int MAX_SIZE = 1000;
 
-  private int ngramcount = 0;
+  /** separator char */
+  static final char SEPARATOR = '_';
+  /** The String form of the separator char */  
+  private final static String SEP_CHARSEQ = new String(new char[] { SEPARATOR });
 
-  static final String NGRAM_FILE_EXTENSION = "ngp";
+  
+  /** The profile's name */
+  private String name = null;
 
-  static final int NGRAM_LENGTH = 1000;
+  /** The NGrams of this profile sorted on the number of occurences */
+  private List sorted = null;
 
-  //separator char
-  static final char SEPARATOR = '_';
+  /** The min length of ngram */
+  private int minLength = DEFAULT_MIN_NGRAM_LENGTH;
 
-  //default min length of ngram
-  static final int DEFAULT_MIN_NGRAM_LENGTH = 1;
+  /** The max length of ngram */
+  private int maxLength = DEFAULT_MAX_NGRAM_LENGTH;
 
-  //default max length of ngram
-  static final int DEFAULT_MAX_NGRAM_LENGTH = 4;
+  /** The total number of ngrams occurences */
+  private int[] ngramcounts = null;
 
-  //table to store ngrams
-  Hashtable ngrams = null;
+  /** An index of the ngrams of the profile */
+  private Map ngrams = null;
 
+  /** A StringBuffer used during analysis */
+  private QuickStringBuffer word = new QuickStringBuffer();
+  
+    
   /**
-   * private class used to store NGramEntry
+   * Construct a new ngram profile
+   * 
+   * @param name is the name of the profile
+   * @param minlen is the min length of ngram sequences
+   * @param maxlen is the max length of ngram sequences
    */
-  class NGramEntry implements Comparable {
-    private CharSequence seq;
-
-    private int count;
-
-    private float normalized_count;
-
-    public NGramEntry(CharSequence seq) {
-      this.seq = seq;
-    }
-
-    /**
-     * @param ngramsequence
-     * @param ngramcount
-     */
-    public NGramEntry(String ngramsequence, int ngramcount) {
-      seq = new StringBuffer(ngramsequence).subSequence(0, ngramsequence
-          .length());
-      this.count = ngramcount;
-    }
-
-    public int getCount() {
-      return count;
-    }
-
-    public CharSequence getSeq() {
-      return seq;
-    }
-
-    public int compareTo(Object o) {
-      if (((NGramEntry) o).count - count != 0)
-        return ((NGramEntry) o).count - count;
-      else
-        return (seq.toString().compareTo(((NGramEntry) o).seq.toString()));
-    }
-
-    public void inc() {
-      count++;
-    }
+  public NGramProfile(String name, int minlen, int maxlen) {
+    // TODO: Compute the initial capacity using minlen and maxlen.
+    this.ngrams = new HashMap(4000);
+    this.minLength = minlen;
+    this.maxLength = maxlen;
+    this.name = name;
   }
 
   /**
-   * Construct a new ngram profile
+   * @return Returns the name.
+   */
+  public String getName() {
+    return name;
+  }
+  
+  /**
+   * Add ngrams from a token to this profile
    * 
-   * @param name
-   *          Name of profile
+   * @param t is the Token to be added
    */
-  public NGramProfile(String name) {
-    this(name, DEFAULT_MIN_NGRAM_LENGTH, DEFAULT_MAX_NGRAM_LENGTH);
+  public void add(Token t) {
+    add(new StringBuffer().append(SEPARATOR)
+                          .append(t.termText())
+                          .append(SEPARATOR));
   }
 
   /**
-   * Construct a new ngram profile
+   * Add ngrams from a single word to this profile
    * 
-   * @param name
-   *          Name of profile
-   * @param minlen
-   *          min length of ngram sequences
-   * @param maxlen
-   *          max length of ngram sequences
+   * @param word is the word to add
    */
-  public NGramProfile(String name, int minlen, int maxlen) {
-    ngrams = new Hashtable();
-    this.max_ngram_length = maxlen;
-    this.min_ngram_length = minlen;
-    this.name = name;
+  public void add(StringBuffer word) {
+    for (int i=minLength; (i <= maxLength) && (i < word.length()); i++) {
+      add(word, i);
+    }
   }
 
   /**
-   * Add ngrams from a token to this profile
-   * 
-   * @param t
-   *          Token to be added
+   * Add the last NGrams from the specified word.
    */
-  public void addFromToken(Token t) {
-    tokensb.setLength(0);
-    tokensb.append(SEPARATOR).append(t.termText()).append(SEPARATOR);
-    addNGrams(tokensb);
+  private void add(QuickStringBuffer word) {
+    int wlen = word.length();
+    if (wlen >= minLength) {
+        int max = Math.min(maxLength, wlen);
+        for (int i=minLength; i<=max; i++) {
+            add(word.subSequence(wlen-i, wlen));
+        }
+    }
+  }
+  
+  /**
+   * Add a single ngram to this profile, incrementing its count
+   * (a sequence equal to the separator string is skipped).
+   *
+   * @param cs is the ngram sequence to add
+   */
+  private void add(CharSequence cs) {
+
+    if (cs.equals(SEP_CHARSEQ)) { return; }
+    NGramEntry nge = (NGramEntry) ngrams.get(cs);
+    if (nge == null) {
+      nge = new NGramEntry(cs);
+      ngrams.put(cs, nge);
+    }
+    nge.inc();
   }
 
   /**
    * Analyze a piece of text
    * 
-   * @param text
-   *          the text to be analyzed
+   * @param text the text to be analyzed
    */
   public void analyze(StringBuffer text) {
-    StringBuffer word;
-    int i;
 
     if (ngrams != null) {
       ngrams.clear();
+      sorted = null;
     }
 
-    word = new StringBuffer().append(SEPARATOR);
-    for (i = 0; i < text.length(); i++) {
+    word.clear().append(SEPARATOR);
+    for (int i = 0; i < text.length(); i++) {
       char c = Character.toLowerCase(text.charAt(i));
 
       if (Character.isLetter(c)) {
-        word.append(c);
+        add(word.append(c));
       } else {
         //found word boundary
         if (word.length() > 1) {
           //we have a word!
-          word.append(SEPARATOR);
-          addNGrams(word);
-          word.delete(0, word.length());
+          add(word.append(SEPARATOR));
+          word.clear().append(SEPARATOR);
         }
       }
     }
 
     if (word.length() > 1) {
-      //we have a last word
-      word.append(SEPARATOR);
-      addNGrams(word);
+      //we have a word!
+      add(word.append(SEPARATOR));
     }
     normalize();
   }
 
   /**
-   * Normalize profile
-   */
-  protected void normalize() {
-    Vector sorted = getSorted();
-    int sum = 0;
-
-    //only calculate ngramcount if it was not available in profile
-    if (ngramcount == 0) {
-      for (int i = 0; i < sorted.size(); i++) {
-        ngramcount += ((NGramEntry) sorted.get(i)).count;
-      }
-    }
-
-    if (sorted.size() > 0) {
-      Iterator i = sorted.iterator();
-
-      while (i.hasNext()) {
-        NGramEntry e = (NGramEntry) i.next();
-        e.normalized_count = e.count / (float)ngramcount;
-      }
-    }
-  }
-
-  /**
-   * Add ngrams from a single word to this profile
-   * 
    * @param word
+   * @param n sequence length
    */
-  public void addNGrams(StringBuffer word) {
-    int i;
-
-    for (i = min_ngram_length; i <= max_ngram_length && i < word.length(); i++) {
-      addNGrams(word, i);
+  private void add(StringBuffer word, int n) {
+    for (int i=0; i <= word.length()-n; i++) {
+      add(word.subSequence(i, i + n));
     }
   }
-
+    
   /**
-   * @param word
-   * @param n
-   *          sequence length
+   * Normalize the profile (calculates the ngrams frequencies)
    */
-  private void addNGrams(StringBuffer word, int n) {
-    NGramEntry nge;
-    StringBuffer sb;
-    int i;
-
-    for (i = 0; i <= word.length() - n; i++) {
-
-      CharSequence cs = word.subSequence(i, i + n);
+  protected void normalize() {
 
-      if (ngrams.containsKey(cs)) {
-        nge = (NGramEntry) ngrams.get(cs);
-      } else {
-        nge = new NGramEntry(cs);
+    NGramEntry e = null;
+    //List sorted = getSorted();
+    Iterator i = ngrams.values().iterator();
+
+    // Calculate ngramcount if not already done
+    if (ngramcounts == null) {
+      ngramcounts = new int[maxLength+1];
+      while (i.hasNext()) {
+        e = (NGramEntry) i.next();
+        ngramcounts[e.size()] += e.count;
       }
-      nge.inc();
-      ngrams.put(cs, nge);
+    }
+    
+    i = ngrams.values().iterator();
+    while (i.hasNext()) {
+      e = (NGramEntry) i.next();
+      e.frequency = (float) e.count / (float) ngramcounts[e.size()];
     }
   }
 
   /**
-   * Return sorted vector of ngrams (sort done by 1. count 2. sequence)
+   * Return a sorted list of ngrams (sort done by 1. frequency 2. sequence)
    * 
    * @return sorted vector of ngrams
    */
-  public Vector getSorted() {
-    //make sure srting is done only once
+  public List getSorted() {
+    // make sure sorting is done only once
     if (sorted == null) {
-      sorted = new Vector(ngrams.values());
+      sorted = new ArrayList(ngrams.values());
       Collections.sort(sorted);
 
-      //trim at NGRAM_LENGTH entries
-      if (sorted.size() > NGRAM_LENGTH)
-        sorted.setSize(NGRAM_LENGTH);
+      // trim at MAX_SIZE entries
+      if (sorted.size() > MAX_SIZE) {
+        sorted = sorted.subList(0, MAX_SIZE);
+      }
     }
-
     return sorted;
   }
-
-  /**
-   * Return ngramprofile as text
-   * 
-   * @return ngramprofile as text
-   */
+  
+  // Inherited JavaDoc
   public String toString() {
-    StringBuffer s = new StringBuffer();
+
+    StringBuffer s = new StringBuffer().append("NGramProfile: ")
+                                       .append(name).append("\n");
 
     Iterator i = getSorted().iterator();
 
-    s.append("NGramProfile: ").append(name).append("\n");
     while (i.hasNext()) {
       NGramEntry entry = (NGramEntry) i.next();
-      s.append(entry.count).append(':').append(entry.seq).append(" ").append(
-          entry.normalized_count).append("\n");
+      s.append("[").append(entry.seq)
+       .append("/").append(entry.count)
+       .append("/").append(entry.frequency).append("]\n");
     }
     return s.toString();
   }
@@ -308,6 +291,7 @@
    * @return similarity 0=exact match
    */
   public float getSimilarity(NGramProfile another) {
+      
     float sum = 0;
 
     try {
@@ -315,21 +299,20 @@
       while (i.hasNext()) {
         NGramEntry other = (NGramEntry) i.next();
         if (ngrams.containsKey(other.seq)) {
-          sum += Math.abs((other.normalized_count - ((NGramEntry) ngrams
-              .get(other.seq)).normalized_count)) / 2;
+          sum += Math.abs((other.frequency -
+                          ((NGramEntry) ngrams.get(other.seq)).frequency)) / 2;
         } else {
-          sum += other.normalized_count;
+          sum += other.frequency;
         }
       }
       i = getSorted().iterator();
       while (i.hasNext()) {
         NGramEntry other = (NGramEntry) i.next();
         if (another.ngrams.containsKey(other.seq)) {
-          sum += Math
-              .abs((other.normalized_count - ((NGramEntry) another.ngrams
-                  .get(other.seq)).normalized_count)) / 2;
+          sum += Math.abs((other.frequency -
+                          ((NGramEntry) another.ngrams.get(other.seq)).frequency)) / 2;
         } else {
-          sum += other.normalized_count;
+          sum += other.frequency;
         }
       }
     } catch (Exception e) {
@@ -339,27 +322,29 @@
   }
 
   /**
-   * Loads a ngram profile from InputStream (assumes UTF-8 encoded content)
+   * Loads a ngram profile from an InputStream
+   * (assumes UTF-8 encoded content)
+   * @param is the InputStream to read
    */
   public void load(InputStream is) throws IOException {
-    BufferedReader bis = new BufferedReader(new InputStreamReader(is, "UTF-8"));
-    String line;
 
     ngrams.clear();
+    ngramcounts = new int[maxLength+1];
+    BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+    String line = null;
 
-    while ((line = bis.readLine()) != null) {
+    while ((line = reader.readLine()) != null) {
 
       // # starts a comment line
       if (line.charAt(0) != '#') {
         int spacepos = line.indexOf(' ');
         String ngramsequence = line.substring(0, spacepos).trim();
-        int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
-
-        if (!line.startsWith("ngram_count")) {
-          NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
-          ngrams.put(en.getSeq(), en);
-        } else {
-          this.ngramcount = ngramcount;
+        int len = ngramsequence.length();
+        if ((len >= minLength) && (len <= maxLength)) {
+            int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
+            NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
+            ngrams.put(en.getSeq(), en);
+            ngramcounts[len] += ngramcount;
         }
       }
     }
@@ -369,16 +354,14 @@
   /**
    * Create a new Language profile from (preferably quite large) text file
    * 
-   * @param name
-   *          name of profile
-   * @param is
-   * @param encoding
-   *          encoding of stream
+   * @param name is the name of the profile
+   * @param is is the stream to read
+   * @param encoding is the encoding of stream
    */
-  public static NGramProfile createNgramProfile(String name, InputStream is,
-      String encoding) {
+  public static NGramProfile create(String name, InputStream is, String encoding) {
 
-    NGramProfile newProfile = new NGramProfile(name);
+    NGramProfile newProfile = new NGramProfile(name, ABSOLUTE_MIN_NGRAM_LENGTH,
+                                                     ABSOLUTE_MAX_NGRAM_LENGTH);
     BufferedInputStream bis = new BufferedInputStream(is);
 
     byte buffer[] = new byte[4096];
@@ -394,7 +377,6 @@
     }
 
     newProfile.analyze(text);
-
     return newProfile;
   }
 
@@ -402,25 +384,42 @@
    * Writes NGramProfile content into OutputStream, content is outputted with
    * UTF-8 encoding
    * 
-   * @param os
-   *          Stream to output to
+   * @param os the Stream to output to
    * @throws IOException
    */
-
   public void save(OutputStream os) throws IOException {
-    Vector v = getSorted();
-    Iterator i = v.iterator();
-    os
-        .write(("# NgramProfile generated at " + new Date() + " for Nutch Language Identification\n")
-            .getBytes());
-    os.write(("ngram_count " + ngramcount + "\n").getBytes());
 
-    while (i.hasNext()) {
-      NGramEntry e = (NGramEntry) i.next();
-      String line = e.getSeq().toString() + " " + e.getCount() + "\n";
+    // Write header
+    os.write(("# NgramProfile generated at " + new Date() +
+              " for Nutch Language Identification\n").getBytes());
+
+    // And then each ngram
+    
+    // First dispatch ngrams in many lists depending on their size
+    // (one list for each size, in order to store MAX_SIZE ngrams for each
+    // size of ngram)
+    int count = 0;
+    List list = new ArrayList();
+    List sublist = new ArrayList();
+    NGramEntry[] entries = (NGramEntry[]) ngrams.values().toArray(new NGramEntry[ngrams.size()]);
+    for (int i=minLength; i<=maxLength; i++) {
+      for (int j=0; j<entries.length; j++) {
+        if (entries[j].getSeq().length() == i) {
+          sublist.add(entries[j]);
+        }
+      }
+      Collections.sort(sublist);
+      if (sublist.size() > MAX_SIZE) {
+        sublist = sublist.subList(0, MAX_SIZE);
+      }
+      list.addAll(sublist);
+      sublist.clear();
+    }
+    for (int i=0; i<list.size(); i++) {
+      NGramEntry e = (NGramEntry) list.get(i);
+      String line = e.toString() + " " + e.getCount() + "\n";
       os.write(line.getBytes("UTF-8"));
     }
-
     os.flush();
   }
 
@@ -431,7 +430,10 @@
    */
   public static void main(String args[]) {
 
-    String usage = "Usage: NGramProfile [-create profilename filename encoding] [-similarity file1 file2] [-score profile-name filename encoding]";
+    String usage = "Usage: NGramProfile " +
+                   "[-create profilename filename encoding] " +
+                   "[-similarity file1 file2] "+
+                   "[-score profile-name filename encoding]";
     int command = 0;
 
     final int CREATE = 1;
@@ -442,7 +444,7 @@
     String filename = "";
     String filename2 = "";
     String encoding = "";
-
+    
     if (args.length == 0) {
       System.err.println(usage);
       System.exit(-1);
@@ -479,43 +481,40 @@
 
         File f = new File(filename);
         FileInputStream fis = new FileInputStream(f);
-        NGramProfile newProfile = NGramProfile.createNgramProfile(profilename,
-            fis, encoding);
+        NGramProfile newProfile = NGramProfile.create(profilename, fis, encoding);
         fis.close();
-        f = new File(profilename + "." + NGRAM_FILE_EXTENSION);
+        f = new File(profilename + "." + FILE_EXTENSION);
         FileOutputStream fos = new FileOutputStream(f);
         newProfile.save(fos);
-        System.out.println("new profile " + profilename + "."
-            + NGRAM_FILE_EXTENSION + " was created.");
+        System.out.println("new profile " + profilename + "." + FILE_EXTENSION + " was created.");
         break;
 
       case SIMILARITY:
 
         f = new File(filename);
         fis = new FileInputStream(f);
-        newProfile = NGramProfile.createNgramProfile(filename, fis, encoding);
+        newProfile = NGramProfile.create(filename, fis, encoding);
         newProfile.normalize();
 
         f = new File(filename2);
         fis = new FileInputStream(f);
-        NGramProfile newProfile2 = NGramProfile.createNgramProfile(filename2,
-            fis, encoding);
+        NGramProfile newProfile2 = NGramProfile.create(filename2, fis, encoding);
         newProfile2.normalize();
-        System.out.println("Similarity is "
-            + newProfile.getSimilarity(newProfile2));
+        System.out.println("Similarity is " + newProfile.getSimilarity(newProfile2));
         break;
 
       case SCORE:
         f = new File(filename);
         fis = new FileInputStream(f);
-        newProfile = NGramProfile.createNgramProfile(filename, fis, encoding);
+        newProfile = NGramProfile.create(filename, fis, encoding);
 
-        f = new File(profilename + "." + NGRAM_FILE_EXTENSION);
+        f = new File(profilename + "." + FILE_EXTENSION);
         fis = new FileInputStream(f);
-        NGramProfile compare = new NGramProfile(profilename);
+        NGramProfile compare = new NGramProfile(profilename,
+                                                DEFAULT_MIN_NGRAM_LENGTH,
+                                                DEFAULT_MAX_NGRAM_LENGTH);
         compare.load(fis);
         System.out.println("Score is " + compare.getSimilarity(newProfile));
-
         break;
 
       }
@@ -525,18 +524,217 @@
     }
   }
 
+  
   /**
-   * @return Returns the name.
+   * Inner class that describes a NGram
    */
-  public String getName() {
-    return name;
+  class NGramEntry implements Comparable {
+
+    /** The NGramProfile this NGram is related to */
+    private NGramProfile profile = null;
+
+    /** The sequence of characters of the ngram */
+    CharSequence seq = null;
+
+    /** The number of occurrences of this ngram in its profile */
+    private int count = 0;
+
+    /** The frequency of this ngram in its profile */
+    private float frequency = 0.0F;
+
+    
+    /** 
+     * Constructs a new NGramEntry
+     * @param seq is the sequence of characters of the ngram
+     */
+    public NGramEntry(CharSequence seq) {
+      this.seq = seq;
+    }
+
+    /** 
+     * Constructs a new NGramEntry
+     * @param seq is the sequence of characters of the ngram
+     * @param count is the number of occurrences of this ngram
+     */
+    public NGramEntry(String seq, int count) {
+      this.seq = new StringBuffer(seq).subSequence(0, seq.length());
+      this.count = count;
+    }
+
+    
+    /**
+     * Returns the number of occurrences of this ngram in its profile
+     * @return the number of occurrences of this ngram in its profile
+     */
+    public int getCount() {
+      return count;
+    }
+    
+    /**
+     * Returns the frequency of this ngram in its profile
+     * @return the frequency of this ngram in its profile
+     */
+    public float getFrequency() {
+        return frequency;
+    }
+
+    /**
+     * Returns the sequence of characters of this ngram
+     * @return the sequence of characters of this ngram
+     */
+    public CharSequence getSeq() {
+      return seq;
+    }
+
+    /**
+     * Returns the size of this ngram
+     * @return the size of this ngram
+     */
+    public int size() {
+        return seq.length();
+    }
+    
+    // Inherited JavaDoc
+    public int compareTo(Object o) {
+      NGramEntry ngram = (NGramEntry) o;
+      int diff = Float.compare(ngram.getFrequency(), frequency);
+      if (diff != 0) {
+        return diff;
+      } else {
+        return (toString().compareTo(ngram.toString()));
+      }
+    }
+
+    /**
+     * Increments the number of occurrences of this ngram.
+     */
+    public void inc() {
+      count++;
+    }
+
+    /**
+     * Associates a profile with this ngram
+     * @param profile is the profile associated to this ngram
+     */
+    public void setProfile(NGramProfile profile) {
+        this.profile = profile;
+    }
+
+    /**
+     * Returns the profile associated to this ngram
+     * @return the profile associated to this ngram
+     */
+    public NGramProfile getProfile() {
+        return profile;
+    }
+
+    // Inherited JavaDoc
+    public String toString() {
+        return seq.toString();
+    }
+
+    // Inherited JavaDoc
+    public int hashCode() {
+        return seq.hashCode();
+    }
+    
+    // Inherited JavaDoc
+    public boolean equals(Object obj) {
+        
+        NGramEntry ngram = null;
+        try {
+            ngram = (NGramEntry) obj;
+            return ngram.seq.equals(seq);
+        } catch (Exception e) {
+            return false;
+        }
+    }
+
   }
 
-  /**
-   * @param name
-   *          The name to set.
-   */
-  public void setName(String name) {
-    this.name = name;
+  
+  private class QuickStringBuffer implements CharSequence {
+
+    private char value[];
+
+    private int count;
+
+    QuickStringBuffer() {
+      this(16);
+    }
+
+    QuickStringBuffer(char[] value) {
+      this.value = value;
+      count = value.length;
+    }
+    
+    QuickStringBuffer(int length) {
+      value = new char[length];
+    }
+
+    QuickStringBuffer(String str) {
+      this(str.length() + 16);
+      append(str);
+    }
+
+    public int length() {
+      return count;
+    }
+
+    private void expandCapacity(int minimumCapacity) {
+      int newCapacity = (value.length + 1) * 2;
+      if (newCapacity < 0) {
+        newCapacity = Integer.MAX_VALUE;
+      } else if (minimumCapacity > newCapacity) {
+          newCapacity = minimumCapacity;
+      }
+	
+      char newValue[] = new char[newCapacity];
+      System.arraycopy(value, 0, newValue, 0, count);
+      value = newValue;
+    }
+
+    QuickStringBuffer clear() {
+      count = 0;
+      return this;
+    }
+
+    public char charAt(int index) {
+      return value[index];
+    }
+
+    QuickStringBuffer append(String str) {
+      if (str == null) {
+        str = String.valueOf(str);
+      }
+
+      int len = str.length();
+      int newcount = count + len;
+      if (newcount > value.length) {
+        expandCapacity(newcount);
+      }
+      str.getChars(0, len, value, count);
+      count = newcount;
+      return this;
+    }
+
+    QuickStringBuffer append(char c) {
+      int newcount = count + 1;
+      if (newcount > value.length) {
+        expandCapacity(newcount);
+      }
+      value[count++] = c;
+      return this;
+    }
+
+    public CharSequence subSequence(int start, int end) {
+      return new String(value, start, end - start);
+    }
+        
+    public String toString() {
+      return new String(this.value);
+    }
   }
+  
+  
 }