You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/07/31 09:43:34 UTC
svn commit: r799534 - in /lucene/tika/trunk/tika-core/src:
main/java/org/apache/tika/language/ test/java/org/apache/tika/language/
Author: jukka
Date: Fri Jul 31 07:43:34 2009
New Revision: 799534
URL: http://svn.apache.org/viewvc?rev=799534&view=rev
Log:
TIKA-209: Language detection is weak.
Re-indent to match Java coding conventions.
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java?rev=799534&r1=799533&r2=799534&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java Fri Jul 31 07:43:34 2009
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -16,21 +16,20 @@
*/
package org.apache.tika.language;
-// JDK imports
-import java.io.File;
-import java.io.InputStream;
-import java.io.IOException;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
+import java.io.File;
import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
import java.io.InputStreamReader;
-import java.util.List;
-import java.util.Vector;
+import java.util.ArrayList;
+import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
-import java.util.ArrayList;
+import java.util.List;
import java.util.Properties;
-import java.util.Enumeration;
+import java.util.Vector;
import org.apache.tika.language.NGramProfile.NGramEntry;
@@ -44,350 +43,337 @@
* @author Jérôme Charron
*/
public class LanguageIdentifier {
-
-
- private final static int DEFAULT_ANALYSIS_LENGTH = 0; // 0 means full content
-
- private ArrayList<NGramProfile> languages = new ArrayList<NGramProfile>();
-
- private ArrayList<String> supportedLanguages = new ArrayList<String>();
-
- /** Minimum size of NGrams */
- private int minLength = NGramProfile.DEFAULT_MIN_NGRAM_LENGTH;
-
- /** Maximum size of NGrams */
- private int maxLength = NGramProfile.DEFAULT_MAX_NGRAM_LENGTH;
-
- /** The maximum amount of data to analyze */
- private int analyzeLength = DEFAULT_ANALYSIS_LENGTH;
-
- /** A global index of ngrams of all supported languages */
- private HashMap<CharSequence, NGramEntry[]> ngramsIdx = new HashMap<CharSequence, NGramEntry[]>();
-
- /** The NGramProfile used for identification */
- private NGramProfile suspect = null;
-
-
- /**
- * Constructs a new Language Identifier.
- */
- public LanguageIdentifier() {
-
- // Gets ngram sizes to take into account from the Nutch Config
- minLength = NGramProfile.DEFAULT_MIN_NGRAM_LENGTH;
- maxLength = NGramProfile.DEFAULT_MAX_NGRAM_LENGTH;
- // Ensure the min and max values are in an acceptable range
- // (ie min >= DEFAULT_MIN_NGRAM_LENGTH and max <= DEFAULT_MAX_NGRAM_LENGTH)
- maxLength = Math.min(maxLength, NGramProfile.ABSOLUTE_MAX_NGRAM_LENGTH);
- maxLength = Math.max(maxLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
- minLength = Math.max(minLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
- minLength = Math.min(minLength, maxLength);
-
- // Gets the value of the maximum size of data to analyze
- analyzeLength = DEFAULT_ANALYSIS_LENGTH;
-
- Properties p = new Properties();
- try {
- p.load(this.getClass().getResourceAsStream("langmappings.properties"));
-
- Enumeration alllanguages = p.keys();
-
- StringBuffer list = new StringBuffer("Language identifier plugin supports:");
- HashMap<NGramEntry, List<NGramEntry>> tmpIdx = new HashMap<NGramEntry, List<NGramEntry>>();
- while (alllanguages.hasMoreElements()) {
- String lang = (String) (alllanguages.nextElement());
-
- InputStream is = this.getClass().getClassLoader().getResourceAsStream(
- "org/apache/tika/language/" + lang + "." + NGramProfile.FILE_EXTENSION);
-
- if (is != null) {
- NGramProfile profile = new NGramProfile(lang, minLength, maxLength);
- try {
- profile.load(is);
- languages.add(profile);
- supportedLanguages.add(lang);
- List<NGramEntry> ngrams = profile.getSorted();
- for (int i=0; i<ngrams.size(); i++) {
- NGramEntry entry = ngrams.get(i);
- List<NGramEntry> registered = tmpIdx.get(entry);
- if (registered == null) {
- registered = new ArrayList<NGramEntry>();
- tmpIdx.put(entry, registered);
+
+ private final static int DEFAULT_ANALYSIS_LENGTH = 0; // 0 means full content
+
+ private ArrayList<NGramProfile> languages = new ArrayList<NGramProfile>();
+
+ private ArrayList<String> supportedLanguages = new ArrayList<String>();
+
+ /** Minimum size of NGrams */
+ private int minLength = NGramProfile.DEFAULT_MIN_NGRAM_LENGTH;
+
+ /** Maximum size of NGrams */
+ private int maxLength = NGramProfile.DEFAULT_MAX_NGRAM_LENGTH;
+
+ /** The maximum amount of data to analyze */
+ private int analyzeLength = DEFAULT_ANALYSIS_LENGTH;
+
+ /** A global index of ngrams of all supported languages */
+ private HashMap<CharSequence, NGramEntry[]> ngramsIdx = new HashMap<CharSequence, NGramEntry[]>();
+
+ /** The NGramProfile used for identification */
+ private NGramProfile suspect = null;
+
+ /**
+ * Constructs a new Language Identifier.
+ */
+ public LanguageIdentifier() {
+
+ // Gets ngram sizes to take into account from the Nutch Config
+ minLength = NGramProfile.DEFAULT_MIN_NGRAM_LENGTH;
+ maxLength = NGramProfile.DEFAULT_MAX_NGRAM_LENGTH;
+ // Ensure the min and max values are in an acceptable range
+ // (ie min >= DEFAULT_MIN_NGRAM_LENGTH and max <= DEFAULT_MAX_NGRAM_LENGTH)
+ maxLength = Math.min(maxLength, NGramProfile.ABSOLUTE_MAX_NGRAM_LENGTH);
+ maxLength = Math.max(maxLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
+ minLength = Math.max(minLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
+ minLength = Math.min(minLength, maxLength);
+
+ // Gets the value of the maximum size of data to analyze
+ analyzeLength = DEFAULT_ANALYSIS_LENGTH;
+
+ Properties p = new Properties();
+ try {
+ p.load(this.getClass().getResourceAsStream("langmappings.properties"));
+
+ Enumeration alllanguages = p.keys();
+
+ StringBuffer list = new StringBuffer("Language identifier plugin supports:");
+ HashMap<NGramEntry, List<NGramEntry>> tmpIdx = new HashMap<NGramEntry, List<NGramEntry>>();
+ while (alllanguages.hasMoreElements()) {
+ String lang = (String) (alllanguages.nextElement());
+
+ InputStream is = this.getClass().getClassLoader().getResourceAsStream(
+ "org/apache/tika/language/" + lang + "." + NGramProfile.FILE_EXTENSION);
+
+ if (is != null) {
+ NGramProfile profile = new NGramProfile(lang, minLength, maxLength);
+ try {
+ profile.load(is);
+ languages.add(profile);
+ supportedLanguages.add(lang);
+ List<NGramEntry> ngrams = profile.getSorted();
+ for (int i=0; i<ngrams.size(); i++) {
+ NGramEntry entry = ngrams.get(i);
+ List<NGramEntry> registered = tmpIdx.get(entry);
+ if (registered == null) {
+ registered = new ArrayList<NGramEntry>();
+ tmpIdx.put(entry, registered);
+ }
+ registered.add(entry);
+ entry.setProfile(profile);
+ }
+ list.append(" " + lang + "(" + ngrams.size() + ")");
+ is.close();
+ } catch (IOException e1) {
+ // if (LOG.isFatalEnabled()) { LOG.fatal(e1.toString()); }
+ }
}
- registered.add(entry);
- entry.setProfile(profile);
}
- list.append(" " + lang + "(" + ngrams.size() + ")");
- is.close();
- } catch (IOException e1) {
- // if (LOG.isFatalEnabled()) { LOG.fatal(e1.toString()); }
- }
- }
- }
- // transform all ngrams lists to arrays for performances
- Iterator<NGramEntry> keys = tmpIdx.keySet().iterator();
- while (keys.hasNext()) {
- NGramEntry entry = keys.next();
- List<NGramEntry> l = tmpIdx.get(entry);
- if (l != null) {
- NGramEntry[] array = l.toArray(new NGramEntry[l.size()]);
- ngramsIdx.put(entry.getSeq(), array);
+ // transform all ngrams lists to arrays for performances
+ Iterator<NGramEntry> keys = tmpIdx.keySet().iterator();
+ while (keys.hasNext()) {
+ NGramEntry entry = keys.next();
+ List<NGramEntry> l = tmpIdx.get(entry);
+ if (l != null) {
+ NGramEntry[] array = l.toArray(new NGramEntry[l.size()]);
+ ngramsIdx.put(entry.getSeq(), array);
+ }
+ }
+ // Create the suspect profile
+ suspect = new NGramProfile("suspect", minLength, maxLength);
+ } catch (Exception e) {
+ e.printStackTrace();
+ // if (LOG.isFatalEnabled()) { LOG.fatal(e.toString()); }
}
- }
- // Create the suspect profile
- suspect = new NGramProfile("suspect", minLength, maxLength);
- } catch (Exception e) {
- e.printStackTrace();
- // if (LOG.isFatalEnabled()) { LOG.fatal(e.toString()); }
}
- }
+ /**
+ * Main method used for command line process.
+ * <br/>Usage is:
+ * <pre>
+ * LanguageIdentifier [-identifyrows filename maxlines]
+ * [-identifyfile charset filename]
+ * [-identifyfileset charset files]
+ * [-identifytext text]
+ * [-identifyurl url]
+ * </pre>
+ * @param args arguments.
+ */
+ public static void main(String args[]) {
+ String usage = "Usage: LanguageIdentifier"
+ + " [-identifyrows filename maxlines]"
+ + " [-identifyfile charset filename]"
+ + " [-identifyfileset charset files]"
+ + " [-identifytext text] ";
+ int command = 0;
+
+ final int IDFILE = 1;
+ final int IDTEXT = 2;
+ final int IDFILESET = 4;
+ final int IDROWS = 5;
+
+ Vector<String> fileset = new Vector<String>();
+ String filename = "";
+ String charset = "";
+ String text = "";
+ int max = 0;
+
+ if (args.length == 0) {
+ System.err.println(usage);
+ System.exit(-1);
+ }
- /**
- * Main method used for command line process.
- * <br/>Usage is:
- * <pre>
- * LanguageIdentifier [-identifyrows filename maxlines]
- * [-identifyfile charset filename]
- * [-identifyfileset charset files]
- * [-identifytext text]
- * [-identifyurl url]
- * </pre>
- * @param args arguments.
- */
- public static void main(String args[]) {
-
- String usage = "Usage: LanguageIdentifier " +
- "[-identifyrows filename maxlines] " +
- "[-identifyfile charset filename] " +
- "[-identifyfileset charset files] " +
- "[-identifytext text] ";
- int command = 0;
-
- final int IDFILE = 1;
- final int IDTEXT = 2;
- final int IDFILESET = 4;
- final int IDROWS = 5;
-
- Vector<String> fileset = new Vector<String>();
- String filename = "";
- String charset = "";
- String text = "";
- int max = 0;
-
- if (args.length == 0) {
- System.err.println(usage);
- System.exit(-1);
- }
+ for (int i = 0; i < args.length; i++) { // parse command line
+ if (args[i].equals("-identifyfile")) {
+ command = IDFILE;
+ charset = args[++i];
+ filename = args[++i];
+ }
- for (int i = 0; i < args.length; i++) { // parse command line
- if (args[i].equals("-identifyfile")) {
- command = IDFILE;
- charset = args[++i];
- filename = args[++i];
- }
-
- if (args[i].equals("-identifyrows")) {
- command = IDROWS;
- filename = args[++i];
- max = Integer.parseInt(args[++i]);
- }
-
- if (args[i].equals("-identifytext")) {
- command = IDTEXT;
- for (i++; i < args.length - 1; i++)
- text += args[i] + " ";
- }
-
- if (args[i].equals("-identifyfileset")) {
- command = IDFILESET;
- charset = args[++i];
- for (i++; i < args.length; i++) {
- File[] files = null;
- File f = new File(args[i]);
- if (f.isDirectory()) {
- files = f.listFiles();
- } else {
- files = new File[] { f };
- }
- for (int j=0; j<files.length; j++) {
- fileset.add(files[j].getAbsolutePath());
- }
- }
- }
+ if (args[i].equals("-identifyrows")) {
+ command = IDROWS;
+ filename = args[++i];
+ max = Integer.parseInt(args[++i]);
+ }
- }
+ if (args[i].equals("-identifytext")) {
+ command = IDTEXT;
+ for (i++; i < args.length - 1; i++)
+ text += args[i] + " ";
+ }
- String lang = null;
- //LanguageIdentifier idfr = LanguageIdentifier.getInstance();
- LanguageIdentifier idfr = new LanguageIdentifier();
- File f;
- FileInputStream fis;
- try {
- switch (command) {
-
- case IDTEXT:
- lang = idfr.identify(text);
- break;
-
- case IDFILE:
- f = new File(filename);
- fis = new FileInputStream(f);
- lang = idfr.identify(fis, charset);
- fis.close();
- break;
-
- case IDROWS:
- f = new File(filename);
- BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
- String line;
- while (max > 0 && (line = br.readLine()) != null) {
- line = line.trim();
- if (line.length() > 2) {
- max--;
- lang = idfr.identify(line);
- System.out.println("R=" + lang + ":" + line);
+ if (args[i].equals("-identifyfileset")) {
+ command = IDFILESET;
+ charset = args[++i];
+ for (i++; i < args.length; i++) {
+ File[] files = null;
+ File f = new File(args[i]);
+ if (f.isDirectory()) {
+ files = f.listFiles();
+ } else {
+ files = new File[] { f };
+ }
+ for (int j=0; j<files.length; j++) {
+ fileset.add(files[j].getAbsolutePath());
+ }
+ }
}
- }
- br.close();
- System.exit(0);
- break;
-
- case IDFILESET:
- /* used for benchs
- for (int j=128; j<=524288; j*=2) {
- long start = System.currentTimeMillis();
- idfr.analyzeLength = j; */
- System.out.println("FILESET");
- Iterator<String> i = fileset.iterator();
- while (i.hasNext()) {
- try {
- filename = i.next();
- f = new File(filename);
- fis = new FileInputStream(f);
- lang = idfr.identify(fis, charset);
- fis.close();
- } catch (Exception e) {
- System.out.println(e);
+ }
+
+ String lang = null;
+ //LanguageIdentifier idfr = LanguageIdentifier.getInstance();
+ LanguageIdentifier idfr = new LanguageIdentifier();
+ File f;
+ FileInputStream fis;
+ try {
+ switch (command) {
+
+ case IDTEXT:
+ lang = idfr.identify(text);
+ break;
+
+ case IDFILE:
+ f = new File(filename);
+ fis = new FileInputStream(f);
+ lang = idfr.identify(fis, charset);
+ fis.close();
+ break;
+
+ case IDROWS:
+ f = new File(filename);
+ BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
+ String line;
+ while (max > 0 && (line = br.readLine()) != null) {
+ line = line.trim();
+ if (line.length() > 2) {
+ max--;
+ lang = idfr.identify(line);
+ System.out.println("R=" + lang + ":" + line);
+ }
+ }
+
+ br.close();
+ System.exit(0);
+ break;
+
+ case IDFILESET:
+ System.out.println("FILESET");
+ Iterator<String> i = fileset.iterator();
+ while (i.hasNext()) {
+ try {
+ filename = i.next();
+ f = new File(filename);
+ fis = new FileInputStream(f);
+ lang = idfr.identify(fis, charset);
+ fis.close();
+ } catch (Exception e) {
+ System.out.println(e);
+ }
+ System.out.println(filename + " was identified as " + lang);
+ }
+ System.exit(0);
+ break;
}
- System.out.println(filename + " was identified as " + lang);
- }
- /* used for benchs
- System.out.println(j + "/" + (System.currentTimeMillis()-start));
- } */
- System.exit(0);
- break;
- }
- } catch (Exception e) {
- System.out.println(e);
+ } catch (Exception e) {
+ System.out.println(e);
+ }
+ System.out.println("text was identified as " + lang);
}
- System.out.println("text was identified as " + lang);
- }
- /**
- * Identify language of a content.
- *
- * @param content is the content to analyze.
- * @return The 2 letter
- * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
- * language code</a> (en, fi, sv, ...) of the language that best
- * matches the specified content.
- */
- public String identify(String content) {
- return identify(new StringBuilder(content));
- }
-
- /**
- * Identify language of a content.
- *
- * @param content is the content to analyze.
- * @return The 2 letter
- * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
- * language code</a> (en, fi, sv, ...) of the language that best
- * matches the specified content.
- */
- public String identify(StringBuilder content) {
-
- StringBuilder text = content;
- if ((analyzeLength > 0) && (content.length() > analyzeLength)) {
- text = new StringBuilder().append(content);
- text.setLength(analyzeLength);
+ /**
+ * Identify language of a content.
+ *
+ * @param content is the content to analyze.
+ * @return The 2 letter
+ * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * language code</a> (en, fi, sv, ...) of the language that best
+ * matches the specified content.
+ */
+ public String identify(String content) {
+ return identify(new StringBuilder(content));
}
- suspect.analyze(text);
- Iterator<NGramEntry> iter = suspect.getSorted().iterator();
- float topscore = Float.MIN_VALUE;
- String lang = "";
- HashMap<NGramProfile, Float> scores = new HashMap<NGramProfile, Float>();
- NGramEntry searched = null;
-
- while (iter.hasNext()) {
- searched = iter.next();
- NGramEntry[] ngrams = ngramsIdx.get(searched.getSeq());
- if (ngrams != null) {
- for (int j=0; j<ngrams.length; j++) {
- NGramProfile profile = ngrams[j].getProfile();
- Float pScore = scores.get(profile);
- if (pScore == null) {
- pScore = new Float(0);
- }
- float plScore = pScore.floatValue();
- plScore += ngrams[j].getFrequency() + searched.getFrequency();
- scores.put(profile, new Float(plScore));
- if (plScore > topscore) {
- topscore = plScore;
- lang = profile.getName();
+ /**
+ * Identify language of a content.
+ *
+ * @param content is the content to analyze.
+ * @return The 2 letter
+ * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * language code</a> (en, fi, sv, ...) of the language that best
+ * matches the specified content.
+ */
+ public String identify(StringBuilder content) {
+ StringBuilder text = content;
+ if ((analyzeLength > 0) && (content.length() > analyzeLength)) {
+ text = new StringBuilder().append(content);
+ text.setLength(analyzeLength);
+ }
+
+ suspect.analyze(text);
+ Iterator<NGramEntry> iter = suspect.getSorted().iterator();
+ float topscore = Float.MIN_VALUE;
+ String lang = "";
+ HashMap<NGramProfile, Float> scores = new HashMap<NGramProfile, Float>();
+ NGramEntry searched = null;
+
+ while (iter.hasNext()) {
+ searched = iter.next();
+ NGramEntry[] ngrams = ngramsIdx.get(searched.getSeq());
+ if (ngrams != null) {
+ for (int j=0; j<ngrams.length; j++) {
+ NGramProfile profile = ngrams[j].getProfile();
+ Float pScore = scores.get(profile);
+ if (pScore == null) {
+ pScore = new Float(0);
+ }
+ float plScore = pScore.floatValue();
+ plScore += ngrams[j].getFrequency() + searched.getFrequency();
+ scores.put(profile, new Float(plScore));
+ if (plScore > topscore) {
+ topscore = plScore;
+ lang = profile.getName();
+ }
}
}
}
+ return lang;
+ }
+
+ /**
+ * Identify language from input stream.
+ * This method uses the platform default encoding to read the input stream.
+ * For using a specific encoding, use the
+ * {@link #identify(InputStream, String)} method.
+ *
+ * @param is is the input stream to analyze.
+ * @return The 2 letter
+ * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * language code</a> (en, fi, sv, ...) of the language that best
+ * matches the content of the specified input stream.
+ * @throws IOException if something wrong occurs on the input stream.
+ */
+ public String identify(InputStream is) throws IOException {
+ return identify(is, null);
}
- return lang;
- }
- /**
- * Identify language from input stream.
- * This method uses the platform default encoding to read the input stream.
- * For using a specific encoding, use the
- * {@link #identify(InputStream, String)} method.
- *
- * @param is is the input stream to analyze.
- * @return The 2 letter
- * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
- * language code</a> (en, fi, sv, ...) of the language that best
- * matches the content of the specified input stream.
- * @throws IOException if something wrong occurs on the input stream.
- */
- public String identify(InputStream is) throws IOException {
- return identify(is, null);
- }
-
- /**
- * Identify language from input stream.
- *
- * @param is is the input stream to analyze.
- * @param charset is the charset to use to read the input stream.
- * @return The 2 letter
- * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
- * language code</a> (en, fi, sv, ...) of the language that best
- * matches the content of the specified input stream.
- * @throws IOException if something wrong occurs on the input stream.
- */
- public String identify(InputStream is, String charset) throws IOException {
-
- ByteArrayOutputStream out = new ByteArrayOutputStream();
- byte[] buffer = new byte[2048];
- int len = 0;
-
- while (((len = is.read(buffer)) != -1) &&
- ((analyzeLength == 0) || (out.size() < analyzeLength))) {
- if (analyzeLength != 0) {
- len = Math.min(len, analyzeLength - out.size());
- }
- out.write(buffer, 0, len);
+ /**
+ * Identify language from input stream.
+ *
+ * @param is is the input stream to analyze.
+ * @param charset is the charset to use to read the input stream.
+ * @return The 2 letter
+ * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * language code</a> (en, fi, sv, ...) of the language that best
+ * matches the content of the specified input stream.
+ * @throws IOException if something wrong occurs on the input stream.
+ */
+ public String identify(InputStream is, String charset) throws IOException {
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ byte[] buffer = new byte[2048];
+ int len = 0;
+
+ while (((len = is.read(buffer)) != -1) &&
+ ((analyzeLength == 0) || (out.size() < analyzeLength))) {
+ if (analyzeLength != 0) {
+ len = Math.min(len, analyzeLength - out.size());
+ }
+ out.write(buffer, 0, len);
+ }
+ return identify((charset == null) ? out.toString()
+ : out.toString(charset));
}
- return identify((charset == null) ? out.toString()
- : out.toString(charset));
- }
}
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java?rev=799534&r1=799533&r2=799534&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java Fri Jul 31 07:43:34 2009
@@ -16,22 +16,21 @@
*/
package org.apache.tika.language;
-// JDK imports
-import java.io.File;
-import java.io.InputStream;
-import java.io.IOException;
-import java.io.OutputStream;
+import java.io.BufferedInputStream;
import java.io.BufferedReader;
+import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
import java.io.InputStreamReader;
-import java.io.BufferedInputStream;
-import java.util.Date;
-import java.util.List;
-import java.util.Iterator;
+import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.Date;
import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
import java.util.Map;
/**
@@ -47,671 +46,651 @@
*/
public class NGramProfile {
- /** The minimum length allowed for a ngram. */
- final static int ABSOLUTE_MIN_NGRAM_LENGTH = 1;
+ /** The minimum length allowed for a ngram. */
+ final static int ABSOLUTE_MIN_NGRAM_LENGTH = 1;
- /** The maximum length allowed for a ngram. */
- final static int ABSOLUTE_MAX_NGRAM_LENGTH = 4;
-
- /** The default min length of ngram */
- final static int DEFAULT_MIN_NGRAM_LENGTH = 3;
-
- /** The default max length of ngram */
- final static int DEFAULT_MAX_NGRAM_LENGTH = 3;
-
- /** The ngram profile file extension */
- static final String FILE_EXTENSION = "ngp";
-
- /** The profile max size (number of ngrams of the same size) */
- static final int MAX_SIZE = 1000;
-
- /** separator char */
- static final char SEPARATOR = '_';
- /** The String form of the separator char */
- private final static String SEP_CHARSEQ = new String(new char[] { SEPARATOR });
-
-
- /** The profile's name */
- private String name = null;
-
- /** The NGrams of this profile sorted on the number of occurrences */
- private List<NGramEntry> sorted = null;
-
- /** The min length of ngram */
- private int minLength = DEFAULT_MIN_NGRAM_LENGTH;
-
- /** The max length of ngram */
- private int maxLength = DEFAULT_MAX_NGRAM_LENGTH;
-
- /** The total number of ngrams occurrences */
- private int[] ngramcounts = null;
-
- /** An index of the ngrams of the profile */
- private Map<CharSequence, NGramEntry> ngrams = null;
-
- /** A StringBuffer used during analysis */
- private QuickStringBuffer word = new QuickStringBuffer();
-
-
- /**
- * Construct a new ngram profile
- *
- * @param name is the name of the profile
- * @param minlen is the min length of ngram sequences
- * @param maxlen is the max length of ngram sequences
- */
- public NGramProfile(String name, int minlen, int maxlen) {
- // TODO: Compute the initial capacity using minlen and maxlen.
- this.ngrams = new HashMap<CharSequence, NGramEntry>(4000);
- this.minLength = minlen;
- this.maxLength = maxlen;
- this.name = name;
- }
-
- /**
- * @return Returns the name.
- */
- public String getName() {
- return name;
- }
-
- /**
- * Add ngrams from a single word to this profile
- *
- * @param word is the word to add
- */
- public void add(StringBuffer word) {
- for (int i=minLength; (i <= maxLength) && (i < word.length()); i++) {
- add(word, i);
- }
- }
-
- /**
- * Add the last NGrams from the specified word.
- */
- private void add(QuickStringBuffer word) {
- int wlen = word.length();
- if (wlen >= minLength) {
- int max = Math.min(maxLength, wlen);
- for (int i=minLength; i<=max; i++) {
- add(word.subSequence(wlen-i, wlen));
- }
- }
- }
-
- /**
- * Add ngrams from a single word in this profile
- *
- * @param word is the word to add
- * @param n is the ngram size
- */
- private void add(CharSequence cs) {
-
- if (cs.equals(SEP_CHARSEQ)) { return; }
- NGramEntry nge = ngrams.get(cs);
- if (nge == null) {
- nge = new NGramEntry(cs);
- ngrams.put(cs, nge);
- }
- nge.inc();
- }
-
- /**
- * Analyze a piece of text
- *
- * @param text the text to be analyzed
- */
- public void analyze(StringBuilder text) {
-
- if (ngrams != null) {
- ngrams.clear();
- sorted = null;
- ngramcounts = null;
- }
-
- word.clear().append(SEPARATOR);
- for (int i = 0; i < text.length(); i++) {
- char c = Character.toLowerCase(text.charAt(i));
-
- if (Character.isLetter(c)) {
- add(word.append(c));
- } else {
- //found word boundary
- if (word.length() > 1) {
- //we have a word!
- add(word.append(SEPARATOR));
- word.clear().append(SEPARATOR);
- }
- }
- }
-
- if (word.length() > 1) {
- //we have a word!
- add(word.append(SEPARATOR));
- }
- normalize();
- }
-
- /**
- * @param word
- * @param n sequence length
- */
- private void add(StringBuffer word, int n) {
- for (int i=0; i <= word.length()-n; i++) {
- add(word.subSequence(i, i + n));
- }
- }
-
- /**
- * Normalize the profile (calculates the ngrams frequencies)
- */
- protected void normalize() {
-
- NGramEntry e = null;
- //List sorted = getSorted();
- Iterator<NGramEntry> i = ngrams.values().iterator();
-
- // Calculate ngramcount if not already done
- if (ngramcounts == null) {
- ngramcounts = new int[maxLength+1];
- while (i.hasNext()) {
- e = i.next();
- ngramcounts[e.size()] += e.count;
- }
- }
-
- i = ngrams.values().iterator();
- while (i.hasNext()) {
- e = i.next();
- e.frequency = (float) e.count / (float) ngramcounts[e.size()];
- }
- }
-
- /**
- * Return a sorted list of ngrams (sort done by 1. frequency 2. sequence)
- *
- * @return sorted vector of ngrams
- */
- public List<NGramEntry> getSorted() {
- // make sure sorting is done only once
- if (sorted == null) {
- sorted = new ArrayList<NGramEntry>(ngrams.values());
- Collections.sort(sorted);
-
- // trim at NGRAM_LENGTH entries
- if (sorted.size() > MAX_SIZE) {
- sorted = sorted.subList(0, MAX_SIZE);
- }
- }
- return sorted;
- }
-
- // Inherited JavaDoc
- public String toString() {
-
- StringBuffer s = new StringBuffer().append("NGramProfile: ")
- .append(name).append("\n");
-
- Iterator<NGramEntry> i = getSorted().iterator();
-
- while (i.hasNext()) {
- NGramEntry entry = i.next();
- s.append("[").append(entry.seq)
- .append("/").append(entry.count)
- .append("/").append(entry.frequency).append("]\n");
- }
- return s.toString();
- }
-
- /**
- * Calculate a score how well NGramProfiles match each other
- *
- * @param another
- * ngram profile to compare against
- * @return similarity 0=exact match
- */
- public float getSimilarity(NGramProfile another) {
-
- float sum = 0;
-
- try {
- Iterator<NGramEntry> i = another.getSorted().iterator();
- while (i.hasNext()) {
- NGramEntry other = i.next();
- if (ngrams.containsKey(other.seq)) {
- sum += Math.abs((other.frequency -
- ngrams.get(other.seq).frequency)) / 2;
- } else {
- sum += other.frequency;
- }
- }
- i = getSorted().iterator();
- while (i.hasNext()) {
- NGramEntry other = i.next();
- if (another.ngrams.containsKey(other.seq)) {
- sum += Math.abs((other.frequency -
- another.ngrams.get(other.seq).frequency)) / 2;
- } else {
- sum += other.frequency;
- }
- }
- } catch (Exception e) {
- // if (LOG.isFatalEnabled()) { LOG.fatal(e.toString()); }
- }
- return sum;
- }
-
- /**
- * Loads a ngram profile from an InputStream
- * (assumes UTF-8 encoded content)
- * @param is the InputStream to read
- */
- public void load(InputStream is) throws IOException {
-
- ngrams.clear();
- ngramcounts = new int[maxLength+1];
- BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
- String line = null;
-
- while ((line = reader.readLine()) != null) {
-
- // # starts a comment line
- if (line.charAt(0) != '#') {
- int spacepos = line.indexOf(' ');
- String ngramsequence = line.substring(0, spacepos).trim();
- int len = ngramsequence.length();
- if ((len >= minLength) && (len <= maxLength)) {
- int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
- NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
- ngrams.put(en.getSeq(), en);
- ngramcounts[len] += ngramcount;
- }
- }
- }
- normalize();
- }
-
- /**
- * Create a new Language profile from (preferably quite large) text file
- *
- * @param name is the name of the profile
- * @param is is the stream to read
- * @param encoding is the encoding of stream
- */
- public static NGramProfile create(String name, InputStream is, String encoding) {
-
- NGramProfile newProfile = new NGramProfile(name, ABSOLUTE_MIN_NGRAM_LENGTH,
- ABSOLUTE_MAX_NGRAM_LENGTH);
- BufferedInputStream bis = new BufferedInputStream(is);
-
- byte buffer[] = new byte[4096];
- StringBuilder text = new StringBuilder();
- int len;
-
- try {
- while ((len = bis.read(buffer)) != -1) {
- text.append(new String(buffer, 0, len, encoding));
- }
- } catch (IOException e) {
- // e.printStackTrace(LogUtil.getWarnStream(LOG));
- }
-
- newProfile.analyze(text);
- return newProfile;
- }
-
- /**
- * Writes NGramProfile content into OutputStream, content is outputted with
- * UTF-8 encoding
- *
- * @param os the Stream to output to
- * @throws IOException
- */
- public void save(OutputStream os) throws IOException {
-
- // Write header
- os.write(("# NgramProfile generated at " + new Date() +
- " for Nutch Language Identification\n").getBytes());
-
- // And then each ngram
-
- // First dispatch ngrams in many lists depending on their size
- // (one list for each size, in order to store MAX_SIZE ngrams for each
- // size of ngram)
- List<NGramEntry> list = new ArrayList<NGramEntry>();
- List<NGramEntry> sublist = new ArrayList<NGramEntry>();
- NGramEntry[] entries = ngrams.values().toArray(new NGramEntry[ngrams.size()]);
- for (int i=minLength; i<=maxLength; i++) {
- for (int j=0; j<entries.length; j++) {
- if (entries[j].getSeq().length() == i) {
- sublist.add(entries[j]);
- }
- }
- Collections.sort(sublist);
- if (sublist.size() > MAX_SIZE) {
- sublist = sublist.subList(0, MAX_SIZE);
- }
- list.addAll(sublist);
- sublist.clear();
- }
- for (int i=0; i<list.size(); i++) {
- NGramEntry e = list.get(i);
- String line = e.toString() + " " + e.getCount() + "\n";
- os.write(line.getBytes("UTF-8"));
- }
- os.flush();
- }
-
- /**
- * main method used for testing only
- *
- * @param args
- */
- public static void main(String args[]) {
-
- String usage = "Usage: NGramProfile " +
- "[-create profilename filename encoding] " +
- "[-similarity file1 file2] "+
- "[-score profile-name filename encoding]";
- int command = 0;
-
- final int CREATE = 1;
- final int SIMILARITY = 2;
- final int SCORE = 3;
-
- String profilename = "";
- String filename = "";
- String filename2 = "";
- String encoding = "";
-
- if (args.length == 0) {
- System.err.println(usage);
- System.exit(-1);
- }
-
- for (int i = 0; i < args.length; i++) { // parse command line
- if (args[i].equals("-create")) { // found -create option
- command = CREATE;
- profilename = args[++i];
- filename = args[++i];
- encoding = args[++i];
- }
-
- if (args[i].equals("-similarity")) { // found -similarity option
- command = SIMILARITY;
- filename = args[++i];
- filename2 = args[++i];
- encoding = args[++i];
- }
-
- if (args[i].equals("-score")) { // found -Score option
- command = SCORE;
- profilename = args[++i];
- filename = args[++i];
- encoding = args[++i];
- }
- }
-
- try {
-
- switch (command) {
-
- case CREATE:
-
- File f = new File(filename);
- FileInputStream fis = new FileInputStream(f);
- NGramProfile newProfile = NGramProfile.create(profilename, fis, encoding);
- fis.close();
- f = new File(profilename + "." + FILE_EXTENSION);
- FileOutputStream fos = new FileOutputStream(f);
- newProfile.save(fos);
- System.out.println("new profile " + profilename + "." + FILE_EXTENSION + " was created.");
- break;
-
- case SIMILARITY:
-
- f = new File(filename);
- fis = new FileInputStream(f);
- newProfile = NGramProfile.create(filename, fis, encoding);
- newProfile.normalize();
-
- f = new File(filename2);
- fis = new FileInputStream(f);
- NGramProfile newProfile2 = NGramProfile.create(filename2, fis, encoding);
- newProfile2.normalize();
- System.out.println("Similarity is " + newProfile.getSimilarity(newProfile2));
- break;
-
- case SCORE:
- f = new File(filename);
- fis = new FileInputStream(f);
- newProfile = NGramProfile.create(filename, fis, encoding);
-
- f = new File(profilename + "." + FILE_EXTENSION);
- fis = new FileInputStream(f);
- NGramProfile compare = new NGramProfile(profilename,
- DEFAULT_MIN_NGRAM_LENGTH,
- DEFAULT_MAX_NGRAM_LENGTH);
- compare.load(fis);
- System.out.println("Score is " + compare.getSimilarity(newProfile));
- break;
-
- }
-
- } catch (Exception e) {
- }
- }
-
-
- /**
- * Inner class that describes a NGram
- */
- class NGramEntry implements Comparable<NGramEntry> {
-
- /** The NGRamProfile this NGram is related to */
- private NGramProfile profile = null;
-
- /** The sequence of characters of the ngram */
- CharSequence seq = null;
-
- /** The number of occurences of this ngram in its profile */
- private int count = 0;
-
- /** The frequency of this ngram in its profile */
- private float frequency = 0.0F;
-
-
- /**
- * Constructs a new NGramEntry
- * @param seq is the sequence of characters of the ngram
- */
- public NGramEntry(CharSequence seq) {
- this.seq = seq;
- }
-
- /**
- * Constructs a new NGramEntry
- * @param seq is the sequence of characters of the ngram
- * @param count is the number of occurences of this ngram
- */
- public NGramEntry(String seq, int count) {
- this.seq = new StringBuffer(seq).subSequence(0, seq.length());
- this.count = count;
- }
-
-
- /**
- * Returns the number of occurences of this ngram in its profile
- * @return the number of occurences of this ngram in its profile
- */
- public int getCount() {
- return count;
- }
-
- /**
- * Returns the frequency of this ngram in its profile
- * @return the frequency of this ngram in its profile
- */
- public float getFrequency() {
- return frequency;
- }
-
- /**
- * Returns the sequence of characters of this ngram
- * @return the sequence of characters of this ngram
- */
- public CharSequence getSeq() {
- return seq;
+ /** The maximum length allowed for a ngram. */
+ final static int ABSOLUTE_MAX_NGRAM_LENGTH = 4;
+
+ /** The default min length of ngram */
+ final static int DEFAULT_MIN_NGRAM_LENGTH = 3;
+
+ /** The default max length of ngram */
+ final static int DEFAULT_MAX_NGRAM_LENGTH = 3;
+
+ /** The ngram profile file extension */
+ static final String FILE_EXTENSION = "ngp";
+
+ /** The profile max size (number of ngrams of the same size) */
+ static final int MAX_SIZE = 1000;
+
+ /** separator char */
+ static final char SEPARATOR = '_';
+ /** The String form of the separator char */
+ private final static String SEP_CHARSEQ = new String(new char[] { SEPARATOR });
+
+
+ /** The profile's name */
+ private String name = null;
+
+ /** The NGrams of this profile sorted on the number of occurrences */
+ private List<NGramEntry> sorted = null;
+
+ /** The min length of ngram */
+ private int minLength = DEFAULT_MIN_NGRAM_LENGTH;
+
+ /** The max length of ngram */
+ private int maxLength = DEFAULT_MAX_NGRAM_LENGTH;
+
+ /** The total number of ngram occurrences */
+ private int[] ngramcounts = null;
+
+ /** An index of the ngrams of the profile */
+ private Map<CharSequence, NGramEntry> ngrams = null;
+
+ /** A StringBuffer used during analysis */
+ private QuickStringBuffer word = new QuickStringBuffer();
+
+
+ /**
+ * Construct a new ngram profile
+ *
+ * @param name is the name of the profile
+ * @param minlen is the min length of ngram sequences
+ * @param maxlen is the max length of ngram sequences
+ */
+ public NGramProfile(String name, int minlen, int maxlen) {
+ // TODO: Compute the initial capacity using minlen and maxlen.
+ this.ngrams = new HashMap<CharSequence, NGramEntry>(4000);
+ this.minLength = minlen;
+ this.maxLength = maxlen;
+ this.name = name;
}
/**
- * Returns the size of this ngram
- * @return the size of this ngram
+ * @return Returns the name.
*/
- public int size() {
- return seq.length();
+ public String getName() {
+ return name;
}
-
- // Inherited JavaDoc
- public int compareTo(NGramEntry ngram) {
- int diff = Float.compare(ngram.getFrequency(), frequency);
- if (diff != 0) {
- return diff;
- } else {
- return (toString().compareTo(ngram.toString()));
- }
+
+ /**
+ * Add ngrams from a single word to this profile
+ *
+ * @param word is the word to add
+ */
+ public void add(StringBuffer word) {
+ for (int i=minLength; (i <= maxLength) && (i < word.length()); i++) {
+ add(word, i);
+ }
}
/**
- * Increments the number of occurences of this ngram.
+ * Add the last NGrams from the specified word.
*/
- public void inc() {
- count++;
+ private void add(QuickStringBuffer word) {
+ int wlen = word.length();
+ if (wlen >= minLength) {
+ int max = Math.min(maxLength, wlen);
+ for (int i=minLength; i<=max; i++) {
+ add(word.subSequence(wlen-i, wlen));
+ }
+ }
}
/**
- * Associated a profile to this ngram
- * @param profile is the profile associated to this ngram
+ * Add a single ngram (character sequence) to this profile
+ *
+ * @param cs is the ngram character sequence to add
*/
- public void setProfile(NGramProfile profile) {
- this.profile = profile;
+ private void add(CharSequence cs) {
+ if (cs.equals(SEP_CHARSEQ)) { return; }
+ NGramEntry nge = ngrams.get(cs);
+ if (nge == null) {
+ nge = new NGramEntry(cs);
+ ngrams.put(cs, nge);
+ }
+ nge.inc();
}
/**
- * Returns the profile associated to this ngram
- * @return the profile associated to this ngram
+ * Analyze a piece of text
+ *
+ * @param text the text to be analyzed
*/
- public NGramProfile getProfile() {
- return profile;
+ public void analyze(StringBuilder text) {
+ if (ngrams != null) {
+ ngrams.clear();
+ sorted = null;
+ ngramcounts = null;
+ }
+
+ word.clear().append(SEPARATOR);
+ for (int i = 0; i < text.length(); i++) {
+ char c = Character.toLowerCase(text.charAt(i));
+
+ if (Character.isLetter(c)) {
+ add(word.append(c));
+ } else {
+ //found word boundary
+ if (word.length() > 1) {
+ //we have a word!
+ add(word.append(SEPARATOR));
+ word.clear().append(SEPARATOR);
+ }
+ }
+ }
+
+ if (word.length() > 1) {
+ //we have a word!
+ add(word.append(SEPARATOR));
+ }
+ normalize();
}
- // Inherited JavaDoc
- public String toString() {
- return seq.toString();
+ /**
+ * @param word is the word to extract ngram sequences from
+ * @param n is the length of the ngram sequences to extract
+ */
+ private void add(StringBuffer word, int n) {
+ for (int i=0; i <= word.length()-n; i++) {
+ add(word.subSequence(i, i + n));
+ }
}
- // Inherited JavaDoc
- public int hashCode() {
- return seq.hashCode();
+ /**
+ * Normalize the profile (calculates the ngrams frequencies)
+ */
+ protected void normalize() {
+ NGramEntry e = null;
+ //List sorted = getSorted();
+ Iterator<NGramEntry> i = ngrams.values().iterator();
+
+ // Calculate ngramcount if not already done
+ if (ngramcounts == null) {
+ ngramcounts = new int[maxLength+1];
+ while (i.hasNext()) {
+ e = i.next();
+ ngramcounts[e.size()] += e.count;
+ }
+ }
+
+ i = ngrams.values().iterator();
+ while (i.hasNext()) {
+ e = i.next();
+ e.frequency = (float) e.count / (float) ngramcounts[e.size()];
+ }
}
-
- // Inherited JavaDoc
- public boolean equals(Object obj) {
-
- NGramEntry ngram = null;
- try {
- ngram = (NGramEntry) obj;
- return ngram.seq.equals(seq);
- } catch (Exception e) {
- return false;
+
+ /**
+ * Return a sorted list of ngrams (sort done by 1. frequency 2. sequence)
+ *
+ * @return sorted vector of ngrams
+ */
+ public List<NGramEntry> getSorted() {
+ // make sure sorting is done only once
+ if (sorted == null) {
+ sorted = new ArrayList<NGramEntry>(ngrams.values());
+ Collections.sort(sorted);
+
+ // trim at NGRAM_LENGTH entries
+ if (sorted.size() > MAX_SIZE) {
+ sorted = sorted.subList(0, MAX_SIZE);
+ }
}
+ return sorted;
}
- }
+ // Inherited JavaDoc
+ public String toString() {
+ StringBuffer s = new StringBuffer().append("NGramProfile: ")
+ .append(name).append("\n");
-
- private class QuickStringBuffer implements CharSequence {
+ Iterator<NGramEntry> i = getSorted().iterator();
- private char value[];
+ while (i.hasNext()) {
+ NGramEntry entry = i.next();
+ s.append("[").append(entry.seq)
+ .append("/").append(entry.count)
+ .append("/").append(entry.frequency).append("]\n");
+ }
+ return s.toString();
+ }
- private int count;
+ /**
+ * Calculate a score how well NGramProfiles match each other
+ *
+ * @param another
+ * ngram profile to compare against
+ * @return similarity 0=exact match
+ */
+ public float getSimilarity(NGramProfile another) {
+ float sum = 0;
- QuickStringBuffer() {
- this(16);
+ try {
+ Iterator<NGramEntry> i = another.getSorted().iterator();
+ while (i.hasNext()) {
+ NGramEntry other = i.next();
+ if (ngrams.containsKey(other.seq)) {
+ sum += Math.abs((other.frequency -
+ ngrams.get(other.seq).frequency)) / 2;
+ } else {
+ sum += other.frequency;
+ }
+ }
+ i = getSorted().iterator();
+ while (i.hasNext()) {
+ NGramEntry other = i.next();
+ if (another.ngrams.containsKey(other.seq)) {
+ sum += Math.abs((other.frequency -
+ another.ngrams.get(other.seq).frequency)) / 2;
+ } else {
+ sum += other.frequency;
+ }
+ }
+ } catch (Exception e) {
+ // if (LOG.isFatalEnabled()) { LOG.fatal(e.toString()); }
+ }
+ return sum;
}
- QuickStringBuffer(char[] value) {
- this.value = value;
- count = value.length;
- }
-
- QuickStringBuffer(int length) {
- value = new char[length];
+ /**
+ * Loads a ngram profile from an InputStream
+ * (assumes UTF-8 encoded content)
+ * @param is the InputStream to read
+ */
+ public void load(InputStream is) throws IOException {
+ ngrams.clear();
+ ngramcounts = new int[maxLength+1];
+ BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+ String line = null;
+
+ while ((line = reader.readLine()) != null) {
+
+ // # starts a comment line
+ if (line.charAt(0) != '#') {
+ int spacepos = line.indexOf(' ');
+ String ngramsequence = line.substring(0, spacepos).trim();
+ int len = ngramsequence.length();
+ if ((len >= minLength) && (len <= maxLength)) {
+ int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
+ NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
+ ngrams.put(en.getSeq(), en);
+ ngramcounts[len] += ngramcount;
+ }
+ }
+ }
+ normalize();
}
- QuickStringBuffer(String str) {
- this(str.length() + 16);
- append(str);
- }
+ /**
+ * Create a new Language profile from (preferably quite large) text file
+ *
+ * @param name is the name of the profile
+ * @param is is the stream to read
+ * @param encoding is the encoding of stream
+ */
+ public static NGramProfile create(String name, InputStream is, String encoding) {
+ NGramProfile newProfile = new NGramProfile(name, ABSOLUTE_MIN_NGRAM_LENGTH,
+ ABSOLUTE_MAX_NGRAM_LENGTH);
+ BufferedInputStream bis = new BufferedInputStream(is);
+
+ byte buffer[] = new byte[4096];
+ StringBuilder text = new StringBuilder();
+ int len;
- public int length() {
- return count;
- }
+ try {
+ while ((len = bis.read(buffer)) != -1) {
+ text.append(new String(buffer, 0, len, encoding));
+ }
+ } catch (IOException e) {
+ // e.printStackTrace(LogUtil.getWarnStream(LOG));
+ }
- private void expandCapacity(int minimumCapacity) {
- int newCapacity = (value.length + 1) * 2;
- if (newCapacity < 0) {
- newCapacity = Integer.MAX_VALUE;
- } else if (minimumCapacity > newCapacity) {
- newCapacity = minimumCapacity;
- }
-
- char newValue[] = new char[newCapacity];
- System.arraycopy(value, 0, newValue, 0, count);
- value = newValue;
+ newProfile.analyze(text);
+ return newProfile;
}
- QuickStringBuffer clear() {
- count = 0;
- return this;
+ /**
+ * Writes NGramProfile content into OutputStream; content is written
+ * using UTF-8 encoding
+ *
+ * @param os the Stream to output to
+ * @throws IOException
+ */
+ public void save(OutputStream os) throws IOException {
+ // Write header
+ os.write(("# NgramProfile generated at " + new Date() +
+ " for Nutch Language Identification\n").getBytes());
+
+ // And then each ngram
+
+ // First dispatch ngrams in many lists depending on their size
+ // (one list for each size, in order to store MAX_SIZE ngrams for each
+ // size of ngram)
+ List<NGramEntry> list = new ArrayList<NGramEntry>();
+ List<NGramEntry> sublist = new ArrayList<NGramEntry>();
+ NGramEntry[] entries = ngrams.values().toArray(new NGramEntry[ngrams.size()]);
+ for (int i=minLength; i<=maxLength; i++) {
+ for (int j=0; j<entries.length; j++) {
+ if (entries[j].getSeq().length() == i) {
+ sublist.add(entries[j]);
+ }
+ }
+ Collections.sort(sublist);
+ if (sublist.size() > MAX_SIZE) {
+ sublist = sublist.subList(0, MAX_SIZE);
+ }
+ list.addAll(sublist);
+ sublist.clear();
+ }
+ for (int i=0; i<list.size(); i++) {
+ NGramEntry e = list.get(i);
+ String line = e.toString() + " " + e.getCount() + "\n";
+ os.write(line.getBytes("UTF-8"));
+ }
+ os.flush();
}
- public char charAt(int index) {
- return value[index];
- }
+ /**
+ * main method used for testing only
+ *
+ * @param args
+ */
+ public static void main(String args[]) {
+ String usage = "Usage: NGramProfile"
+ + " [-create profilename filename encoding]"
+ + " [-similarity file1 file2]"
+ + " [-score profile-name filename encoding]";
+ int command = 0;
+
+ final int CREATE = 1;
+ final int SIMILARITY = 2;
+ final int SCORE = 3;
+
+ String profilename = "";
+ String filename = "";
+ String filename2 = "";
+ String encoding = "";
+
+ if (args.length == 0) {
+ System.err.println(usage);
+ System.exit(-1);
+ }
- QuickStringBuffer append(String str) {
- if (str == null) {
- str = String.valueOf(str);
- }
+ for (int i = 0; i < args.length; i++) { // parse command line
+ if (args[i].equals("-create")) { // found -create option
+ command = CREATE;
+ profilename = args[++i];
+ filename = args[++i];
+ encoding = args[++i];
+ }
+
+ if (args[i].equals("-similarity")) { // found -similarity option
+ command = SIMILARITY;
+ filename = args[++i];
+ filename2 = args[++i];
+ encoding = args[++i];
+ }
+
+ if (args[i].equals("-score")) { // found -Score option
+ command = SCORE;
+ profilename = args[++i];
+ filename = args[++i];
+ encoding = args[++i];
+ }
+ }
- int len = str.length();
- int newcount = count + len;
- if (newcount > value.length) {
- expandCapacity(newcount);
- }
- str.getChars(0, len, value, count);
- count = newcount;
- return this;
+ try {
+ switch (command) {
+ case CREATE:
+ File f = new File(filename);
+ FileInputStream fis = new FileInputStream(f);
+ NGramProfile newProfile = NGramProfile.create(profilename, fis, encoding);
+ fis.close();
+ f = new File(profilename + "." + FILE_EXTENSION);
+ FileOutputStream fos = new FileOutputStream(f);
+ newProfile.save(fos);
+ System.out.println("new profile " + profilename + "." + FILE_EXTENSION + " was created.");
+ break;
+
+ case SIMILARITY:
+ f = new File(filename);
+ fis = new FileInputStream(f);
+ newProfile = NGramProfile.create(filename, fis, encoding);
+ newProfile.normalize();
+
+ f = new File(filename2);
+ fis = new FileInputStream(f);
+ NGramProfile newProfile2 = NGramProfile.create(filename2, fis, encoding);
+ newProfile2.normalize();
+ System.out.println("Similarity is " + newProfile.getSimilarity(newProfile2));
+ break;
+
+ case SCORE:
+ f = new File(filename);
+ fis = new FileInputStream(f);
+ newProfile = NGramProfile.create(filename, fis, encoding);
+
+ f = new File(profilename + "." + FILE_EXTENSION);
+ fis = new FileInputStream(f);
+ NGramProfile compare = new NGramProfile(profilename,
+ DEFAULT_MIN_NGRAM_LENGTH,
+ DEFAULT_MAX_NGRAM_LENGTH);
+ compare.load(fis);
+ System.out.println("Score is " + compare.getSimilarity(newProfile));
+ break;
+ }
+ } catch (Exception e) {
+ }
}
- QuickStringBuffer append(char c) {
- int newcount = count + 1;
- if (newcount > value.length) {
- expandCapacity(newcount);
- }
- value[count++] = c;
- return this;
- }
+ /**
+ * Inner class that describes a NGram
+ */
+ class NGramEntry implements Comparable<NGramEntry> {
+
+ /** The NGramProfile this NGram is related to */
+ private NGramProfile profile = null;
+
+ /** The sequence of characters of the ngram */
+ CharSequence seq = null;
+
+ /** The number of occurrences of this ngram in its profile */
+ private int count = 0;
+
+ /** The frequency of this ngram in its profile */
+ private float frequency = 0.0F;
+
+ /**
+ * Constructs a new NGramEntry
+ * @param seq is the sequence of characters of the ngram
+ */
+ public NGramEntry(CharSequence seq) {
+ this.seq = seq;
+ }
+
+ /**
+ * Constructs a new NGramEntry
+ * @param seq is the sequence of characters of the ngram
+ * @param count is the number of occurrences of this ngram
+ */
+ public NGramEntry(String seq, int count) {
+ this.seq = new StringBuffer(seq).subSequence(0, seq.length());
+ this.count = count;
+ }
+
+ /**
+ * Returns the number of occurrences of this ngram in its profile
+ * @return the number of occurrences of this ngram in its profile
+ */
+ public int getCount() {
+ return count;
+ }
+
+ /**
+ * Returns the frequency of this ngram in its profile
+ * @return the frequency of this ngram in its profile
+ */
+ public float getFrequency() {
+ return frequency;
+ }
+
+ /**
+ * Returns the sequence of characters of this ngram
+ * @return the sequence of characters of this ngram
+ */
+ public CharSequence getSeq() {
+ return seq;
+ }
+
+ /**
+ * Returns the size of this ngram
+ * @return the size of this ngram
+ */
+ public int size() {
+ return seq.length();
+ }
+
+ // Inherited JavaDoc
+ public int compareTo(NGramEntry ngram) {
+ int diff = Float.compare(ngram.getFrequency(), frequency);
+ if (diff != 0) {
+ return diff;
+ } else {
+ return (toString().compareTo(ngram.toString()));
+ }
+ }
+
+ /**
+ * Increments the number of occurrences of this ngram.
+ */
+ public void inc() {
+ count++;
+ }
+
+ /**
+ * Associates a profile with this ngram
+ * @param profile is the profile associated to this ngram
+ */
+ public void setProfile(NGramProfile profile) {
+ this.profile = profile;
+ }
+
+ /**
+ * Returns the profile associated to this ngram
+ * @return the profile associated to this ngram
+ */
+ public NGramProfile getProfile() {
+ return profile;
+ }
+
+ // Inherited JavaDoc
+ public String toString() {
+ return seq.toString();
+ }
+
+ // Inherited JavaDoc
+ public int hashCode() {
+ return seq.hashCode();
+ }
+
+ // Inherited JavaDoc
+ public boolean equals(Object obj) {
+
+ NGramEntry ngram = null;
+ try {
+ ngram = (NGramEntry) obj;
+ return ngram.seq.equals(seq);
+ } catch (Exception e) {
+ return false;
+ }
+ }
- public CharSequence subSequence(int start, int end) {
- return new String(value, start, end - start);
}
-
- public String toString() {
- return new String(this.value);
+
+ private class QuickStringBuffer implements CharSequence {
+
+ private char value[];
+
+ private int count;
+
+ QuickStringBuffer() {
+ this(16);
+ }
+
+ QuickStringBuffer(char[] value) {
+ this.value = value;
+ count = value.length;
+ }
+
+ QuickStringBuffer(int length) {
+ value = new char[length];
+ }
+
+ QuickStringBuffer(String str) {
+ this(str.length() + 16);
+ append(str);
+ }
+
+ public int length() {
+ return count;
+ }
+
+ private void expandCapacity(int minimumCapacity) {
+ int newCapacity = (value.length + 1) * 2;
+ if (newCapacity < 0) {
+ newCapacity = Integer.MAX_VALUE;
+ } else if (minimumCapacity > newCapacity) {
+ newCapacity = minimumCapacity;
+ }
+
+ char newValue[] = new char[newCapacity];
+ System.arraycopy(value, 0, newValue, 0, count);
+ value = newValue;
+ }
+
+ QuickStringBuffer clear() {
+ count = 0;
+ return this;
+ }
+
+ public char charAt(int index) {
+ return value[index];
+ }
+
+ QuickStringBuffer append(String str) {
+ if (str == null) {
+ str = String.valueOf(str);
+ }
+
+ int len = str.length();
+ int newcount = count + len;
+ if (newcount > value.length) {
+ expandCapacity(newcount);
+ }
+ str.getChars(0, len, value, count);
+ count = newcount;
+ return this;
+ }
+
+ QuickStringBuffer append(char c) {
+ int newcount = count + 1;
+ if (newcount > value.length) {
+ expandCapacity(newcount);
+ }
+ value[count++] = c;
+ return this;
+ }
+
+ public CharSequence subSequence(int start, int end) {
+ return new String(value, start, end - start);
+ }
+
+ public String toString() {
+ return new String(this.value);
+ }
}
- }
-
-
+
}
Modified: lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java?rev=799534&r1=799533&r2=799534&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java (original)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java Fri Jul 31 07:43:34 2009
@@ -16,22 +16,20 @@
*/
package org.apache.tika.language;
-// JDK imports
-import java.io.InputStream;
import java.io.BufferedReader;
-import java.io.InputStreamReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
import java.util.List;
-import org.apache.tika.language.NGramProfile.NGramEntry;
-
-// JUnit imports
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.textui.TestRunner;
+import org.apache.tika.language.NGramProfile.NGramEntry;
+
/**
* JUnit based test of class {@link LanguageIdentifier}.
*
@@ -39,8 +37,7 @@
* @author Jerome Charron - http://frutch.free.fr/
*/
public class TestLanguageIdentifier extends TestCase {
-
-
+
public TestLanguageIdentifier(String testName) {
super(testName);
}
@@ -48,144 +45,141 @@
public static Test suite() {
return new TestSuite(TestLanguageIdentifier.class);
}
-
+
public static void main(String[] args) {
TestRunner.run(suite());
}
- String tokencontent1 = "testaddtoken";
- String tokencontent2 = "anotherteststring";
+ String tokencontent1 = "testaddtoken";
+ String tokencontent2 = "anotherteststring";
+
+ int[] counts1 = { 3, 2, 2, 1, 1, 1, 1, 1 };
- int[] counts1 = { 3, 2, 2, 1, 1, 1, 1, 1 };
+ String[] chars1 = { "t", "d", "e", "a", "k", "n", "o", "s" };
- String[] chars1 = { "t", "d", "e", "a", "k", "n", "o", "s" };
+ /**
+ * Test analyze method
+ */
+ public void testAnalyze() {
+ String tokencontent = "testmeagain";
- /**
- * Test analyze method
- */
- public void testAnalyze() {
- String tokencontent = "testmeagain";
+ NGramProfile p = new NGramProfile("test", 1, 1);
+ p.analyze(new StringBuilder(tokencontent));
- NGramProfile p = new NGramProfile("test", 1, 1);
- p.analyze(new StringBuilder(tokencontent));
+ //test that profile size is ok, eg 8 different NGramEntries "tesmagin"
+ assertEquals(8, p.getSorted().size());
+ }
- //test that profile size is ok, eg 9 different NGramEntries "tesmagin"
- assertEquals(8, p.getSorted().size());
- }
+ /**
+ * Test addNGrams method with StringBuffer argument
+ *
+ */
+ public void testAddNGramsStringBuffer() {
+ String tokencontent = "testmeagain";
- /**
- * Test addNGrams method with StringBuffer argument
- *
- */
- public void testAddNGramsStringBuffer() {
- String tokencontent = "testmeagain";
+ NGramProfile p = new NGramProfile("test", 1, 1);
+ p.add(new StringBuffer(tokencontent));
- NGramProfile p = new NGramProfile("test", 1, 1);
- p.add(new StringBuffer(tokencontent));
+ //test that profile size is ok, eg 8 different NGramEntries "tesmagin"
+ assertEquals(8, p.getSorted().size());
- //test that profile size is ok, eg 8 different NGramEntries "tesmagin"
- assertEquals(8, p.getSorted().size());
+ }
- }
+ /**
+ * test getSorted method
+ */
+ public void testGetSorted() {
+ int[] count = { 4, 3, 1 };
+ String[] ngram = { "a", "b", "c" };
- /**
- * test getSorted method
- */
- public void testGetSorted() {
- int[] count = { 4, 3, 1 };
- String[] ngram = { "a", "b", "c" };
+ String teststring = "AAaaBbbC";
- String teststring = "AAaaBbbC";
+ NGramProfile p = new NGramProfile("test", 1, 1);
+ p.analyze(new StringBuilder(teststring));
- NGramProfile p = new NGramProfile("test", 1, 1);
- p.analyze(new StringBuilder(teststring));
+ //test size of profile
+ assertEquals(3, p.getSorted().size());
- //test size of profile
- assertEquals(3, p.getSorted().size());
+ testCounts(p.getSorted(), count);
+ testContents(p.getSorted(), ngram);
+ }
- testCounts(p.getSorted(), count);
- testContents(p.getSorted(), ngram);
+ public void testGetSimilarity() {
+ NGramProfile a = new NGramProfile("a", 1, 1);
+ NGramProfile b = new NGramProfile("b", 1, 1);
- }
+ a.analyze(new StringBuilder(tokencontent1));
+ b.analyze(new StringBuilder(tokencontent2));
- public void testGetSimilarity() {
- NGramProfile a = new NGramProfile("a", 1, 1);
- NGramProfile b = new NGramProfile("b", 1, 1);
-
- a.analyze(new StringBuilder(tokencontent1));
- b.analyze(new StringBuilder(tokencontent2));
+ //because of rounding errors might slightly return different results
+ assertEquals(a.getSimilarity(b), b.getSimilarity(a), 0.0000002);
+ }
- //because of rounding errors might slightly return different results
- assertEquals(a.getSimilarity(b), b.getSimilarity(a), 0.0000002);
+ public void testExactMatch() {
+ NGramProfile a = new NGramProfile("a", 1, 1);
- }
+ a.analyze(new StringBuilder(tokencontent1));
- public void testExactMatch() {
- NGramProfile a = new NGramProfile("a", 1, 1);
-
- a.analyze(new StringBuilder(tokencontent1));
+ assertEquals(a.getSimilarity(a), 0, 0);
+ }
- assertEquals(a.getSimilarity(a), 0, 0);
- }
+ public void testIO() {
+ //Create profile and set some contents
+ NGramProfile a = new NGramProfile("a", 1, 1);
+ a.analyze(new StringBuilder(this.tokencontent1));
-
- public void testIO() {
- //Create profile and set some contents
- NGramProfile a = new NGramProfile("a", 1, 1);
- a.analyze(new StringBuilder(this.tokencontent1));
+ NGramProfile b = new NGramProfile("a_from_inputstream", 1, 1);
- NGramProfile b = new NGramProfile("a_from_inputstream", 1, 1);
+ //save profile
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
- //save profile
- ByteArrayOutputStream os = new ByteArrayOutputStream();
+ try {
+ a.save(os);
+ os.close();
+ } catch (Exception e) {
+ fail();
+ }
- try {
- a.save(os);
- os.close();
- } catch (Exception e) {
- fail();
- }
+ //load profile
+ InputStream is = new ByteArrayInputStream(os.toByteArray());
+ try {
+ b.load(is);
+ is.close();
+ } catch (Exception e) {
+ fail();
+ }
- //load profile
- InputStream is = new ByteArrayInputStream(os.toByteArray());
- try {
- b.load(is);
- is.close();
- } catch (Exception e) {
- fail();
+ //check it
+ testCounts(b.getSorted(), counts1);
+ testContents(b.getSorted(), chars1);
}
- //check it
- testCounts(b.getSorted(), counts1);
- testContents(b.getSorted(), chars1);
- }
+ private void testContents(List<NGramEntry> entries, String contents[]) {
+ int c = 0;
- private void testContents(List<NGramEntry> entries, String contents[]) {
- int c = 0;
-
- for (NGramEntry nge : entries) {
- assertEquals(contents[c], nge.getSeq().toString());
- c++;
+ for (NGramEntry nge : entries) {
+ assertEquals(contents[c], nge.getSeq().toString());
+ c++;
+ }
}
- }
- private void testCounts(List<NGramEntry> entries, int counts[]) {
- int c = 0;
+ private void testCounts(List<NGramEntry> entries, int counts[]) {
+ int c = 0;
- for (NGramEntry nge : entries) {
- System.out.println(nge);
- assertEquals(counts[c], nge.getCount());
- c++;
+ for (NGramEntry nge : entries) {
+ System.out.println(nge);
+ assertEquals(counts[c], nge.getCount());
+ c++;
+ }
}
- }
public void testIdentify() {
try {
long total = 0;
LanguageIdentifier idfr = new LanguageIdentifier();
BufferedReader in = new BufferedReader(new InputStreamReader(
- this.getClass().getResourceAsStream("test-referencial.txt")));
+ this.getClass().getResourceAsStream("test-referencial.txt")));
String line = null;
while((line = in.readLine()) != null) {
String[] tokens = line.split(";");
@@ -198,7 +192,7 @@
// Then, each line of the file...
BufferedReader testFile = new BufferedReader(
new InputStreamReader(
- this.getClass().getResourceAsStream(tokens[0]), "UTF-8"));
+ this.getClass().getResourceAsStream(tokens[0]), "UTF-8"));
String testLine = null;
while((testLine = testFile.readLine()) != null) {
testLine = testLine.trim();
Modified: lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java?rev=799534&r1=799533&r2=799534&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java (original)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java Fri Jul 31 07:43:34 2009
@@ -21,121 +21,115 @@
import java.io.InputStream;
import java.util.List;
-import org.apache.tika.language.NGramProfile.NGramEntry;
-
import junit.framework.TestCase;
-public class TestNGramProfile extends TestCase {
-
- String tokencontent1 = "testaddtoken";
- String tokencontent2 = "anotherteststring";
-
- int[] counts1 = { 3, 2, 2, 1, 1, 1, 1, 1 };
-
- String[] chars1 = { "t", "d", "e", "a", "k", "n", "o", "s" };
-
-
- /**
- * Test analyze method
- */
- public void testAnalyze() {
- String tokencontent = "testmeagain";
-
- NGramProfile p = new NGramProfile("test", 1, 1);
- p.analyze(new StringBuilder(tokencontent));
-
- //test that profile size is ok, eg 8 different NGramEntries "tesmagin"
- assertEquals(8, p.getSorted().size());
- }
-
- /**
- * test getSorted method
- */
- public void testGetSorted() {
- int[] count = { 4, 3, 1 };
- String[] ngram = { "a", "b", "c" };
-
- String teststring = "AAaaBbbC";
+import org.apache.tika.language.NGramProfile.NGramEntry;
- NGramProfile p = new NGramProfile("test", 1, 1);
- p.analyze(new StringBuilder(teststring));
+public class TestNGramProfile extends TestCase {
- //test size of profile
- assertEquals(3, p.getSorted().size());
+ String tokencontent1 = "testaddtoken";
+ String tokencontent2 = "anotherteststring";
- testCounts(p.getSorted(), count);
- testContents(p.getSorted(), ngram);
+ int[] counts1 = { 3, 2, 2, 1, 1, 1, 1, 1 };
- }
+ String[] chars1 = { "t", "d", "e", "a", "k", "n", "o", "s" };
- public void testGetSimilarity() {
- NGramProfile a = new NGramProfile("a", 1, 1);
- NGramProfile b = new NGramProfile("b", 1, 1);
-
- a.analyze(new StringBuilder(tokencontent1));
- b.analyze(new StringBuilder(tokencontent2));
+ /**
+ * Test analyze method
+ */
+ public void testAnalyze() {
+ String tokencontent = "testmeagain";
- //because of rounding errors might slightly return different results
- assertEquals(a.getSimilarity(b), b.getSimilarity(a), 0.0000002);
+ NGramProfile p = new NGramProfile("test", 1, 1);
+ p.analyze(new StringBuilder(tokencontent));
- }
+ //test that profile size is ok, eg 8 different NGramEntries "tesmagin"
+ assertEquals(8, p.getSorted().size());
+ }
- public void testExactMatch() {
- NGramProfile a = new NGramProfile("a", 1, 1);
-
- a.analyze(new StringBuilder(tokencontent1));
+ /**
+ * test getSorted method
+ */
+ public void testGetSorted() {
+ int[] count = { 4, 3, 1 };
+ String[] ngram = { "a", "b", "c" };
+
+ String teststring = "AAaaBbbC";
- assertEquals(a.getSimilarity(a), 0, 0);
+ NGramProfile p = new NGramProfile("test", 1, 1);
+ p.analyze(new StringBuilder(teststring));
- }
+ //test size of profile
+ assertEquals(3, p.getSorted().size());
-
- public void testIO() {
- //Create profile and set some contents
- NGramProfile a = new NGramProfile("a", 1, 1);
- a.analyze(new StringBuilder(this.tokencontent1));
+ testCounts(p.getSorted(), count);
+ testContents(p.getSorted(), ngram);
+ }
- NGramProfile b = new NGramProfile("a_from_inputstream", 1, 1);
+ public void testGetSimilarity() {
+ NGramProfile a = new NGramProfile("a", 1, 1);
+ NGramProfile b = new NGramProfile("b", 1, 1);
- //save profile
- ByteArrayOutputStream os = new ByteArrayOutputStream();
+ a.analyze(new StringBuilder(tokencontent1));
+ b.analyze(new StringBuilder(tokencontent2));
- try {
- a.save(os);
- os.close();
- } catch (Exception e) {
- fail();
+ //because of rounding errors might slightly return different results
+ assertEquals(a.getSimilarity(b), b.getSimilarity(a), 0.0000002);
}
- //load profile
- InputStream is = new ByteArrayInputStream(os.toByteArray());
- try {
- b.load(is);
- is.close();
- } catch (Exception e) {
- fail();
+ public void testExactMatch() {
+ NGramProfile a = new NGramProfile("a", 1, 1);
+ a.analyze(new StringBuilder(tokencontent1));
+ assertEquals(a.getSimilarity(a), 0, 0);
}
- //check it
- testCounts(b.getSorted(), counts1);
- testContents(b.getSorted(), chars1);
- }
+ public void testIO() {
+ //Create profile and set some contents
+ NGramProfile a = new NGramProfile("a", 1, 1);
+ a.analyze(new StringBuilder(this.tokencontent1));
+
+ NGramProfile b = new NGramProfile("a_from_inputstream", 1, 1);
+
+ //save profile
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+
+ try {
+ a.save(os);
+ os.close();
+ } catch (Exception e) {
+ fail();
+ }
+
+ //load profile
+ InputStream is = new ByteArrayInputStream(os.toByteArray());
+ try {
+ b.load(is);
+ is.close();
+ } catch (Exception e) {
+ fail();
+ }
+
+ //check it
+ testCounts(b.getSorted(), counts1);
+ testContents(b.getSorted(), chars1);
+ }
- private void testContents(List<NGramEntry> entries, String contents[]) {
- int c = 0;
+ private void testContents(List<NGramEntry> entries, String contents[]) {
+ int c = 0;
- for (NGramEntry nge : entries) {
- assertEquals(contents[c], nge.getSeq().toString());
- c++;
+ for (NGramEntry nge : entries) {
+ assertEquals(contents[c], nge.getSeq().toString());
+ c++;
+ }
}
- }
- private void testCounts(List<NGramEntry> entries, int counts[]) {
- int c = 0;
+ private void testCounts(List<NGramEntry> entries, int counts[]) {
+ int c = 0;
- for (NGramEntry nge : entries) {
- assertEquals(counts[c], nge.getCount());
- c++;
+ for (NGramEntry nge : entries) {
+ assertEquals(counts[c], nge.getCount());
+ c++;
+ }
}
- }
+
}