You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/07/31 09:43:34 UTC
svn commit: r799534 - in /lucene/tika/trunk/tika-core/src:
main/java/org/apache/tika/language/ test/java/org/apache/tika/language/
Author: jukka
Date: Fri Jul 31 07:43:34 2009
New Revision: 799534
URL: http://svn.apache.org/viewvc?rev=799534&view=rev
Log:
TIKA-209: Language detection is weak.
Re-indent to match Java coding conventions.
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java?rev=799534&r1=799533&r2=799534&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java Fri Jul 31 07:43:34 2009
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -16,21 +16,20 @@
*/
package org.apache.tika.language;
-// JDK imports
-import java.io.File;
-import java.io.InputStream;
-import java.io.IOException;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
+import java.io.File;
import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
import java.io.InputStreamReader;
-import java.util.List;
-import java.util.Vector;
+import java.util.ArrayList;
+import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
-import java.util.ArrayList;
+import java.util.List;
import java.util.Properties;
-import java.util.Enumeration;
+import java.util.Vector;
import org.apache.tika.language.NGramProfile.NGramEntry;
@@ -44,350 +43,337 @@
* @author Jérôme Charron
*/
public class LanguageIdentifier {
-
-
- private final static int DEFAULT_ANALYSIS_LENGTH = 0; // 0 means full content
-
- private ArrayList<NGramProfile> languages = new ArrayList<NGramProfile>();
-
- private ArrayList<String> supportedLanguages = new ArrayList<String>();
-
- /** Minimum size of NGrams */
- private int minLength = NGramProfile.DEFAULT_MIN_NGRAM_LENGTH;
-
- /** Maximum size of NGrams */
- private int maxLength = NGramProfile.DEFAULT_MAX_NGRAM_LENGTH;
-
- /** The maximum amount of data to analyze */
- private int analyzeLength = DEFAULT_ANALYSIS_LENGTH;
-
- /** A global index of ngrams of all supported languages */
- private HashMap<CharSequence, NGramEntry[]> ngramsIdx = new HashMap<CharSequence, NGramEntry[]>();
-
- /** The NGramProfile used for identification */
- private NGramProfile suspect = null;
-
-
- /**
- * Constructs a new Language Identifier.
- */
- public LanguageIdentifier() {
-
- // Gets ngram sizes to take into account from the Nutch Config
- minLength = NGramProfile.DEFAULT_MIN_NGRAM_LENGTH;
- maxLength = NGramProfile.DEFAULT_MAX_NGRAM_LENGTH;
- // Ensure the min and max values are in an acceptable range
- // (ie min >= DEFAULT_MIN_NGRAM_LENGTH and max <= DEFAULT_MAX_NGRAM_LENGTH)
- maxLength = Math.min(maxLength, NGramProfile.ABSOLUTE_MAX_NGRAM_LENGTH);
- maxLength = Math.max(maxLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
- minLength = Math.max(minLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
- minLength = Math.min(minLength, maxLength);
-
- // Gets the value of the maximum size of data to analyze
- analyzeLength = DEFAULT_ANALYSIS_LENGTH;
-
- Properties p = new Properties();
- try {
- p.load(this.getClass().getResourceAsStream("langmappings.properties"));
-
- Enumeration alllanguages = p.keys();
-
- StringBuffer list = new StringBuffer("Language identifier plugin supports:");
- HashMap<NGramEntry, List<NGramEntry>> tmpIdx = new HashMap<NGramEntry, List<NGramEntry>>();
- while (alllanguages.hasMoreElements()) {
- String lang = (String) (alllanguages.nextElement());
-
- InputStream is = this.getClass().getClassLoader().getResourceAsStream(
- "org/apache/tika/language/" + lang + "." + NGramProfile.FILE_EXTENSION);
-
- if (is != null) {
- NGramProfile profile = new NGramProfile(lang, minLength, maxLength);
- try {
- profile.load(is);
- languages.add(profile);
- supportedLanguages.add(lang);
- List<NGramEntry> ngrams = profile.getSorted();
- for (int i=0; i<ngrams.size(); i++) {
- NGramEntry entry = ngrams.get(i);
- List<NGramEntry> registered = tmpIdx.get(entry);
- if (registered == null) {
- registered = new ArrayList<NGramEntry>();
- tmpIdx.put(entry, registered);
+
+ private final static int DEFAULT_ANALYSIS_LENGTH = 0; // 0 means full content
+
+ private ArrayList<NGramProfile> languages = new ArrayList<NGramProfile>();
+
+ private ArrayList<String> supportedLanguages = new ArrayList<String>();
+
+ /** Minimum size of NGrams */
+ private int minLength = NGramProfile.DEFAULT_MIN_NGRAM_LENGTH;
+
+ /** Maximum size of NGrams */
+ private int maxLength = NGramProfile.DEFAULT_MAX_NGRAM_LENGTH;
+
+ /** The maximum amount of data to analyze */
+ private int analyzeLength = DEFAULT_ANALYSIS_LENGTH;
+
+ /** A global index of ngrams of all supported languages */
+ private HashMap<CharSequence, NGramEntry[]> ngramsIdx = new HashMap<CharSequence, NGramEntry[]>();
+
+ /** The NGramProfile used for identification */
+ private NGramProfile suspect = null;
+
+ /**
+ * Constructs a new Language Identifier.
+ */
+ public LanguageIdentifier() {
+
+ // Gets ngram sizes to take into account from the Nutch Config
+ minLength = NGramProfile.DEFAULT_MIN_NGRAM_LENGTH;
+ maxLength = NGramProfile.DEFAULT_MAX_NGRAM_LENGTH;
+ // Ensure the min and max values are in an acceptable range
+ // (ie min >= DEFAULT_MIN_NGRAM_LENGTH and max <= DEFAULT_MAX_NGRAM_LENGTH)
+ maxLength = Math.min(maxLength, NGramProfile.ABSOLUTE_MAX_NGRAM_LENGTH);
+ maxLength = Math.max(maxLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
+ minLength = Math.max(minLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
+ minLength = Math.min(minLength, maxLength);
+
+ // Gets the value of the maximum size of data to analyze
+ analyzeLength = DEFAULT_ANALYSIS_LENGTH;
+
+ Properties p = new Properties();
+ try {
+ p.load(this.getClass().getResourceAsStream("langmappings.properties"));
+
+ Enumeration alllanguages = p.keys();
+
+ StringBuffer list = new StringBuffer("Language identifier plugin supports:");
+ HashMap<NGramEntry, List<NGramEntry>> tmpIdx = new HashMap<NGramEntry, List<NGramEntry>>();
+ while (alllanguages.hasMoreElements()) {
+ String lang = (String) (alllanguages.nextElement());
+
+ InputStream is = this.getClass().getClassLoader().getResourceAsStream(
+ "org/apache/tika/language/" + lang + "." + NGramProfile.FILE_EXTENSION);
+
+ if (is != null) {
+ NGramProfile profile = new NGramProfile(lang, minLength, maxLength);
+ try {
+ profile.load(is);
+ languages.add(profile);
+ supportedLanguages.add(lang);
+ List<NGramEntry> ngrams = profile.getSorted();
+ for (int i=0; i<ngrams.size(); i++) {
+ NGramEntry entry = ngrams.get(i);
+ List<NGramEntry> registered = tmpIdx.get(entry);
+ if (registered == null) {
+ registered = new ArrayList<NGramEntry>();
+ tmpIdx.put(entry, registered);
+ }
+ registered.add(entry);
+ entry.setProfile(profile);
+ }
+ list.append(" " + lang + "(" + ngrams.size() + ")");
+ is.close();
+ } catch (IOException e1) {
+ // if (LOG.isFatalEnabled()) { LOG.fatal(e1.toString()); }
+ }
}
- registered.add(entry);
- entry.setProfile(profile);
}
- list.append(" " + lang + "(" + ngrams.size() + ")");
- is.close();
- } catch (IOException e1) {
- // if (LOG.isFatalEnabled()) { LOG.fatal(e1.toString()); }
- }
- }
- }
- // transform all ngrams lists to arrays for performances
- Iterator<NGramEntry> keys = tmpIdx.keySet().iterator();
- while (keys.hasNext()) {
- NGramEntry entry = keys.next();
- List<NGramEntry> l = tmpIdx.get(entry);
- if (l != null) {
- NGramEntry[] array = l.toArray(new NGramEntry[l.size()]);
- ngramsIdx.put(entry.getSeq(), array);
+ // transform all ngrams lists to arrays for performances
+ Iterator<NGramEntry> keys = tmpIdx.keySet().iterator();
+ while (keys.hasNext()) {
+ NGramEntry entry = keys.next();
+ List<NGramEntry> l = tmpIdx.get(entry);
+ if (l != null) {
+ NGramEntry[] array = l.toArray(new NGramEntry[l.size()]);
+ ngramsIdx.put(entry.getSeq(), array);
+ }
+ }
+ // Create the suspect profile
+ suspect = new NGramProfile("suspect", minLength, maxLength);
+ } catch (Exception e) {
+ e.printStackTrace();
+ // if (LOG.isFatalEnabled()) { LOG.fatal(e.toString()); }
}
- }
- // Create the suspect profile
- suspect = new NGramProfile("suspect", minLength, maxLength);
- } catch (Exception e) {
- e.printStackTrace();
- // if (LOG.isFatalEnabled()) { LOG.fatal(e.toString()); }
}
- }
+ /**
+ * Main method used for command line process.
+ * <br/>Usage is:
+ * <pre>
+ * LanguageIdentifier [-identifyrows filename maxlines]
+ * [-identifyfile charset filename]
+ * [-identifyfileset charset files]
+ * [-identifytext text]
+ * [-identifyurl url]
+ * </pre>
+ * @param args arguments.
+ */
+ public static void main(String args[]) {
+ String usage = "Usage: LanguageIdentifier"
+ + " [-identifyrows filename maxlines]"
+ + " [-identifyfile charset filename]"
+ + " [-identifyfileset charset files]"
+ + " [-identifytext text] ";
+ int command = 0;
+
+ final int IDFILE = 1;
+ final int IDTEXT = 2;
+ final int IDFILESET = 4;
+ final int IDROWS = 5;
+
+ Vector<String> fileset = new Vector<String>();
+ String filename = "";
+ String charset = "";
+ String text = "";
+ int max = 0;
+
+ if (args.length == 0) {
+ System.err.println(usage);
+ System.exit(-1);
+ }
- /**
- * Main method used for command line process.
- * <br/>Usage is:
- * <pre>
- * LanguageIdentifier [-identifyrows filename maxlines]
- * [-identifyfile charset filename]
- * [-identifyfileset charset files]
- * [-identifytext text]
- * [-identifyurl url]
- * </pre>
- * @param args arguments.
- */
- public static void main(String args[]) {
-
- String usage = "Usage: LanguageIdentifier " +
- "[-identifyrows filename maxlines] " +
- "[-identifyfile charset filename] " +
- "[-identifyfileset charset files] " +
- "[-identifytext text] ";
- int command = 0;
-
- final int IDFILE = 1;
- final int IDTEXT = 2;
- final int IDFILESET = 4;
- final int IDROWS = 5;
-
- Vector<String> fileset = new Vector<String>();
- String filename = "";
- String charset = "";
- String text = "";
- int max = 0;
-
- if (args.length == 0) {
- System.err.println(usage);
- System.exit(-1);
- }
+ for (int i = 0; i < args.length; i++) { // parse command line
+ if (args[i].equals("-identifyfile")) {
+ command = IDFILE;
+ charset = args[++i];
+ filename = args[++i];
+ }
- for (int i = 0; i < args.length; i++) { // parse command line
- if (args[i].equals("-identifyfile")) {
- command = IDFILE;
- charset = args[++i];
- filename = args[++i];
- }
-
- if (args[i].equals("-identifyrows")) {
- command = IDROWS;
- filename = args[++i];
- max = Integer.parseInt(args[++i]);
- }
-
- if (args[i].equals("-identifytext")) {
- command = IDTEXT;
- for (i++; i < args.length - 1; i++)
- text += args[i] + " ";
- }
-
- if (args[i].equals("-identifyfileset")) {
- command = IDFILESET;
- charset = args[++i];
- for (i++; i < args.length; i++) {
- File[] files = null;
- File f = new File(args[i]);
- if (f.isDirectory()) {
- files = f.listFiles();
- } else {
- files = new File[] { f };
- }
- for (int j=0; j<files.length; j++) {
- fileset.add(files[j].getAbsolutePath());
- }
- }
- }
+ if (args[i].equals("-identifyrows")) {
+ command = IDROWS;
+ filename = args[++i];
+ max = Integer.parseInt(args[++i]);
+ }
- }
+ if (args[i].equals("-identifytext")) {
+ command = IDTEXT;
+ for (i++; i < args.length - 1; i++)
+ text += args[i] + " ";
+ }
- String lang = null;
- //LanguageIdentifier idfr = LanguageIdentifier.getInstance();
- LanguageIdentifier idfr = new LanguageIdentifier();
- File f;
- FileInputStream fis;
- try {
- switch (command) {
-
- case IDTEXT:
- lang = idfr.identify(text);
- break;
-
- case IDFILE:
- f = new File(filename);
- fis = new FileInputStream(f);
- lang = idfr.identify(fis, charset);
- fis.close();
- break;
-
- case IDROWS:
- f = new File(filename);
- BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
- String line;
- while (max > 0 && (line = br.readLine()) != null) {
- line = line.trim();
- if (line.length() > 2) {
- max--;
- lang = idfr.identify(line);
- System.out.println("R=" + lang + ":" + line);
+ if (args[i].equals("-identifyfileset")) {
+ command = IDFILESET;
+ charset = args[++i];
+ for (i++; i < args.length; i++) {
+ File[] files = null;
+ File f = new File(args[i]);
+ if (f.isDirectory()) {
+ files = f.listFiles();
+ } else {
+ files = new File[] { f };
+ }
+ for (int j=0; j<files.length; j++) {
+ fileset.add(files[j].getAbsolutePath());
+ }
+ }
}
- }
- br.close();
- System.exit(0);
- break;
-
- case IDFILESET:
- /* used for benchs
- for (int j=128; j<=524288; j*=2) {
- long start = System.currentTimeMillis();
- idfr.analyzeLength = j; */
- System.out.println("FILESET");
- Iterator<String> i = fileset.iterator();
- while (i.hasNext()) {
- try {
- filename = i.next();
- f = new File(filename);
- fis = new FileInputStream(f);
- lang = idfr.identify(fis, charset);
- fis.close();
- } catch (Exception e) {
- System.out.println(e);
+ }
+
+ String lang = null;
+ //LanguageIdentifier idfr = LanguageIdentifier.getInstance();
+ LanguageIdentifier idfr = new LanguageIdentifier();
+ File f;
+ FileInputStream fis;
+ try {
+ switch (command) {
+
+ case IDTEXT:
+ lang = idfr.identify(text);
+ break;
+
+ case IDFILE:
+ f = new File(filename);
+ fis = new FileInputStream(f);
+ lang = idfr.identify(fis, charset);
+ fis.close();
+ break;
+
+ case IDROWS:
+ f = new File(filename);
+ BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
+ String line;
+ while (max > 0 && (line = br.readLine()) != null) {
+ line = line.trim();
+ if (line.length() > 2) {
+ max--;
+ lang = idfr.identify(line);
+ System.out.println("R=" + lang + ":" + line);
+ }
+ }
+
+ br.close();
+ System.exit(0);
+ break;
+
+ case IDFILESET:
+ System.out.println("FILESET");
+ Iterator<String> i = fileset.iterator();
+ while (i.hasNext()) {
+ try {
+ filename = i.next();
+ f = new File(filename);
+ fis = new FileInputStream(f);
+ lang = idfr.identify(fis, charset);
+ fis.close();
+ } catch (Exception e) {
+ System.out.println(e);
+ }
+ System.out.println(filename + " was identified as " + lang);
+ }
+ System.exit(0);
+ break;
}
- System.out.println(filename + " was identified as " + lang);
- }
- /* used for benchs
- System.out.println(j + "/" + (System.currentTimeMillis()-start));
- } */
- System.exit(0);
- break;
- }
- } catch (Exception e) {
- System.out.println(e);
+ } catch (Exception e) {
+ System.out.println(e);
+ }
+ System.out.println("text was identified as " + lang);
}
- System.out.println("text was identified as " + lang);
- }
- /**
- * Identify language of a content.
- *
- * @param content is the content to analyze.
- * @return The 2 letter
- * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
- * language code</a> (en, fi, sv, ...) of the language that best
- * matches the specified content.
- */
- public String identify(String content) {
- return identify(new StringBuilder(content));
- }
-
- /**
- * Identify language of a content.
- *
- * @param content is the content to analyze.
- * @return The 2 letter
- * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
- * language code</a> (en, fi, sv, ...) of the language that best
- * matches the specified content.
- */
- public String identify(StringBuilder content) {
-
- StringBuilder text = content;
- if ((analyzeLength > 0) && (content.length() > analyzeLength)) {
- text = new StringBuilder().append(content);
- text.setLength(analyzeLength);
+ /**
+ * Identify language of a content.
+ *
+ * @param content is the content to analyze.
+ * @return The 2 letter
+ * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * language code</a> (en, fi, sv, ...) of the language that best
+ * matches the specified content.
+ */
+ public String identify(String content) {
+ return identify(new StringBuilder(content));
}
- suspect.analyze(text);
- Iterator<NGramEntry> iter = suspect.getSorted().iterator();
- float topscore = Float.MIN_VALUE;
- String lang = "";
- HashMap<NGramProfile, Float> scores = new HashMap<NGramProfile, Float>();
- NGramEntry searched = null;
-
- while (iter.hasNext()) {
- searched = iter.next();
- NGramEntry[] ngrams = ngramsIdx.get(searched.getSeq());
- if (ngrams != null) {
- for (int j=0; j<ngrams.length; j++) {
- NGramProfile profile = ngrams[j].getProfile();
- Float pScore = scores.get(profile);
- if (pScore == null) {
- pScore = new Float(0);
- }
- float plScore = pScore.floatValue();
- plScore += ngrams[j].getFrequency() + searched.getFrequency();
- scores.put(profile, new Float(plScore));
- if (plScore > topscore) {
- topscore = plScore;
- lang = profile.getName();
+ /**
+ * Identify language of a content.
+ *
+ * @param content is the content to analyze.
+ * @return The 2 letter
+ * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * language code</a> (en, fi, sv, ...) of the language that best
+ * matches the specified content.
+ */
+ public String identify(StringBuilder content) {
+ StringBuilder text = content;
+ if ((analyzeLength > 0) && (content.length() > analyzeLength)) {
+ text = new StringBuilder().append(content);
+ text.setLength(analyzeLength);
+ }
+
+ suspect.analyze(text);
+ Iterator<NGramEntry> iter = suspect.getSorted().iterator();
+ float topscore = Float.MIN_VALUE;
+ String lang = "";
+ HashMap<NGramProfile, Float> scores = new HashMap<NGramProfile, Float>();
+ NGramEntry searched = null;
+
+ while (iter.hasNext()) {
+ searched = iter.next();
+ NGramEntry[] ngrams = ngramsIdx.get(searched.getSeq());
+ if (ngrams != null) {
+ for (int j=0; j<ngrams.length; j++) {
+ NGramProfile profile = ngrams[j].getProfile();
+ Float pScore = scores.get(profile);
+ if (pScore == null) {
+ pScore = new Float(0);
+ }
+ float plScore = pScore.floatValue();
+ plScore += ngrams[j].getFrequency() + searched.getFrequency();
+ scores.put(profile, new Float(plScore));
+ if (plScore > topscore) {
+ topscore = plScore;
+ lang = profile.getName();
+ }
}
}
}
+ return lang;
+ }
+
+ /**
+ * Identify language from input stream.
+ * This method uses the platform default encoding to read the input stream.
+ * For using a specific encoding, use the
+ * {@link #identify(InputStream, String)} method.
+ *
+ * @param is is the input stream to analyze.
+ * @return The 2 letter
+ * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * language code</a> (en, fi, sv, ...) of the language that best
+ * matches the content of the specified input stream.
+ * @throws IOException if something wrong occurs on the input stream.
+ */
+ public String identify(InputStream is) throws IOException {
+ return identify(is, null);
}
- return lang;
- }
- /**
- * Identify language from input stream.
- * This method uses the platform default encoding to read the input stream.
- * For using a specific encoding, use the
- * {@link #identify(InputStream, String)} method.
- *
- * @param is is the input stream to analyze.
- * @return The 2 letter
- * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
- * language code</a> (en, fi, sv, ...) of the language that best
- * matches the content of the specified input stream.
- * @throws IOException if something wrong occurs on the input stream.
- */
- public String identify(InputStream is) throws IOException {
- return identify(is, null);
- }
-
- /**
- * Identify language from input stream.
- *
- * @param is is the input stream to analyze.
- * @param charset is the charset to use to read the input stream.
- * @return The 2 letter
- * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
- * language code</a> (en, fi, sv, ...) of the language that best
- * matches the content of the specified input stream.
- * @throws IOException if something wrong occurs on the input stream.
- */
- public String identify(InputStream is, String charset) throws IOException {
-
- ByteArrayOutputStream out = new ByteArrayOutputStream();
- byte[] buffer = new byte[2048];
- int len = 0;
-
- while (((len = is.read(buffer)) != -1) &&
- ((analyzeLength == 0) || (out.size() < analyzeLength))) {
- if (analyzeLength != 0) {
- len = Math.min(len, analyzeLength - out.size());
- }
- out.write(buffer, 0, len);
+ /**
+ * Identify language from input stream.
+ *
+ * @param is is the input stream to analyze.
+ * @param charset is the charset to use to read the input stream.
+ * @return The 2 letter
+ * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * language code</a> (en, fi, sv, ...) of the language that best
+ * matches the content of the specified input stream.
+ * @throws IOException if something wrong occurs on the input stream.
+ */
+ public String identify(InputStream is, String charset) throws IOException {
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ byte[] buffer = new byte[2048];
+ int len = 0;
+
+ while (((len = is.read(buffer)) != -1) &&
+ ((analyzeLength == 0) || (out.size() < analyzeLength))) {
+ if (analyzeLength != 0) {
+ len = Math.min(len, analyzeLength - out.size());
+ }
+ out.write(buffer, 0, len);
+ }
+ return identify((charset == null) ? out.toString()
+ : out.toString(charset));
}
- return identify((charset == null) ? out.toString()
- : out.toString(charset));
- }
}
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java?rev=799534&r1=799533&r2=799534&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java Fri Jul 31 07:43:34 2009
@@ -16,22 +16,21 @@
*/
package org.apache.tika.language;
-// JDK imports
-import java.io.File;
-import java.io.InputStream;
-import java.io.IOException;
-import java.io.OutputStream;
+import java.io.BufferedInputStream;
import java.io.BufferedReader;
+import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
import java.io.InputStreamReader;
-import java.io.BufferedInputStream;
-import java.util.Date;
-import java.util.List;
-import java.util.Iterator;
+import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.Date;
import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
import java.util.Map;
/**
@@ -47,671 +46,651 @@
*/
public class NGramProfile {
- /** The minimum length allowed for a ngram. */
- final static int ABSOLUTE_MIN_NGRAM_LENGTH = 1;
+ /** The minimum length allowed for a ngram. */
+ final static int ABSOLUTE_MIN_NGRAM_LENGTH = 1;
- /** The maximum length allowed for a ngram. */
- final static int ABSOLUTE_MAX_NGRAM_LENGTH = 4;
-
- /** The default min length of ngram */
- final static int DEFAULT_MIN_NGRAM_LENGTH = 3;
-
- /** The default max length of ngram */
- final static int DEFAULT_MAX_NGRAM_LENGTH = 3;
-
- /** The ngram profile file extension */
- static final String FILE_EXTENSION = "ngp";
-
- /** The profile max size (number of ngrams of the same size) */
- static final int MAX_SIZE = 1000;
-
- /** separator char */
- static final char SEPARATOR = '_';
- /** The String form of the separator char */
- private final static String SEP_CHARSEQ = new String(new char[] { SEPARATOR });
-
-
- /** The profile's name */
- private String name = null;
-
- /** The NGrams of this profile sorted on the number of occurrences */
- private List<NGramEntry> sorted = null;
-
- /** The min length of ngram */
- private int minLength = DEFAULT_MIN_NGRAM_LENGTH;
-
- /** The max length of ngram */
- private int maxLength = DEFAULT_MAX_NGRAM_LENGTH;
-
- /** The total number of ngrams occurrences */
- private int[] ngramcounts = null;
-
- /** An index of the ngrams of the profile */
- private Map<CharSequence, NGramEntry> ngrams = null;
-
- /** A StringBuffer used during analysis */
- private QuickStringBuffer word = new QuickStringBuffer();
-
-
- /**
- * Construct a new ngram profile
- *
- * @param name is the name of the profile
- * @param minlen is the min length of ngram sequences
- * @param maxlen is the max length of ngram sequences
- */
- public NGramProfile(String name, int minlen, int maxlen) {
- // TODO: Compute the initial capacity using minlen and maxlen.
- this.ngrams = new HashMap<CharSequence, NGramEntry>(4000);
- this.minLength = minlen;
- this.maxLength = maxlen;
- this.name = name;
- }
-
- /**
- * @return Returns the name.
- */
- public String getName() {
- return name;
- }
-
- /**
- * Add ngrams from a single word to this profile
- *
- * @param word is the word to add
- */
- public void add(StringBuffer word) {
- for (int i=minLength; (i <= maxLength) && (i < word.length()); i++) {
- add(word, i);
- }
- }
-
- /**
- * Add the last NGrams from the specified word.
- */
- private void add(QuickStringBuffer word) {
- int wlen = word.length();
- if (wlen >= minLength) {
- int max = Math.min(maxLength, wlen);
- for (int i=minLength; i<=max; i++) {
- add(word.subSequence(wlen-i, wlen));
- }
- }
- }
-
- /**
- * Add ngrams from a single word in this profile
- *
- * @param word is the word to add
- * @param n is the ngram size
- */
- private void add(CharSequence cs) {
-
- if (cs.equals(SEP_CHARSEQ)) { return; }
- NGramEntry nge = ngrams.get(cs);
- if (nge == null) {
- nge = new NGramEntry(cs);
- ngrams.put(cs, nge);
- }
- nge.inc();
- }
-
- /**
- * Analyze a piece of text
- *
- * @param text the text to be analyzed
- */
- public void analyze(StringBuilder text) {
-
- if (ngrams != null) {
- ngrams.clear();
- sorted = null;
- ngramcounts = null;
- }
-
- word.clear().append(SEPARATOR);
- for (int i = 0; i < text.length(); i++) {
- char c = Character.toLowerCase(text.charAt(i));
-
- if (Character.isLetter(c)) {
- add(word.append(c));
- } else {
- //found word boundary
- if (word.length() > 1) {
- //we have a word!
- add(word.append(SEPARATOR));
- word.clear().append(SEPARATOR);
- }
- }
- }
-
- if (word.length() > 1) {
- //we have a word!
- add(word.append(SEPARATOR));
- }
- normalize();
- }
-
- /**
- * @param word
- * @param n sequence length
- */
- private void add(StringBuffer word, int n) {
- for (int i=0; i <= word.length()-n; i++) {
- add(word.subSequence(i, i + n));
- }
- }
-
- /**
- * Normalize the profile (calculates the ngrams frequencies)
- */
- protected void normalize() {
-
- NGramEntry e = null;
- //List sorted = getSorted();
- Iterator<NGramEntry> i = ngrams.values().iterator();
-
- // Calculate ngramcount if not already done
- if (ngramcounts == null) {
- ngramcounts = new int[maxLength+1];
- while (i.hasNext()) {
- e = i.next();
- ngramcounts[e.size()] += e.count;
- }
- }
-
- i = ngrams.values().iterator();
- while (i.hasNext()) {
- e = i.next();
- e.frequency = (float) e.count / (float) ngramcounts[e.size()];
- }
- }
-
- /**
- * Return a sorted list of ngrams (sort done by 1. frequency 2. sequence)
- *
- * @return sorted vector of ngrams
- */
- public List<NGramEntry> getSorted() {
- // make sure sorting is done only once
- if (sorted == null) {
- sorted = new ArrayList<NGramEntry>(ngrams.values());
- Collections.sort(sorted);
-
- // trim at NGRAM_LENGTH entries
- if (sorted.size() > MAX_SIZE) {
- sorted = sorted.subList(0, MAX_SIZE);
- }
- }
- return sorted;
- }
-
- // Inherited JavaDoc
- public String toString() {
-
- StringBuffer s = new StringBuffer().append("NGramProfile: ")
- .append(name).append("\n");
-
- Iterator<NGramEntry> i = getSorted().iterator();
-
- while (i.hasNext()) {
- NGramEntry entry = i.next();
- s.append("[").append(entry.seq)
- .append("/").append(entry.count)
- .append("/").append(entry.frequency).append("]\n");
- }
- return s.toString();
- }
-
- /**
- * Calculate a score how well NGramProfiles match each other
- *
- * @param another
- * ngram profile to compare against
- * @return similarity 0=exact match
- */
- public float getSimilarity(NGramProfile another) {
-
- float sum = 0;
-
- try {
- Iterator<NGramEntry> i = another.getSorted().iterator();
- while (i.hasNext()) {
- NGramEntry other = i.next();
- if (ngrams.containsKey(other.seq)) {
- sum += Math.abs((other.frequency -
- ngrams.get(other.seq).frequency)) / 2;
- } else {
- sum += other.frequency;
- }
- }
- i = getSorted().iterator();
- while (i.hasNext()) {
- NGramEntry other = i.next();
- if (another.ngrams.containsKey(other.seq)) {
- sum += Math.abs((other.frequency -
- another.ngrams.get(other.seq).frequency)) / 2;
- } else {
- sum += other.frequency;
- }
- }
- } catch (Exception e) {
- // if (LOG.isFatalEnabled()) { LOG.fatal(e.toString()); }
- }
- return sum;
- }
-
- /**
- * Loads a ngram profile from an InputStream
- * (assumes UTF-8 encoded content)
- * @param is the InputStream to read
- */
- public void load(InputStream is) throws IOException {
-
- ngrams.clear();
- ngramcounts = new int[maxLength+1];
- BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
- String line = null;
-
- while ((line = reader.readLine()) != null) {
-
- // # starts a comment line
- if (line.charAt(0) != '#') {
- int spacepos = line.indexOf(' ');
- String ngramsequence = line.substring(0, spacepos).trim();
- int len = ngramsequence.length();
- if ((len >= minLength) && (len <= maxLength)) {
- int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
- NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
- ngrams.put(en.getSeq(), en);
- ngramcounts[len] += ngramcount;
- }
- }
- }
- normalize();
- }
-
- /**
- * Create a new Language profile from (preferably quite large) text file
- *
- * @param name is the name of the profile
- * @param is is the stream to read
- * @param encoding is the encoding of stream
- */
- public static NGramProfile create(String name, InputStream is, String encoding) {
-
- NGramProfile newProfile = new NGramProfile(name, ABSOLUTE_MIN_NGRAM_LENGTH,
- ABSOLUTE_MAX_NGRAM_LENGTH);
- BufferedInputStream bis = new BufferedInputStream(is);
-
- byte buffer[] = new byte[4096];
- StringBuilder text = new StringBuilder();
- int len;
-
- try {
- while ((len = bis.read(buffer)) != -1) {
- text.append(new String(buffer, 0, len, encoding));
- }
- } catch (IOException e) {
- // e.printStackTrace(LogUtil.getWarnStream(LOG));
- }
-
- newProfile.analyze(text);
- return newProfile;
- }
-
- /**
- * Writes NGramProfile content into OutputStream, content is outputted with
- * UTF-8 encoding
- *
- * @param os the Stream to output to
- * @throws IOException
- */
- public void save(OutputStream os) throws IOException {
-
- // Write header
- os.write(("# NgramProfile generated at " + new Date() +
- " for Nutch Language Identification\n").getBytes());
-
- // And then each ngram
-
- // First dispatch ngrams in many lists depending on their size
- // (one list for each size, in order to store MAX_SIZE ngrams for each
- // size of ngram)
- List<NGramEntry> list = new ArrayList<NGramEntry>();
- List<NGramEntry> sublist = new ArrayList<NGramEntry>();
- NGramEntry[] entries = ngrams.values().toArray(new NGramEntry[ngrams.size()]);
- for (int i=minLength; i<=maxLength; i++) {
- for (int j=0; j<entries.length; j++) {
- if (entries[j].getSeq().length() == i) {
- sublist.add(entries[j]);
- }
- }
- Collections.sort(sublist);
- if (sublist.size() > MAX_SIZE) {
- sublist = sublist.subList(0, MAX_SIZE);
- }
- list.addAll(sublist);
- sublist.clear();
- }
- for (int i=0; i<list.size(); i++) {
- NGramEntry e = list.get(i);
- String line = e.toString() + " " + e.getCount() + "\n";
- os.write(line.getBytes("UTF-8"));
- }
- os.flush();
- }
-
- /**
- * main method used for testing only
- *
- * @param args
- */
- public static void main(String args[]) {
-
- String usage = "Usage: NGramProfile " +
- "[-create profilename filename encoding] " +
- "[-similarity file1 file2] "+
- "[-score profile-name filename encoding]";
- int command = 0;
-
- final int CREATE = 1;
- final int SIMILARITY = 2;
- final int SCORE = 3;
-
- String profilename = "";
- String filename = "";
- String filename2 = "";
- String encoding = "";
-
- if (args.length == 0) {
- System.err.println(usage);
- System.exit(-1);
- }
-
- for (int i = 0; i < args.length; i++) { // parse command line
- if (args[i].equals("-create")) { // found -create option
- command = CREATE;
- profilename = args[++i];
- filename = args[++i];
- encoding = args[++i];
- }
-
- if (args[i].equals("-similarity")) { // found -similarity option
- command = SIMILARITY;
- filename = args[++i];
- filename2 = args[++i];
- encoding = args[++i];
- }
-
- if (args[i].equals("-score")) { // found -Score option
- command = SCORE;
- profilename = args[++i];
- filename = args[++i];
- encoding = args[++i];
- }
- }
-
- try {
-
- switch (command) {
-
- case CREATE:
-
- File f = new File(filename);
- FileInputStream fis = new FileInputStream(f);
- NGramProfile newProfile = NGramProfile.create(profilename, fis, encoding);
- fis.close();
- f = new File(profilename + "." + FILE_EXTENSION);
- FileOutputStream fos = new FileOutputStream(f);
- newProfile.save(fos);
- System.out.println("new profile " + profilename + "." + FILE_EXTENSION + " was created.");
- break;
-
- case SIMILARITY:
-
- f = new File(filename);
- fis = new FileInputStream(f);
- newProfile = NGramProfile.create(filename, fis, encoding);
- newProfile.normalize();
-
- f = new File(filename2);
- fis = new FileInputStream(f);
- NGramProfile newProfile2 = NGramProfile.create(filename2, fis, encoding);
- newProfile2.normalize();
- System.out.println("Similarity is " + newProfile.getSimilarity(newProfile2));
- break;
-
- case SCORE:
- f = new File(filename);
- fis = new FileInputStream(f);
- newProfile = NGramProfile.create(filename, fis, encoding);
-
- f = new File(profilename + "." + FILE_EXTENSION);
- fis = new FileInputStream(f);
- NGramProfile compare = new NGramProfile(profilename,
- DEFAULT_MIN_NGRAM_LENGTH,
- DEFAULT_MAX_NGRAM_LENGTH);
- compare.load(fis);
- System.out.println("Score is " + compare.getSimilarity(newProfile));
- break;
-
- }
-
- } catch (Exception e) {
- }
- }
-
-
- /**
- * Inner class that describes a NGram
- */
- class NGramEntry implements Comparable<NGramEntry> {
-
- /** The NGRamProfile this NGram is related to */
- private NGramProfile profile = null;
-
- /** The sequence of characters of the ngram */
- CharSequence seq = null;
-
- /** The number of occurences of this ngram in its profile */
- private int count = 0;
-
- /** The frequency of this ngram in its profile */
- private float frequency = 0.0F;
-
-
- /**
- * Constructs a new NGramEntry
- * @param seq is the sequence of characters of the ngram
- */
- public NGramEntry(CharSequence seq) {
- this.seq = seq;
- }
-
- /**
- * Constructs a new NGramEntry
- * @param seq is the sequence of characters of the ngram
- * @param count is the number of occurences of this ngram
- */
- public NGramEntry(String seq, int count) {
- this.seq = new StringBuffer(seq).subSequence(0, seq.length());
- this.count = count;
- }
-
-
- /**
- * Returns the number of occurences of this ngram in its profile
- * @return the number of occurences of this ngram in its profile
- */
- public int getCount() {
- return count;
- }
-
- /**
- * Returns the frequency of this ngram in its profile
- * @return the frequency of this ngram in its profile
- */
- public float getFrequency() {
- return frequency;
- }
-
- /**
- * Returns the sequence of characters of this ngram
- * @return the sequence of characters of this ngram
- */
- public CharSequence getSeq() {
- return seq;
+ /** The maximum length allowed for a ngram. */
+ final static int ABSOLUTE_MAX_NGRAM_LENGTH = 4;
+
+ /** The default min length of ngram */
+ final static int DEFAULT_MIN_NGRAM_LENGTH = 3;
+
+ /** The default max length of ngram */
+ final static int DEFAULT_MAX_NGRAM_LENGTH = 3;
+
+ /** The ngram profile file extension */
+ static final String FILE_EXTENSION = "ngp";
+
+ /** The profile max size (number of ngrams of the same size) */
+ static final int MAX_SIZE = 1000;
+
+ /** separator char */
+ static final char SEPARATOR = '_';
+ /** The String form of the separator char */
+ private final static String SEP_CHARSEQ = new String(new char[] { SEPARATOR });
+
+
+ /** The profile's name */
+ private String name = null;
+
+ /** The NGrams of this profile sorted on the number of occurrences */
+ private List<NGramEntry> sorted = null;
+
+ /** The min length of ngram */
+ private int minLength = DEFAULT_MIN_NGRAM_LENGTH;
+
+ /** The max length of ngram */
+ private int maxLength = DEFAULT_MAX_NGRAM_LENGTH;
+
+ /** The total number of ngram occurrences */
+ private int[] ngramcounts = null;
+
+ /** An index of the ngrams of the profile */
+ private Map<CharSequence, NGramEntry> ngrams = null;
+
+ /** A StringBuffer used during analysis */
+ private QuickStringBuffer word = new QuickStringBuffer();
+
+
+ /**
+ * Construct a new ngram profile
+ *
+ * @param name is the name of the profile
+ * @param minlen is the min length of ngram sequences
+ * @param maxlen is the max length of ngram sequences
+ */
+ public NGramProfile(String name, int minlen, int maxlen) {
+ // TODO: Compute the initial capacity using minlen and maxlen.
+ this.ngrams = new HashMap<CharSequence, NGramEntry>(4000);
+ this.minLength = minlen;
+ this.maxLength = maxlen;
+ this.name = name;
}
/**
- * Returns the size of this ngram
- * @return the size of this ngram
+ * @return Returns the name.
*/
- public int size() {
- return seq.length();
+ public String getName() {
+ return name;
}
-
- // Inherited JavaDoc
- public int compareTo(NGramEntry ngram) {
- int diff = Float.compare(ngram.getFrequency(), frequency);
- if (diff != 0) {
- return diff;
- } else {
- return (toString().compareTo(ngram.toString()));
- }
+
+ /**
+ * Add ngrams from a single word to this profile
+ *
+ * @param word is the word to add
+ */
+ public void add(StringBuffer word) {
+ for (int i=minLength; (i <= maxLength) && (i < word.length()); i++) {
+ add(word, i);
+ }
}
/**
- * Increments the number of occurences of this ngram.
+ * Add the last NGrams from the specified word.
*/
- public void inc() {
- count++;
+ private void add(QuickStringBuffer word) {
+ int wlen = word.length();
+ if (wlen >= minLength) {
+ int max = Math.min(maxLength, wlen);
+ for (int i=minLength; i<=max; i++) {
+ add(word.subSequence(wlen-i, wlen));
+ }
+ }
}
/**
- * Associated a profile to this ngram
- * @param profile is the profile associated to this ngram
+ * Add a single ngram (character sequence) to this profile
+ *
+ * @param cs is the ngram character sequence to add
*/
- public void setProfile(NGramProfile profile) {
- this.profile = profile;
+ private void add(CharSequence cs) {
+ if (cs.equals(SEP_CHARSEQ)) { return; }
+ NGramEntry nge = ngrams.get(cs);
+ if (nge == null) {
+ nge = new NGramEntry(cs);
+ ngrams.put(cs, nge);
+ }
+ nge.inc();
}
/**
- * Returns the profile associated to this ngram
- * @return the profile associated to this ngram
+ * Analyze a piece of text
+ *
+ * @param text the text to be analyzed
*/
- public NGramProfile getProfile() {
- return profile;
+ public void analyze(StringBuilder text) {
+ if (ngrams != null) {
+ ngrams.clear();
+ sorted = null;
+ ngramcounts = null;
+ }
+
+ word.clear().append(SEPARATOR);
+ for (int i = 0; i < text.length(); i++) {
+ char c = Character.toLowerCase(text.charAt(i));
+
+ if (Character.isLetter(c)) {
+ add(word.append(c));
+ } else {
+ //found word boundary
+ if (word.length() > 1) {
+ //we have a word!
+ add(word.append(SEPARATOR));
+ word.clear().append(SEPARATOR);
+ }
+ }
+ }
+
+ if (word.length() > 1) {
+ //we have a word!
+ add(word.append(SEPARATOR));
+ }
+ normalize();
}
- // Inherited JavaDoc
- public String toString() {
- return seq.toString();
+ /**
+ * @param word is the word to extract ngram sequences from
+ * @param n is the length of the ngram sequences to extract
+ */
+ private void add(StringBuffer word, int n) {
+ for (int i=0; i <= word.length()-n; i++) {
+ add(word.subSequence(i, i + n));
+ }
}
- // Inherited JavaDoc
- public int hashCode() {
- return seq.hashCode();
+ /**
+ * Normalize the profile (calculates the ngrams frequencies)
+ */
+ protected void normalize() {
+ NGramEntry e = null;
+ //List sorted = getSorted();
+ Iterator<NGramEntry> i = ngrams.values().iterator();
+
+ // Calculate ngramcount if not already done
+ if (ngramcounts == null) {
+ ngramcounts = new int[maxLength+1];
+ while (i.hasNext()) {
+ e = i.next();
+ ngramcounts[e.size()] += e.count;
+ }
+ }
+
+ i = ngrams.values().iterator();
+ while (i.hasNext()) {
+ e = i.next();
+ e.frequency = (float) e.count / (float) ngramcounts[e.size()];
+ }
}
-
- // Inherited JavaDoc
- public boolean equals(Object obj) {
-
- NGramEntry ngram = null;
- try {
- ngram = (NGramEntry) obj;
- return ngram.seq.equals(seq);
- } catch (Exception e) {
- return false;
+
+ /**
+ * Return a sorted list of ngrams (sort done by 1. frequency 2. sequence)
+ *
+ * @return sorted vector of ngrams
+ */
+ public List<NGramEntry> getSorted() {
+ // make sure sorting is done only once
+ if (sorted == null) {
+ sorted = new ArrayList<NGramEntry>(ngrams.values());
+ Collections.sort(sorted);
+
+ // trim at NGRAM_LENGTH entries
+ if (sorted.size() > MAX_SIZE) {
+ sorted = sorted.subList(0, MAX_SIZE);
+ }
}
+ return sorted;
}
- }
+ // Inherited JavaDoc
+ public String toString() {
+ StringBuffer s = new StringBuffer().append("NGramProfile: ")
+ .append(name).append("\n");
-
- private class QuickStringBuffer implements CharSequence {
+ Iterator<NGramEntry> i = getSorted().iterator();
- private char value[];
+ while (i.hasNext()) {
+ NGramEntry entry = i.next();
+ s.append("[").append(entry.seq)
+ .append("/").append(entry.count)
+ .append("/").append(entry.frequency).append("]\n");
+ }
+ return s.toString();
+ }
- private int count;
+ /**
+ * Calculate a score how well NGramProfiles match each other
+ *
+ * @param another
+ * ngram profile to compare against
+ * @return similarity 0=exact match
+ */
+ public float getSimilarity(NGramProfile another) {
+ float sum = 0;
- QuickStringBuffer() {
- this(16);
+ try {
+ Iterator<NGramEntry> i = another.getSorted().iterator();
+ while (i.hasNext()) {
+ NGramEntry other = i.next();
+ if (ngrams.containsKey(other.seq)) {
+ sum += Math.abs((other.frequency -
+ ngrams.get(other.seq).frequency)) / 2;
+ } else {
+ sum += other.frequency;
+ }
+ }
+ i = getSorted().iterator();
+ while (i.hasNext()) {
+ NGramEntry other = i.next();
+ if (another.ngrams.containsKey(other.seq)) {
+ sum += Math.abs((other.frequency -
+ another.ngrams.get(other.seq).frequency)) / 2;
+ } else {
+ sum += other.frequency;
+ }
+ }
+ } catch (Exception e) {
+ // if (LOG.isFatalEnabled()) { LOG.fatal(e.toString()); }
+ }
+ return sum;
}
- QuickStringBuffer(char[] value) {
- this.value = value;
- count = value.length;
- }
-
- QuickStringBuffer(int length) {
- value = new char[length];
+ /**
+ * Loads a ngram profile from an InputStream
+ * (assumes UTF-8 encoded content)
+ * @param is the InputStream to read
+ */
+ public void load(InputStream is) throws IOException {
+ ngrams.clear();
+ ngramcounts = new int[maxLength+1];
+ BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+ String line = null;
+
+ while ((line = reader.readLine()) != null) {
+
+ // # starts a comment line
+ if (line.charAt(0) != '#') {
+ int spacepos = line.indexOf(' ');
+ String ngramsequence = line.substring(0, spacepos).trim();
+ int len = ngramsequence.length();
+ if ((len >= minLength) && (len <= maxLength)) {
+ int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
+ NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
+ ngrams.put(en.getSeq(), en);
+ ngramcounts[len] += ngramcount;
+ }
+ }
+ }
+ normalize();
}
- QuickStringBuffer(String str) {
- this(str.length() + 16);
- append(str);
- }
+ /**
+ * Create a new Language profile from (preferably quite large) text file
+ *
+ * @param name is the name of the profile
+ * @param is is the stream to read
+ * @param encoding is the encoding of stream
+ */
+ public static NGramProfile create(String name, InputStream is, String encoding) {
+ NGramProfile newProfile = new NGramProfile(name, ABSOLUTE_MIN_NGRAM_LENGTH,
+ ABSOLUTE_MAX_NGRAM_LENGTH);
+ BufferedInputStream bis = new BufferedInputStream(is);
+
+ byte buffer[] = new byte[4096];
+ StringBuilder text = new StringBuilder();
+ int len;
- public int length() {
- return count;
- }
+ try {
+ while ((len = bis.read(buffer)) != -1) {
+ text.append(new String(buffer, 0, len, encoding));
+ }
+ } catch (IOException e) {
+ // e.printStackTrace(LogUtil.getWarnStream(LOG));
+ }
- private void expandCapacity(int minimumCapacity) {
- int newCapacity = (value.length + 1) * 2;
- if (newCapacity < 0) {
- newCapacity = Integer.MAX_VALUE;
- } else if (minimumCapacity > newCapacity) {
- newCapacity = minimumCapacity;
- }
-
- char newValue[] = new char[newCapacity];
- System.arraycopy(value, 0, newValue, 0, count);
- value = newValue;
+ newProfile.analyze(text);
+ return newProfile;
}
- QuickStringBuffer clear() {
- count = 0;
- return this;
+ /**
+ * Writes NGramProfile content into OutputStream; content is written
+ * using UTF-8 encoding
+ *
+ * @param os the Stream to output to
+ * @throws IOException
+ */
+ public void save(OutputStream os) throws IOException {
+ // Write header
+ os.write(("# NgramProfile generated at " + new Date() +
+ " for Nutch Language Identification\n").getBytes());
+
+ // And then each ngram
+
+ // First dispatch ngrams in many lists depending on their size
+ // (one list for each size, in order to store MAX_SIZE ngrams for each
+ // size of ngram)
+ List<NGramEntry> list = new ArrayList<NGramEntry>();
+ List<NGramEntry> sublist = new ArrayList<NGramEntry>();
+ NGramEntry[] entries = ngrams.values().toArray(new NGramEntry[ngrams.size()]);
+ for (int i=minLength; i<=maxLength; i++) {
+ for (int j=0; j<entries.length; j++) {
+ if (entries[j].getSeq().length() == i) {
+ sublist.add(entries[j]);
+ }
+ }
+ Collections.sort(sublist);
+ if (sublist.size() > MAX_SIZE) {
+ sublist = sublist.subList(0, MAX_SIZE);
+ }
+ list.addAll(sublist);
+ sublist.clear();
+ }
+ for (int i=0; i<list.size(); i++) {
+ NGramEntry e = list.get(i);
+ String line = e.toString() + " " + e.getCount() + "\n";
+ os.write(line.getBytes("UTF-8"));
+ }
+ os.flush();
}
- public char charAt(int index) {
- return value[index];
- }
+ /**
+ * main method used for testing only
+ *
+ * @param args
+ */
+ public static void main(String args[]) {
+ String usage = "Usage: NGramProfile"
+ + " [-create profilename filename encoding]"
+ + " [-similarity file1 file2]"
+ + " [-score profile-name filename encoding]";
+ int command = 0;
+
+ final int CREATE = 1;
+ final int SIMILARITY = 2;
+ final int SCORE = 3;
+
+ String profilename = "";
+ String filename = "";
+ String filename2 = "";
+ String encoding = "";
+
+ if (args.length == 0) {
+ System.err.println(usage);
+ System.exit(-1);
+ }
- QuickStringBuffer append(String str) {
- if (str == null) {
- str = String.valueOf(str);
- }
+ for (int i = 0; i < args.length; i++) { // parse command line
+ if (args[i].equals("-create")) { // found -create option
+ command = CREATE;
+ profilename = args[++i];
+ filename = args[++i];
+ encoding = args[++i];
+ }
+
+ if (args[i].equals("-similarity")) { // found -similarity option
+ command = SIMILARITY;
+ filename = args[++i];
+ filename2 = args[++i];
+ encoding = args[++i];
+ }
+
+ if (args[i].equals("-score")) { // found -Score option
+ command = SCORE;
+ profilename = args[++i];
+ filename = args[++i];
+ encoding = args[++i];
+ }
+ }
- int len = str.length();
- int newcount = count + len;
- if (newcount > value.length) {
- expandCapacity(newcount);
- }
- str.getChars(0, len, value, count);
- count = newcount;
- return this;
+ try {
+ switch (command) {
+ case CREATE:
+ File f = new File(filename);
+ FileInputStream fis = new FileInputStream(f);
+ NGramProfile newProfile = NGramProfile.create(profilename, fis, encoding);
+ fis.close();
+ f = new File(profilename + "." + FILE_EXTENSION);
+ FileOutputStream fos = new FileOutputStream(f);
+ newProfile.save(fos);
+ System.out.println("new profile " + profilename + "." + FILE_EXTENSION + " was created.");
+ break;
+
+ case SIMILARITY:
+ f = new File(filename);
+ fis = new FileInputStream(f);
+ newProfile = NGramProfile.create(filename, fis, encoding);
+ newProfile.normalize();
+
+ f = new File(filename2);
+ fis = new FileInputStream(f);
+ NGramProfile newProfile2 = NGramProfile.create(filename2, fis, encoding);
+ newProfile2.normalize();
+ System.out.println("Similarity is " + newProfile.getSimilarity(newProfile2));
+ break;
+
+ case SCORE:
+ f = new File(filename);
+ fis = new FileInputStream(f);
+ newProfile = NGramProfile.create(filename, fis, encoding);
+
+ f = new File(profilename + "." + FILE_EXTENSION);
+ fis = new FileInputStream(f);
+ NGramProfile compare = new NGramProfile(profilename,
+ DEFAULT_MIN_NGRAM_LENGTH,
+ DEFAULT_MAX_NGRAM_LENGTH);
+ compare.load(fis);
+ System.out.println("Score is " + compare.getSimilarity(newProfile));
+ break;
+ }
+ } catch (Exception e) {
+ }
}
- QuickStringBuffer append(char c) {
- int newcount = count + 1;
- if (newcount > value.length) {
- expandCapacity(newcount);
- }
- value[count++] = c;
- return this;
- }
+ /**
+ * Inner class that describes a NGram
+ */
+ class NGramEntry implements Comparable<NGramEntry> {
+
+ /** The NGramProfile this NGram is related to */
+ private NGramProfile profile = null;
+
+ /** The sequence of characters of the ngram */
+ CharSequence seq = null;
+
+ /** The number of occurrences of this ngram in its profile */
+ private int count = 0;
+
+ /** The frequency of this ngram in its profile */
+ private float frequency = 0.0F;
+
+ /**
+ * Constructs a new NGramEntry
+ * @param seq is the sequence of characters of the ngram
+ */
+ public NGramEntry(CharSequence seq) {
+ this.seq = seq;
+ }
+
+ /**
+ * Constructs a new NGramEntry
+ * @param seq is the sequence of characters of the ngram
+ * @param count is the number of occurrences of this ngram
+ */
+ public NGramEntry(String seq, int count) {
+ this.seq = new StringBuffer(seq).subSequence(0, seq.length());
+ this.count = count;
+ }
+
+ /**
+ * Returns the number of occurrences of this ngram in its profile
+ * @return the number of occurrences of this ngram in its profile
+ */
+ public int getCount() {
+ return count;
+ }
+
+ /**
+ * Returns the frequency of this ngram in its profile
+ * @return the frequency of this ngram in its profile
+ */
+ public float getFrequency() {
+ return frequency;
+ }
+
+ /**
+ * Returns the sequence of characters of this ngram
+ * @return the sequence of characters of this ngram
+ */
+ public CharSequence getSeq() {
+ return seq;
+ }
+
+ /**
+ * Returns the size of this ngram
+ * @return the size of this ngram
+ */
+ public int size() {
+ return seq.length();
+ }
+
+ // Inherited JavaDoc
+ public int compareTo(NGramEntry ngram) {
+ int diff = Float.compare(ngram.getFrequency(), frequency);
+ if (diff != 0) {
+ return diff;
+ } else {
+ return (toString().compareTo(ngram.toString()));
+ }
+ }
+
+ /**
+ * Increments the number of occurrences of this ngram.
+ */
+ public void inc() {
+ count++;
+ }
+
+ /**
+ * Associates a profile with this ngram
+ * @param profile is the profile associated to this ngram
+ */
+ public void setProfile(NGramProfile profile) {
+ this.profile = profile;
+ }
+
+ /**
+ * Returns the profile associated to this ngram
+ * @return the profile associated to this ngram
+ */
+ public NGramProfile getProfile() {
+ return profile;
+ }
+
+ // Inherited JavaDoc
+ public String toString() {
+ return seq.toString();
+ }
+
+ // Inherited JavaDoc
+ public int hashCode() {
+ return seq.hashCode();
+ }
+
+ // Inherited JavaDoc
+ public boolean equals(Object obj) {
+
+ NGramEntry ngram = null;
+ try {
+ ngram = (NGramEntry) obj;
+ return ngram.seq.equals(seq);
+ } catch (Exception e) {
+ return false;
+ }
+ }
- public CharSequence subSequence(int start, int end) {
- return new String(value, start, end - start);
}
-
- public String toString() {
- return new String(this.value);
+
+ private class QuickStringBuffer implements CharSequence {
+
+ private char value[];
+
+ private int count;
+
+ QuickStringBuffer() {
+ this(16);
+ }
+
+ QuickStringBuffer(char[] value) {
+ this.value = value;
+ count = value.length;
+ }
+
+ QuickStringBuffer(int length) {
+ value = new char[length];
+ }
+
+ QuickStringBuffer(String str) {
+ this(str.length() + 16);
+ append(str);
+ }
+
+ public int length() {
+ return count;
+ }
+
+ private void expandCapacity(int minimumCapacity) {
+ int newCapacity = (value.length + 1) * 2;
+ if (newCapacity < 0) {
+ newCapacity = Integer.MAX_VALUE;
+ } else if (minimumCapacity > newCapacity) {
+ newCapacity = minimumCapacity;
+ }
+
+ char newValue[] = new char[newCapacity];
+ System.arraycopy(value, 0, newValue, 0, count);
+ value = newValue;
+ }
+
+ QuickStringBuffer clear() {
+ count = 0;
+ return this;
+ }
+
+ public char charAt(int index) {
+ return value[index];
+ }
+
+ QuickStringBuffer append(String str) {
+ if (str == null) {
+ str = String.valueOf(str);
+ }
+
+ int len = str.length();
+ int newcount = count + len;
+ if (newcount > value.length) {
+ expandCapacity(newcount);
+ }
+ str.getChars(0, len, value, count);
+ count = newcount;
+ return this;
+ }
+
+ QuickStringBuffer append(char c) {
+ int newcount = count + 1;
+ if (newcount > value.length) {
+ expandCapacity(newcount);
+ }
+ value[count++] = c;
+ return this;
+ }
+
+ public CharSequence subSequence(int start, int end) {
+ return new String(value, start, end - start);
+ }
+
+ public String toString() {
+ return new String(this.value);
+ }
}
- }
-
-
+
}
Modified: lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java?rev=799534&r1=799533&r2=799534&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java (original)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java Fri Jul 31 07:43:34 2009
@@ -16,22 +16,20 @@
*/
package org.apache.tika.language;
-// JDK imports
-import java.io.InputStream;
import java.io.BufferedReader;
-import java.io.InputStreamReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
import java.util.List;
-import org.apache.tika.language.NGramProfile.NGramEntry;
-
-// JUnit imports
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.textui.TestRunner;
+import org.apache.tika.language.NGramProfile.NGramEntry;
+
/**
* JUnit based test of class {@link LanguageIdentifier}.
*
@@ -39,8 +37,7 @@
* @author Jerome Charron - http://frutch.free.fr/
*/
public class TestLanguageIdentifier extends TestCase {
-
-
+
public TestLanguageIdentifier(String testName) {
super(testName);
}
@@ -48,144 +45,141 @@
public static Test suite() {
return new TestSuite(TestLanguageIdentifier.class);
}
-
+
public static void main(String[] args) {
TestRunner.run(suite());
}
- String tokencontent1 = "testaddtoken";
- String tokencontent2 = "anotherteststring";
+ String tokencontent1 = "testaddtoken";
+ String tokencontent2 = "anotherteststring";
+
+ int[] counts1 = { 3, 2, 2, 1, 1, 1, 1, 1 };
- int[] counts1 = { 3, 2, 2, 1, 1, 1, 1, 1 };
+ String[] chars1 = { "t", "d", "e", "a", "k", "n", "o", "s" };
- String[] chars1 = { "t", "d", "e", "a", "k", "n", "o", "s" };
+ /**
+ * Test analyze method
+ */
+ public void testAnalyze() {
+ String tokencontent = "testmeagain";
- /**
- * Test analyze method
- */
- public void testAnalyze() {
- String tokencontent = "testmeagain";
+ NGramProfile p = new NGramProfile("test", 1, 1);
+ p.analyze(new StringBuilder(tokencontent));
- NGramProfile p = new NGramProfile("test", 1, 1);
- p.analyze(new StringBuilder(tokencontent));
+ //test that profile size is ok, eg 8 different NGramEntries "tesmagin"
+ assertEquals(8, p.getSorted().size());
+ }
- //test that profile size is ok, eg 9 different NGramEntries "tesmagin"
- assertEquals(8, p.getSorted().size());
- }
+ /**
+ * Test addNGrams method with StringBuffer argument
+ *
+ */
+ public void testAddNGramsStringBuffer() {
+ String tokencontent = "testmeagain";
- /**
- * Test addNGrams method with StringBuffer argument
- *
- */
- public void testAddNGramsStringBuffer() {
- String tokencontent = "testmeagain";
+ NGramProfile p = new NGramProfile("test", 1, 1);
+ p.add(new StringBuffer(tokencontent));
- NGramProfile p = new NGramProfile("test", 1, 1);
- p.add(new StringBuffer(tokencontent));
+ //test that profile size is ok, eg 8 different NGramEntries "tesmagin"
+ assertEquals(8, p.getSorted().size());
- //test that profile size is ok, eg 8 different NGramEntries "tesmagin"
- assertEquals(8, p.getSorted().size());
+ }
- }
+ /**
+ * test getSorted method
+ */
+ public void testGetSorted() {
+ int[] count = { 4, 3, 1 };
+ String[] ngram = { "a", "b", "c" };
- /**
- * test getSorted method
- */
- public void testGetSorted() {
- int[] count = { 4, 3, 1 };
- String[] ngram = { "a", "b", "c" };
+ String teststring = "AAaaBbbC";
- String teststring = "AAaaBbbC";
+ NGramProfile p = new NGramProfile("test", 1, 1);
+ p.analyze(new StringBuilder(teststring));
- NGramProfile p = new NGramProfile("test", 1, 1);
- p.analyze(new StringBuilder(teststring));
+ //test size of profile
+ assertEquals(3, p.getSorted().size());
- //test size of profile
- assertEquals(3, p.getSorted().size());
+ testCounts(p.getSorted(), count);
+ testContents(p.getSorted(), ngram);
+ }
- testCounts(p.getSorted(), count);
- testContents(p.getSorted(), ngram);
+ public void testGetSimilarity() {
+ NGramProfile a = new NGramProfile("a", 1, 1);
+ NGramProfile b = new NGramProfile("b", 1, 1);
- }
+ a.analyze(new StringBuilder(tokencontent1));
+ b.analyze(new StringBuilder(tokencontent2));
- public void testGetSimilarity() {
- NGramProfile a = new NGramProfile("a", 1, 1);
- NGramProfile b = new NGramProfile("b", 1, 1);
-
- a.analyze(new StringBuilder(tokencontent1));
- b.analyze(new StringBuilder(tokencontent2));
+ //because of rounding errors might slightly return different results
+ assertEquals(a.getSimilarity(b), b.getSimilarity(a), 0.0000002);
+ }
- //because of rounding errors might slightly return different results
- assertEquals(a.getSimilarity(b), b.getSimilarity(a), 0.0000002);
+ public void testExactMatch() {
+ NGramProfile a = new NGramProfile("a", 1, 1);
- }
+ a.analyze(new StringBuilder(tokencontent1));
- public void testExactMatch() {
- NGramProfile a = new NGramProfile("a", 1, 1);
-
- a.analyze(new StringBuilder(tokencontent1));
+ assertEquals(a.getSimilarity(a), 0, 0);
+ }
- assertEquals(a.getSimilarity(a), 0, 0);
- }
+ public void testIO() {
+ //Create profile and set some contents
+ NGramProfile a = new NGramProfile("a", 1, 1);
+ a.analyze(new StringBuilder(this.tokencontent1));
-
- public void testIO() {
- //Create profile and set some contents
- NGramProfile a = new NGramProfile("a", 1, 1);
- a.analyze(new StringBuilder(this.tokencontent1));
+ NGramProfile b = new NGramProfile("a_from_inputstream", 1, 1);
- NGramProfile b = new NGramProfile("a_from_inputstream", 1, 1);
+ //save profile
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
- //save profile
- ByteArrayOutputStream os = new ByteArrayOutputStream();
+ try {
+ a.save(os);
+ os.close();
+ } catch (Exception e) {
+ fail();
+ }
- try {
- a.save(os);
- os.close();
- } catch (Exception e) {
- fail();
- }
+ //load profile
+ InputStream is = new ByteArrayInputStream(os.toByteArray());
+ try {
+ b.load(is);
+ is.close();
+ } catch (Exception e) {
+ fail();
+ }
- //load profile
- InputStream is = new ByteArrayInputStream(os.toByteArray());
- try {
- b.load(is);
- is.close();
- } catch (Exception e) {
- fail();
+ //check it
+ testCounts(b.getSorted(), counts1);
+ testContents(b.getSorted(), chars1);
}
- //check it
- testCounts(b.getSorted(), counts1);
- testContents(b.getSorted(), chars1);
- }
+ private void testContents(List<NGramEntry> entries, String contents[]) {
+ int c = 0;
- private void testContents(List<NGramEntry> entries, String contents[]) {
- int c = 0;
-
- for (NGramEntry nge : entries) {
- assertEquals(contents[c], nge.getSeq().toString());
- c++;
+ for (NGramEntry nge : entries) {
+ assertEquals(contents[c], nge.getSeq().toString());
+ c++;
+ }
}
- }
- private void testCounts(List<NGramEntry> entries, int counts[]) {
- int c = 0;
+ private void testCounts(List<NGramEntry> entries, int counts[]) {
+ int c = 0;
- for (NGramEntry nge : entries) {
- System.out.println(nge);
- assertEquals(counts[c], nge.getCount());
- c++;
+ for (NGramEntry nge : entries) {
+ System.out.println(nge);
+ assertEquals(counts[c], nge.getCount());
+ c++;
+ }
}
- }
public void testIdentify() {
try {
long total = 0;
LanguageIdentifier idfr = new LanguageIdentifier();
BufferedReader in = new BufferedReader(new InputStreamReader(
- this.getClass().getResourceAsStream("test-referencial.txt")));
+ this.getClass().getResourceAsStream("test-referencial.txt")));
String line = null;
while((line = in.readLine()) != null) {
String[] tokens = line.split(";");
@@ -198,7 +192,7 @@
// Then, each line of the file...
BufferedReader testFile = new BufferedReader(
new InputStreamReader(
- this.getClass().getResourceAsStream(tokens[0]), "UTF-8"));
+ this.getClass().getResourceAsStream(tokens[0]), "UTF-8"));
String testLine = null;
while((testLine = testFile.readLine()) != null) {
testLine = testLine.trim();
Modified: lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java?rev=799534&r1=799533&r2=799534&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java (original)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java Fri Jul 31 07:43:34 2009
@@ -21,121 +21,115 @@
import java.io.InputStream;
import java.util.List;
-import org.apache.tika.language.NGramProfile.NGramEntry;
-
import junit.framework.TestCase;
-public class TestNGramProfile extends TestCase {
-
- String tokencontent1 = "testaddtoken";
- String tokencontent2 = "anotherteststring";
-
- int[] counts1 = { 3, 2, 2, 1, 1, 1, 1, 1 };
-
- String[] chars1 = { "t", "d", "e", "a", "k", "n", "o", "s" };
-
-
- /**
- * Test analyze method
- */
- public void testAnalyze() {
- String tokencontent = "testmeagain";
-
- NGramProfile p = new NGramProfile("test", 1, 1);
- p.analyze(new StringBuilder(tokencontent));
-
- //test that profile size is ok, eg 8 different NGramEntries "tesmagin"
- assertEquals(8, p.getSorted().size());
- }
-
- /**
- * test getSorted method
- */
- public void testGetSorted() {
- int[] count = { 4, 3, 1 };
- String[] ngram = { "a", "b", "c" };
-
- String teststring = "AAaaBbbC";
+import org.apache.tika.language.NGramProfile.NGramEntry;
- NGramProfile p = new NGramProfile("test", 1, 1);
- p.analyze(new StringBuilder(teststring));
+public class TestNGramProfile extends TestCase {
- //test size of profile
- assertEquals(3, p.getSorted().size());
+ String tokencontent1 = "testaddtoken";
+ String tokencontent2 = "anotherteststring";
- testCounts(p.getSorted(), count);
- testContents(p.getSorted(), ngram);
+ int[] counts1 = { 3, 2, 2, 1, 1, 1, 1, 1 };
- }
+ String[] chars1 = { "t", "d", "e", "a", "k", "n", "o", "s" };
- public void testGetSimilarity() {
- NGramProfile a = new NGramProfile("a", 1, 1);
- NGramProfile b = new NGramProfile("b", 1, 1);
-
- a.analyze(new StringBuilder(tokencontent1));
- b.analyze(new StringBuilder(tokencontent2));
+ /**
+ * Test analyze method
+ */
+ public void testAnalyze() {
+ String tokencontent = "testmeagain";
- //because of rounding errors might slightly return different results
- assertEquals(a.getSimilarity(b), b.getSimilarity(a), 0.0000002);
+ NGramProfile p = new NGramProfile("test", 1, 1);
+ p.analyze(new StringBuilder(tokencontent));
- }
+ //test that profile size is ok, eg 8 different NGramEntries "tesmagin"
+ assertEquals(8, p.getSorted().size());
+ }
- public void testExactMatch() {
- NGramProfile a = new NGramProfile("a", 1, 1);
-
- a.analyze(new StringBuilder(tokencontent1));
+ /**
+ * test getSorted method
+ */
+ public void testGetSorted() {
+ int[] count = { 4, 3, 1 };
+ String[] ngram = { "a", "b", "c" };
+
+ String teststring = "AAaaBbbC";
- assertEquals(a.getSimilarity(a), 0, 0);
+ NGramProfile p = new NGramProfile("test", 1, 1);
+ p.analyze(new StringBuilder(teststring));
- }
+ //test size of profile
+ assertEquals(3, p.getSorted().size());
-
- public void testIO() {
- //Create profile and set some contents
- NGramProfile a = new NGramProfile("a", 1, 1);
- a.analyze(new StringBuilder(this.tokencontent1));
+ testCounts(p.getSorted(), count);
+ testContents(p.getSorted(), ngram);
+ }
- NGramProfile b = new NGramProfile("a_from_inputstream", 1, 1);
+ public void testGetSimilarity() {
+ NGramProfile a = new NGramProfile("a", 1, 1);
+ NGramProfile b = new NGramProfile("b", 1, 1);
- //save profile
- ByteArrayOutputStream os = new ByteArrayOutputStream();
+ a.analyze(new StringBuilder(tokencontent1));
+ b.analyze(new StringBuilder(tokencontent2));
- try {
- a.save(os);
- os.close();
- } catch (Exception e) {
- fail();
+ //because of rounding errors might slightly return different results
+ assertEquals(a.getSimilarity(b), b.getSimilarity(a), 0.0000002);
}
- //load profile
- InputStream is = new ByteArrayInputStream(os.toByteArray());
- try {
- b.load(is);
- is.close();
- } catch (Exception e) {
- fail();
+ public void testExactMatch() {
+ NGramProfile a = new NGramProfile("a", 1, 1);
+ a.analyze(new StringBuilder(tokencontent1));
+ assertEquals(a.getSimilarity(a), 0, 0);
}
- //check it
- testCounts(b.getSorted(), counts1);
- testContents(b.getSorted(), chars1);
- }
+ public void testIO() {
+ //Create profile and set some contents
+ NGramProfile a = new NGramProfile("a", 1, 1);
+ a.analyze(new StringBuilder(this.tokencontent1));
+
+ NGramProfile b = new NGramProfile("a_from_inputstream", 1, 1);
+
+ //save profile
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+
+ try {
+ a.save(os);
+ os.close();
+ } catch (Exception e) {
+ fail();
+ }
+
+ //load profile
+ InputStream is = new ByteArrayInputStream(os.toByteArray());
+ try {
+ b.load(is);
+ is.close();
+ } catch (Exception e) {
+ fail();
+ }
+
+ //check it
+ testCounts(b.getSorted(), counts1);
+ testContents(b.getSorted(), chars1);
+ }
- private void testContents(List<NGramEntry> entries, String contents[]) {
- int c = 0;
+ private void testContents(List<NGramEntry> entries, String contents[]) {
+ int c = 0;
- for (NGramEntry nge : entries) {
- assertEquals(contents[c], nge.getSeq().toString());
- c++;
+ for (NGramEntry nge : entries) {
+ assertEquals(contents[c], nge.getSeq().toString());
+ c++;
+ }
}
- }
- private void testCounts(List<NGramEntry> entries, int counts[]) {
- int c = 0;
+ private void testCounts(List<NGramEntry> entries, int counts[]) {
+ int c = 0;
- for (NGramEntry nge : entries) {
- assertEquals(counts[c], nge.getCount());
- c++;
+ for (NGramEntry nge : entries) {
+ assertEquals(counts[c], nge.getCount());
+ c++;
+ }
}
- }
+
}