You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/11/13 04:47:08 UTC

svn commit: r835724 - in /lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language: LanguageIdentifier.java LanguageProfile.java ProfilingHandler.java ProfilingWriter.java

Author: jukka
Date: Fri Nov 13 03:47:07 2009
New Revision: 835724

URL: http://svn.apache.org/viewvc?rev=835724&view=rev
Log:
TIKA-209: Language detection is weak.

Improved javadocs and some streamlining of the language identifier code.

Modified:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java?rev=835724&r1=835723&r2=835724&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java Fri Nov 13 03:47:07 2009
@@ -23,12 +23,13 @@
 import java.util.Map;
 
 /**
- * Identify the language of a content, based on statistical analysis.
- * Text document language identifier.
- * <p>
- * Language profiles are based on material from
- * <a href="http://www.isi.edu/~koehn/europarl/">http://www.isi.edu/~koehn/europarl/</a>.
+ * Identifier of the language that best matches a given content profile.
+ * The content profile is compared to generic language profiles based on
+ * material from various sources.
  *
+ * @since Apache Tika 0.5
+ * @see <a href="http://www.iccs.inf.ed.ac.uk/~pkoehn/publications/europarl/">
+ *      Europarl: A Parallel Corpus for Statistical Machine Translation</a>
  * @see <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">
  *      ISO 639 Language Codes</a>
  */
@@ -110,7 +111,7 @@
     }
 
     public LanguageIdentifier(String content) {
-        this(ProfilingWriter.profile(content));
+        this(new LanguageProfile(content));
     }
 
     public String getLanguage() {

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java?rev=835724&r1=835723&r2=835724&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java Fri Nov 13 03:47:07 2009
@@ -23,9 +23,15 @@
 
 /**
  * Language profile based on ngram counts.
+ *
+ * @since Apache Tika 0.5
  */
 public class LanguageProfile {
 
+    public static final int DEFAULT_NGRAM_LENGTH = 3;
+
+    private final int length;
+
     /**
      * The ngrams that make up this profile.
      */
@@ -45,6 +51,26 @@
         }
     }
 
+    public LanguageProfile(int length) {
+        this.length = length;
+    }
+
+    public LanguageProfile() {
+        this(DEFAULT_NGRAM_LENGTH);
+    }
+
+    public LanguageProfile(String content, int length) {
+        this(length);
+
+        ProfilingWriter writer = new ProfilingWriter(this);
+        char[] ch = content.toCharArray();
+        writer.write(ch, 0, ch.length);
+    }
+
+    public LanguageProfile(String content) {
+        this(content, DEFAULT_NGRAM_LENGTH);
+    }
+
     public long getCount() {
         return count;
     }
@@ -74,6 +100,12 @@
      * @param count number of occurrences to add
      */
     public void add(String ngram, long count) {
+        if (length != ngram.length()) {
+            throw new IllegalArgumentException(
+                    "Unable to add an ngram of incorrect length: "
+                    + ngram.length() + " != " + length);
+        }
+
         Counter counter = ngrams.get(ngram);
         if (counter == null) {
             counter = new Counter();
@@ -91,6 +123,13 @@
      * @return distance between the profiles
      */
     public double distance(LanguageProfile that) {
+        if (length != that.length) {
+            throw new IllegalArgumentException(
+                    "Unable to calculate distance of language profiles"
+                    + " with different ngram lengths: "
+                    + that.length + " != " + length);
+        }
+
         double sumOfSquares = 0.0;
         double thisCount = Math.max(this.count, 1.0);
         double thatCount = Math.max(that.count, 1.0);

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java?rev=835724&r1=835723&r2=835724&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java Fri Nov 13 03:47:07 2009
@@ -16,51 +16,52 @@
  */
 package org.apache.tika.language;
 
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.BodyContentHandler;
-import org.xml.sax.SAXException;
+import org.apache.tika.sax.WriteOutContentHandler;
 
-public class ProfilingHandler extends BodyContentHandler {
-
-    private static final long CHECK_INTERVAL = 1000;
-
-    private final LanguageProfile profile;
-
-    private final Metadata metadata;
+/**
+ * SAX content handler that builds a language profile based on all the
+ * received character content.
+ *
+ * @since Apache Tika 0.5
+ */
+public class ProfilingHandler extends WriteOutContentHandler {
 
-    private long nextCheckCount = CHECK_INTERVAL;
+    private final ProfilingWriter writer;
 
-    private ProfilingHandler(ProfilingWriter writer, Metadata metadata) {
+    public ProfilingHandler(ProfilingWriter writer) {
         super(writer);
-        this.profile = writer.getProfile();
-        this.metadata = metadata;
+        this.writer = writer;
     }
 
-    public ProfilingHandler(Metadata metadata) {
-        this(new ProfilingWriter(), metadata);
+    public ProfilingHandler(LanguageProfile profile) {
+        this(new ProfilingWriter(profile));
     }
 
-    private void checkAndSetLanguage() {
-        LanguageIdentifier identifier = new LanguageIdentifier(profile);
-        if (identifier.isReasonablyCertain()) {
-            metadata.set(Metadata.LANGUAGE, identifier.getLanguage());
-        }
+    public ProfilingHandler() {
+        this(new ProfilingWriter());
     }
 
-    @Override
-    public void characters(char[] ch, int start, int length)
-            throws SAXException {
-        super.characters(ch, start, length);
-        if (profile.getCount() > nextCheckCount) {
-            checkAndSetLanguage();
-            nextCheckCount = profile.getCount() + CHECK_INTERVAL;
-        }
+    /**
+     * Returns the language profile being built by this content handler.
+     * Note that the returned profile gets updated whenever new SAX events
+     * are received by this content handler. Use the {@link #getLanguage()}
+     * method to get the language that best matches the current state of
+     * the profile.
+     *
+     * @return language profile
+     */
+    public LanguageProfile getProfile() {
+        return writer.getProfile();
     }
 
-    @Override
-    public void endDocument() throws SAXException {
-        super.endDocument();
-        checkAndSetLanguage();
+    /**
+     * Returns the language that best matches the current state of the
+     * language profile.
+     *
+     * @return language that best matches the current profile
+     */
+    public LanguageIdentifier getLanguage() {
+        return writer.getLanguage();
     }
 
 }

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java?rev=835724&r1=835723&r2=835724&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java Fri Nov 13 03:47:07 2009
@@ -19,26 +19,49 @@
 import java.io.IOException;
 import java.io.Writer;
 
+/**
+ * Writer that builds a language profile based on all the written content.
+ *
+ * @since Apache Tika 0.5
+ */
 public class ProfilingWriter extends Writer {
 
-    public static LanguageProfile profile(String content) {
-        ProfilingWriter writer = new ProfilingWriter();
-        char[] ch = content.toCharArray();
-        writer.write(ch, 0, ch.length);
-        return writer.getProfile();
-    }
-
-
-    private final LanguageProfile profile = new LanguageProfile();
+    private final LanguageProfile profile;
 
     private char[] buffer = new char[] { 0, 0, '_' };
 
     private int n = 1;
 
+    public ProfilingWriter(LanguageProfile profile) {
+        this.profile = profile;
+    }
+
+    public ProfilingWriter() {
+        this(new LanguageProfile());
+    }
+
+    /**
+     * Returns the language profile being built by this writer. Note that
+     * the returned profile gets updated whenever new characters are written.
+     * Use the {@link #getLanguage()} method to get the language that best
+     * matches the current state of the profile.
+     *
+     * @return language profile
+     */
     public LanguageProfile getProfile() {
         return profile;
     }
 
+    /**
+     * Returns the language that best matches the current state of the
+     * language profile.
+     *
+     * @return language that best matches the current profile
+     */
+    public LanguageIdentifier getLanguage() {
+        return new LanguageIdentifier(profile);
+    }
+
     @Override
     public void write(char[] cbuf, int off, int len) {
         for (int i = 0; i < len; i++) {