You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/11/13 04:47:08 UTC
svn commit: r835724 - in
/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language:
LanguageIdentifier.java LanguageProfile.java ProfilingHandler.java
ProfilingWriter.java
Author: jukka
Date: Fri Nov 13 03:47:07 2009
New Revision: 835724
URL: http://svn.apache.org/viewvc?rev=835724&view=rev
Log:
TIKA-209: Language detection is weak.
Improved javadocs and some streamlining of the language identifier code.
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java?rev=835724&r1=835723&r2=835724&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java Fri Nov 13 03:47:07 2009
@@ -23,12 +23,13 @@
import java.util.Map;
/**
- * Identify the language of a content, based on statistical analysis.
- * Text document language identifier.
- * <p>
- * Language profiles are based on material from
- * <a href="http://www.isi.edu/~koehn/europarl/">http://www.isi.edu/~koehn/europarl/</a>.
+ * Identifier of the language that best matches a given content profile.
+ * The content profile is compared to generic language profiles based on
+ * material from various sources.
*
+ * @since Apache Tika 0.5
+ * @see <a href="http://www.iccs.inf.ed.ac.uk/~pkoehn/publications/europarl/">
+ * Europarl: A Parallel Corpus for Statistical Machine Translation</a>
* @see <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">
* ISO 639 Language Codes</a>
*/
@@ -110,7 +111,7 @@
}
public LanguageIdentifier(String content) {
- this(ProfilingWriter.profile(content));
+ this(new LanguageProfile(content));
}
public String getLanguage() {
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java?rev=835724&r1=835723&r2=835724&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java Fri Nov 13 03:47:07 2009
@@ -23,9 +23,15 @@
/**
* Language profile based on ngram counts.
+ *
+ * @since Apache Tika 0.5
*/
public class LanguageProfile {
+ public static final int DEFAULT_NGRAM_LENGTH = 3;
+
+ private final int length;
+
/**
* The ngrams that make up this profile.
*/
@@ -45,6 +51,26 @@
}
}
+ public LanguageProfile(int length) {
+ this.length = length;
+ }
+
+ public LanguageProfile() {
+ this(DEFAULT_NGRAM_LENGTH);
+ }
+
+ public LanguageProfile(String content, int length) {
+ this(length);
+
+ ProfilingWriter writer = new ProfilingWriter(this);
+ char[] ch = content.toCharArray();
+ writer.write(ch, 0, ch.length);
+ }
+
+ public LanguageProfile(String content) {
+ this(content, DEFAULT_NGRAM_LENGTH);
+ }
+
public long getCount() {
return count;
}
@@ -74,6 +100,12 @@
* @param count number of occurrences to add
*/
public void add(String ngram, long count) {
+ if (length != ngram.length()) {
+ throw new IllegalArgumentException(
+ "Unable to add an ngram of incorrect length: "
+ + ngram.length() + " != " + length);
+ }
+
Counter counter = ngrams.get(ngram);
if (counter == null) {
counter = new Counter();
@@ -91,6 +123,13 @@
* @return distance between the profiles
*/
public double distance(LanguageProfile that) {
+ if (length != that.length) {
+ throw new IllegalArgumentException(
+ "Unable to calculate distance of language profiles"
+ + " with different ngram lengths: "
+ + that.length + " != " + length);
+ }
+
double sumOfSquares = 0.0;
double thisCount = Math.max(this.count, 1.0);
double thatCount = Math.max(that.count, 1.0);
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java?rev=835724&r1=835723&r2=835724&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java Fri Nov 13 03:47:07 2009
@@ -16,51 +16,52 @@
*/
package org.apache.tika.language;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.BodyContentHandler;
-import org.xml.sax.SAXException;
+import org.apache.tika.sax.WriteOutContentHandler;
-public class ProfilingHandler extends BodyContentHandler {
-
- private static final long CHECK_INTERVAL = 1000;
-
- private final LanguageProfile profile;
-
- private final Metadata metadata;
+/**
+ * SAX content handler that builds a language profile based on all the
+ * received character content.
+ *
+ * @since Apache Tika 0.5
+ */
+public class ProfilingHandler extends WriteOutContentHandler {
- private long nextCheckCount = CHECK_INTERVAL;
+ private final ProfilingWriter writer;
- private ProfilingHandler(ProfilingWriter writer, Metadata metadata) {
+ public ProfilingHandler(ProfilingWriter writer) {
super(writer);
- this.profile = writer.getProfile();
- this.metadata = metadata;
+ this.writer = writer;
}
- public ProfilingHandler(Metadata metadata) {
- this(new ProfilingWriter(), metadata);
+ public ProfilingHandler(LanguageProfile profile) {
+ this(new ProfilingWriter(profile));
}
- private void checkAndSetLanguage() {
- LanguageIdentifier identifier = new LanguageIdentifier(profile);
- if (identifier.isReasonablyCertain()) {
- metadata.set(Metadata.LANGUAGE, identifier.getLanguage());
- }
+ public ProfilingHandler() {
+ this(new ProfilingWriter());
}
- @Override
- public void characters(char[] ch, int start, int length)
- throws SAXException {
- super.characters(ch, start, length);
- if (profile.getCount() > nextCheckCount) {
- checkAndSetLanguage();
- nextCheckCount = profile.getCount() + CHECK_INTERVAL;
- }
+ /**
+ * Returns the language profile being built by this content handler.
+ * Note that the returned profile gets updated whenever new SAX events
+ * are received by this content handler. Use the {@link #getLanguage()}
+ * method to get the language that best matches the current state of
+ * the profile.
+ *
+ * @return language profile
+ */
+ public LanguageProfile getProfile() {
+ return writer.getProfile();
}
- @Override
- public void endDocument() throws SAXException {
- super.endDocument();
- checkAndSetLanguage();
+ /**
+ * Returns the language that best matches the current state of the
+ * language profile.
+ *
+ * @return language that best matches the current profile
+ */
+ public LanguageIdentifier getLanguage() {
+ return writer.getLanguage();
}
}
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java?rev=835724&r1=835723&r2=835724&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java Fri Nov 13 03:47:07 2009
@@ -19,26 +19,49 @@
import java.io.IOException;
import java.io.Writer;
+/**
+ * Writer that builds a language profile based on all the written content.
+ *
+ * @since Apache Tika 0.5
+ */
public class ProfilingWriter extends Writer {
- public static LanguageProfile profile(String content) {
- ProfilingWriter writer = new ProfilingWriter();
- char[] ch = content.toCharArray();
- writer.write(ch, 0, ch.length);
- return writer.getProfile();
- }
-
-
- private final LanguageProfile profile = new LanguageProfile();
+ private final LanguageProfile profile;
private char[] buffer = new char[] { 0, 0, '_' };
private int n = 1;
+ public ProfilingWriter(LanguageProfile profile) {
+ this.profile = profile;
+ }
+
+ public ProfilingWriter() {
+ this(new LanguageProfile());
+ }
+
+ /**
+ * Returns the language profile being built by this writer. Note that
+ * the returned profile gets updated whenever new characters are written.
+ * Use the {@link #getLanguage()} method to get the language that best
+ * matches the current state of the profile.
+ *
+ * @return language profile
+ */
public LanguageProfile getProfile() {
return profile;
}
+ /**
+ * Returns the language that best matches the current state of the
+ * language profile.
+ *
+ * @return language that best matches the current profile
+ */
+ public LanguageIdentifier getLanguage() {
+ return new LanguageIdentifier(profile);
+ }
+
@Override
public void write(char[] cbuf, int off, int len) {
for (int i = 0; i < len; i++) {