You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2010/11/01 06:19:37 UTC
svn commit: r1029556 - in /tika/trunk: ./
tika-core/src/main/java/org/apache/tika/language/
tika-core/src/main/resources/org/apache/tika/language/
tika-core/src/test/java/org/apache/tika/language/
Author: mattmann
Date: Mon Nov 1 05:19:37 2010
New Revision: 1029556
URL: http://svn.apache.org/viewvc?rev=1029556&view=rev
Log:
- fix for TIKA-490 Support for adding language profiles dynamically
Added:
tika/trunk/tika-core/src/main/resources/org/apache/tika/language/tika.language.properties
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1029556&r1=1029555&r2=1029556&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon Nov 1 05:19:37 2010
@@ -4,6 +4,9 @@ Release 0.8 - Current Development
The most notable changes in Tika 0.8 over previous releases are:
+ * Language identification is now dynamically configurable,
+ managed via a config file loaded from the classpath. (TIKA-490)
+
* Tika now supports parsing Feeds by wrapping the underlying
Rome library. (TIKA-466)
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java?rev=1029556&r1=1029555&r2=1029556&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java Mon Nov 1 05:19:37 2010
@@ -17,10 +17,13 @@
package org.apache.tika.language;
import java.io.BufferedReader;
+import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
/**
* Identifier of the language that best matches a given content profile.
@@ -30,26 +33,49 @@ import java.util.Map;
* @since Apache Tika 0.5
* @see <a href="http://www.iccs.inf.ed.ac.uk/~pkoehn/publications/europarl/">
* Europarl: A Parallel Corpus for Statistical Machine Translation</a>
- * @see <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">
+ * @see <a href="http://www.loc.gov/standards/iso639-2/php/code_list.php">
* ISO 639 Language Codes</a>
*/
public class LanguageIdentifier {
-
+
/**
* The available language profiles.
*/
private static final Map<String, LanguageProfile> PROFILES =
new HashMap<String, LanguageProfile>();
+ private static final String PROFILE_SUFFIX = ".ngp";
+ private static final String PROFILE_ENCODING = "UTF-8";
+
+ private static Properties props = new Properties();
+ private static String errors = "";
+
+ private static final String PROPERTIES_OVERRIDE_FILE = "tika.language.override.properties";
+ private static final String PROPERTIES_FILE = "tika.language.properties";
+ private static final String LANGUAGES_KEY = "languages";
+
+ private final String language;
- private static void addProfile(String language) {
+ private final double distance;
+
+ /*
+ * Always attempt initializing language profiles when class is loaded first time
+ */
+ static {
+ initProfiles();
+ }
+
+ /*
+ * Add one language profile based on config in property file
+ */
+ private static void addProfile(String language) throws Exception {
try {
LanguageProfile profile = new LanguageProfile();
InputStream stream =
- LanguageIdentifier.class.getResourceAsStream(language + ".ngp");
+ LanguageIdentifier.class.getResourceAsStream(language + PROFILE_SUFFIX);
try {
BufferedReader reader =
- new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+ new BufferedReader(new InputStreamReader(stream, PROFILE_ENCODING));
String line = reader.readLine();
while (line != null) {
if (line.length() > 0 && !line.startsWith("#")) {
@@ -64,37 +90,25 @@ public class LanguageIdentifier {
stream.close();
}
- PROFILES.put(language, profile);
+ addProfile(language, profile);
} catch (Throwable t) {
- // Failed to load this language profile. Log the problem?
+ throw new Exception("Failed trying to load language profile for language \""+language+"\". Error: "+t.getMessage());
}
}
-
- static {
- addProfile("da"); // Danish
- addProfile("de"); // German
- addProfile("et"); // Estonian
- addProfile("el"); // Greek
- addProfile("en"); // English
- addProfile("es"); // Spanish
- addProfile("fi"); // Finnish
- addProfile("fr"); // French
- addProfile("hu"); // Hungarian
- addProfile("is"); // Icelandic
- addProfile("it"); // Italian
- addProfile("nl"); // Dutch
- addProfile("no"); // Norwegian
- addProfile("pl"); // Polish
- addProfile("pt"); // Portuguese
- addProfile("ru"); // Russian
- addProfile("sv"); // Swedish
- addProfile("th"); // Thai
+
+ /**
+ * Adds a single language profile
+ * @param language an ISO 639 code representing language
+ * @param profile
+ */
+ public static void addProfile(String language, LanguageProfile profile) {
+ PROFILES.put(language, profile);
}
-
- private final String language;
-
- private final double distance;
-
+
+ /**
+ * Constructs a language identifier based on a LanguageProfile
+ * @param profile
+ */
public LanguageIdentifier(LanguageProfile profile) {
String minLanguage = "unknown";
double minDistance = 1.0;
@@ -110,18 +124,111 @@ public class LanguageIdentifier {
this.distance = minDistance;
}
+ /**
+ * Constructs a language identifier based on a String of text content
+ * @param content
+ */
public LanguageIdentifier(String content) {
this(new LanguageProfile(content));
}
+ /**
+ * Gets the identified language
+ * @return an ISO 639 code representing the detected language
+ */
public String getLanguage() {
return language;
}
+ /**
+ * Tries to judge whether the identification is certain enough
+ * to be trusted.
+ * WARNING: Will never return true for small amounts of input text.
+ * @return true if the computed distance is below 0.022
+ */
public boolean isReasonablyCertain() {
return distance < 0.022;
}
+ /**
+ * Builds the language profiles.
+ * The list of languages is fetched from a property file named "tika.language.properties".
+ * If a file called "tika.language.override.properties" is found on the classpath, it is used instead.
+ * The property file contains a key "languages" whose value is a comma-separated list of language codes.
+ * Problems (property file missing or unreadable, "languages" key missing, a profile
+ * that fails to load) are recorded and can be inspected with hasErrors() / getErrors().
+ */
+ public static void initProfiles() {
+ clearProfiles();
+
+ errors = "";
+ // Start from an empty configuration so a failed (re)load never reuses stale settings
+ props = new Properties();
+ InputStream stream;
+ stream = LanguageIdentifier.class.getResourceAsStream(PROPERTIES_OVERRIDE_FILE);
+ if(stream == null)
+ stream = LanguageIdentifier.class.getResourceAsStream(PROPERTIES_FILE);
+
+ if(stream != null){
+ try {
+ props.load(stream);
+ } catch (IOException e) {
+ errors += "IOException while trying to load property file. Message: " + e.getMessage() + "\n";
+ } finally {
+ try {
+ stream.close();
+ } catch (IOException ignored) {
+ // Nothing useful can be done if closing the classpath resource fails
+ }
+ }
+ } else {
+ errors += "Property file " + PROPERTIES_FILE + " not found on classpath.\n";
+ }
+
+ // getProperty returns null when the key is absent; record that instead of failing with a
+ // NullPointerException, which would abort class initialization (the static initializer calls this method)
+ String languageList = props.getProperty(LANGUAGES_KEY);
+ if(languageList == null) {
+ errors += "No \"" + LANGUAGES_KEY + "\" key found in property file.\n";
+ return;
+ }
+
+ for(String language : languageList.split(",")) {
+ language = language.trim();
+ if(language.length() == 0) {
+ continue; // tolerate stray commas such as "en,,de" or a trailing comma
+ }
+ String name = props.getProperty("name."+language, "Unknown");
+ try {
+ addProfile(language);
+ } catch (Exception e) {
+ errors += "Language " + language + " (" + name + ") not initialized. Message: " + e.getMessage() + "\n";
+ }
+ }
+ }
+
+ /**
+ * Initializes the language profiles from a user-supplied, already initialized Map.
+ * This overrides the default set of profiles initialized at startup,
+ * and provides an alternative to configuring profiles through property file
+ */
+ public static void initProfiles(Map<String, LanguageProfile> profilesMap) {
+ clearProfiles();
+ for(Map.Entry<String, LanguageProfile> entry : profilesMap.entrySet()) {
+ addProfile(entry.getKey(), entry.getValue());
+ }
+ }
+
+ /**
+ * Clears the current map of language profiles
+ */
+ public static void clearProfiles() {
+ PROFILES.clear();
+ }
+
+ /**
+ * Tests whether any errors were recorded while initializing the language
+ * configuration or the individual language profiles.
+ * @return true if there are errors. Use getErrors() to retrieve them.
+ */
+ public static boolean hasErrors() {
+ // Compare by content: "!=" on Strings only checks reference identity
+ return errors.length() > 0;
+ }
+
+ /**
+ * Returns a string of error messages related to initializing language profiles
+ * @return the accumulated error messages, or an empty string if there were none
+ */
+ public static String getErrors() {
+ return errors;
+ }
+
+ /**
+ * Returns what languages are supported for language identification
+ * @return A set of Strings being the ISO 639 language codes
+ */
+ public static Set<String> getSupportedLanguages() {
+ return PROFILES.keySet();
+ }
+
@Override
public String toString() {
return language + " (" + distance + ")";
Added: tika/trunk/tika-core/src/main/resources/org/apache/tika/language/tika.language.properties
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/tika.language.properties?rev=1029556&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/language/tika.language.properties (added)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/language/tika.language.properties Mon Nov 1 05:19:37 2010
@@ -0,0 +1,46 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This is a tika LanguageIdentifier properties file.
+# Its name is org/apache/tika/language/tika.language.properties
+# You can override it by placing a copy on the classpath in a file called
+# org/apache/tika/language/tika.language.override.properties
+
+# List of languages for which there are <language>.ngp profiles
+# If there exists an ISO 639-1 2-letter code it should be used
+# If not, you can choose an ISO 639-2 3-letter code
+# See http://www.loc.gov/standards/iso639-2/php/code_list.php
+languages=da,de,et,el,en,es,fi,fr,hu,is,it,nl,no,pl,pt,ru,sv,th
+
+# List of language names in English
+name.da=Danish
+name.de=German
+name.et=Estonian
+name.el=Greek
+name.en=English
+name.es=Spanish
+name.fi=Finnish
+name.fr=French
+name.hu=Hungarian
+name.is=Icelandic
+name.it=Italian
+name.nl=Dutch
+name.no=Norwegian
+name.pl=Polish
+name.pt=Portuguese
+name.ru=Russian
+name.sv=Swedish
+name.th=Thai
\ No newline at end of file
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java?rev=1029556&r1=1029555&r2=1029556&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java Mon Nov 1 05:19:37 2010
@@ -20,6 +20,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Writer;
+import java.util.HashMap;
import junit.framework.TestCase;
@@ -39,26 +40,69 @@ public class LanguageIdentifierTest exte
"da", "de", /* "et", "el", */ "en", "es", "fi", "fr", "it", "nl", "pt", "sv"
};
+ public void setUp() {
+ LanguageIdentifier.initProfiles();
+ }
+
public void testLanguageDetection() throws IOException {
for (String language : languages) {
ProfilingWriter writer = new ProfilingWriter();
writeTo(language, writer);
- LanguageIdentifier identifier =
- new LanguageIdentifier(writer.getProfile());
+ LanguageIdentifier identifier = null;
+ identifier = new LanguageIdentifier(writer.getProfile());
assertTrue(identifier.toString(), identifier.isReasonablyCertain());
assertEquals(language, identifier.getLanguage());
}
}
- public void testMixedLanguages() throws IOException {
+ public void testClearAddAndInitProfiles() throws IOException {
+ // Prepare english and german language profiles
+ ProfilingWriter enWriter = new ProfilingWriter();
+ writeTo("en", enWriter);
+ LanguageProfile enProfile = enWriter.getProfile();
+ ProfilingWriter deWriter = new ProfilingWriter();
+ writeTo("de", deWriter);
+ LanguageProfile deProfile = deWriter.getProfile();
+
+ // Out of the box profiles
+ LanguageIdentifier identifier = null;
+ identifier = new LanguageIdentifier(enProfile);
+ assertEquals("en", identifier.getLanguage());
+ assertTrue(identifier.isReasonablyCertain());
+
+ // No profiles
+ LanguageIdentifier.clearProfiles();
+ identifier = new LanguageIdentifier(enProfile);
+ assertFalse(identifier.isReasonablyCertain());
+
+ // Only English profile
+ LanguageIdentifier.addProfile("en", enProfile);
+ identifier = new LanguageIdentifier(enProfile);
+ assertEquals("en", identifier.getLanguage());
+ assertTrue(identifier.isReasonablyCertain());
+
+ // English and German profiles loaded explicitly from initProfiles method
+ HashMap<String, LanguageProfile> profilesMap = new HashMap<String, LanguageProfile>();
+ profilesMap.put("en", enProfile);
+ profilesMap.put("de", deProfile);
+ LanguageIdentifier.initProfiles(profilesMap);
+ identifier = new LanguageIdentifier(enProfile);
+ assertEquals("en", identifier.getLanguage());
+ assertTrue(identifier.isReasonablyCertain());
+ identifier = new LanguageIdentifier(deProfile);
+ assertEquals("de", identifier.getLanguage());
+ assertTrue(identifier.isReasonablyCertain());
+ }
+
+ public void testMixedLanguages() throws IOException {
for (String language : languages) {
for (String other : languages) {
if (!language.equals(other)) {
ProfilingWriter writer = new ProfilingWriter();
writeTo(language, writer);
writeTo(other, writer);
- LanguageIdentifier identifier =
- new LanguageIdentifier(writer.getProfile());
+ LanguageIdentifier identifier = null;
+ identifier = new LanguageIdentifier(writer.getProfile());
assertFalse(identifier.isReasonablyCertain());
}
}