You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2010/11/01 06:19:37 UTC

svn commit: r1029556 - in /tika/trunk: ./ tika-core/src/main/java/org/apache/tika/language/ tika-core/src/main/resources/org/apache/tika/language/ tika-core/src/test/java/org/apache/tika/language/

Author: mattmann
Date: Mon Nov  1 05:19:37 2010
New Revision: 1029556

URL: http://svn.apache.org/viewvc?rev=1029556&view=rev
Log:
- fix for TIKA-490 Support for adding language profiles dynamically

Added:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/language/tika.language.properties
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1029556&r1=1029555&r2=1029556&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon Nov  1 05:19:37 2010
@@ -4,6 +4,9 @@ Release 0.8 - Current Development
 
 The most notable changes in Tika 0.8 over previous releases are:
 
+ * Language identification is now dynamically configurable, 
+   managed via a config file loaded from the classpath. (TIKA-490)
+
  * Tika now supports parsing Feeds by wrapping the underlying
    Rome library. (TIKA-466)
 

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java?rev=1029556&r1=1029555&r2=1029556&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java Mon Nov  1 05:19:37 2010
@@ -17,10 +17,13 @@
 package org.apache.tika.language;
 
 import java.io.BufferedReader;
+import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
 
 /**
  * Identifier of the language that best matches a given content profile.
@@ -30,26 +33,49 @@ import java.util.Map;
  * @since Apache Tika 0.5
  * @see <a href="http://www.iccs.inf.ed.ac.uk/~pkoehn/publications/europarl/">
  *      Europarl: A Parallel Corpus for Statistical Machine Translation</a>
- * @see <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">
+ * @see <a href="http://www.loc.gov/standards/iso639-2/php/code_list.php">
  *      ISO 639 Language Codes</a>
  */
 public class LanguageIdentifier {
-
+    
     /**
      * The available language profiles.
      */
     private static final Map<String, LanguageProfile> PROFILES =
         new HashMap<String, LanguageProfile>();
+    private static final String PROFILE_SUFFIX = ".ngp";
+    private static final String PROFILE_ENCODING = "UTF-8";
+
+    private static Properties props = new Properties();
+    private static String errors = "";
+    
+    private static final String PROPERTIES_OVERRIDE_FILE = "tika.language.override.properties";
+    private static final String PROPERTIES_FILE = "tika.language.properties";
+    private static final String LANGUAGES_KEY = "languages";
+
+    private final String language;
 
-    private static void addProfile(String language) {
+    private final double distance;
+
+    /*
+     * Always attempt initializing language profiles when class is loaded first time
+     */
+    static {
+        initProfiles();
+    }
+    
+    /*
+     * Add one language profile based on config in property file
+     */
+    private static void addProfile(String language) throws Exception {
         try {
             LanguageProfile profile = new LanguageProfile();
 
             InputStream stream =
-                LanguageIdentifier.class.getResourceAsStream(language + ".ngp");
+                LanguageIdentifier.class.getResourceAsStream(language + PROFILE_SUFFIX);
             try {
                 BufferedReader reader =
-                    new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+                    new BufferedReader(new InputStreamReader(stream, PROFILE_ENCODING));
                 String line = reader.readLine();
                 while (line != null) {
                     if (line.length() > 0 && !line.startsWith("#")) {
@@ -64,37 +90,25 @@ public class LanguageIdentifier {
                 stream.close();
             }
 
-            PROFILES.put(language, profile);
+            addProfile(language, profile);
         } catch (Throwable t) {
-            // Failed to load this language profile. Log the problem?
+            throw new Exception("Failed trying to load language profile for language \""+language+"\". Error: "+t.getMessage());
         }
     }
-
-    static {
-        addProfile("da"); // Danish
-        addProfile("de"); // German
-        addProfile("et"); // Estonian
-        addProfile("el"); // Greek
-        addProfile("en"); // English
-        addProfile("es"); // Spanish
-        addProfile("fi"); // Finnish
-        addProfile("fr"); // French
-        addProfile("hu"); // Hungarian
-        addProfile("is"); // Icelandic
-        addProfile("it"); // Italian
-        addProfile("nl"); // Dutch
-        addProfile("no"); // Norwegian
-        addProfile("pl"); // Polish
-        addProfile("pt"); // Portuguese
-        addProfile("ru"); // Russian
-        addProfile("sv"); // Swedish
-        addProfile("th"); // Thai
+    
+    /**
+     * Adds a single language profile, replacing any profile already stored for that code
+     * @param language an ISO 639 code representing language
+     * @param profile the profile to store for that language
+     */
+    public static void addProfile(String language, LanguageProfile profile) {
+        PROFILES.put(language, profile);
     }
-
-    private final String language;
-
-    private final double distance;
-
+    
+    /**
+     * Constructs a language identifier based on a LanguageProfile
+     * @param profile
+     */
     public LanguageIdentifier(LanguageProfile profile) {
         String minLanguage = "unknown";
         double minDistance = 1.0;
@@ -110,18 +124,111 @@ public class LanguageIdentifier {
         this.distance = minDistance;
     }
 
+    /**
+     * Constructs a language identifier based on a String of text content
+     * @param content the text from which a LanguageProfile is built and identified
+     */
     public LanguageIdentifier(String content) {
         this(new LanguageProfile(content));
     }
 
+    /**
+     * Gets the identified language
+     * @return an ISO 639 code representing the detected language
+     */
     public String getLanguage() {
         return language;
     }
 
+    /**
+     * Tries to judge whether the identification is certain enough
+     * to be trusted.
+     * WARNING: Will never return true for small amounts of input text.
+     * @return true if the distance recorded for the identified language is below 0.022
+     */
     public boolean isReasonablyCertain() {
         return distance < 0.022;
     }
 
+    /**
+     * Builds the language profiles.
+     * The list of languages is fetched from a property file named "tika.language.properties"
+     * If a file called "tika.language.override.properties" is found on classpath, this is used instead
+     * The "languages" key holds comma-separated language codes; a missing file or key loads no profiles
+     */
+    public static void initProfiles() {
+        clearProfiles();
+        errors = "";
+        InputStream stream = LanguageIdentifier.class.getResourceAsStream(PROPERTIES_OVERRIDE_FILE);
+        if (stream == null) {
+            stream = LanguageIdentifier.class.getResourceAsStream(PROPERTIES_FILE);
+        }
+        if (stream != null) {
+            try {
+                props = new Properties();
+                props.load(stream);
+            } catch (IOException e) {
+                errors += "IOException while trying to load property file. Message: " + e.getMessage() + "\n";
+            } finally {
+                try { stream.close(); } catch (IOException ignored) { /* properties already read */ }
+            }
+        }
+        for (String language : props.getProperty(LANGUAGES_KEY, "").split(",")) {
+            language = language.trim();
+            if (language.length() > 0) {
+                String name = props.getProperty("name."+language, "Unknown");
+                try {
+                    addProfile(language);
+                } catch (Exception e) {
+                    errors += "Language " + language + " (" + name + ") not initialized. Message: " + e.getMessage() + "\n";
+                }
+            }
+        }
+    }
+
+    /**
+     * Replaces the current set of profiles with the entries of a user supplied,
+     * already initialized Map; an alternative to configuring profiles through property file
+     * @param profilesMap language codes mapped to the profiles to store for them
+     */
+    public static void initProfiles(Map<String, LanguageProfile> profilesMap) {
+        clearProfiles();
+        for(Map.Entry<String, LanguageProfile> entry : profilesMap.entrySet()) {
+            addProfile(entry.getKey(), entry.getValue());
+        }
+    }
+    
+    /**
+     * Clears the current map of language profiles
+     */
+    public static void clearProfiles() {
+        PROFILES.clear();
+    }
+    
+    /**
+     * Tests whether any error messages were recorded while initializing language config
+     * @return true if there are errors. Use getErrors() to retrieve.
+     */
+    public static boolean hasErrors() {
+        return errors.length() > 0;
+    }
+    
+    /**
+     * Returns a string of error messages related to initializing language profiles
+     * @return the recorded messages, each ending in a newline; an empty string if there are none
+     */
+    public static String getErrors() {
+        return errors;
+    }
+    
+    /**
+     * Returns what languages are supported for language identification
+     * @return A read-only set of Strings being the ISO 639 language codes
+     */
+    public static Set<String> getSupportedLanguages() {
+        return java.util.Collections.unmodifiableSet(PROFILES.keySet());
+    }
+
     @Override
     public String toString() {
         return language + " (" + distance + ")";

Added: tika/trunk/tika-core/src/main/resources/org/apache/tika/language/tika.language.properties
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/tika.language.properties?rev=1029556&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/language/tika.language.properties (added)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/language/tika.language.properties Mon Nov  1 05:19:37 2010
@@ -0,0 +1,46 @@
+#
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+# 
+#      http://www.apache.org/licenses/LICENSE-2.0
+# 
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# This is a tika LanguageIdentifier properties file.
+# Its name is org/apache/tika/language/tika.language.properties
+# You can override it by placing a copy on the classpath in a file called
+# org/apache/tika/language/tika.language.override.properties
+
+# List of languages for which there are <language>.ngp profiles
+# If there exists an ISO 639-1 2-letter code it should be used
+# If not, you can choose an ISO 639-2 3-letter code
+# See http://www.loc.gov/standards/iso639-2/php/code_list.php
+languages=da,de,et,el,en,es,fi,fr,hu,is,it,nl,no,pl,pt,ru,sv,th
+
+# List of language names in English
+name.da=Danish
+name.de=German
+name.et=Estonian
+name.el=Greek
+name.en=English
+name.es=Spanish
+name.fi=Finnish
+name.fr=French
+name.hu=Hungarian
+name.is=Icelandic
+name.it=Italian
+name.nl=Dutch
+name.no=Norwegian
+name.pl=Polish
+name.pt=Portuguese
+name.ru=Russian
+name.sv=Swedish
+name.th=Thai
\ No newline at end of file

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java?rev=1029556&r1=1029555&r2=1029556&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java Mon Nov  1 05:19:37 2010
@@ -20,6 +20,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Writer;
+import java.util.HashMap;
 
 import junit.framework.TestCase;
 
@@ -39,26 +40,69 @@ public class LanguageIdentifierTest exte
         "da", "de", /* "et", "el", */ "en", "es", "fi", "fr", "it", "nl", "pt", "sv"
     };
 
+    public void setUp() {
+        LanguageIdentifier.initProfiles();
+    }
+    
     public void testLanguageDetection() throws IOException {
         for (String language : languages) {
             ProfilingWriter writer = new ProfilingWriter();
             writeTo(language, writer);
-            LanguageIdentifier identifier =
-                new LanguageIdentifier(writer.getProfile());
+            LanguageIdentifier identifier = null;
+            identifier = new LanguageIdentifier(writer.getProfile());
             assertTrue(identifier.toString(), identifier.isReasonablyCertain());
             assertEquals(language, identifier.getLanguage());
         }
     }
 
-    public void testMixedLanguages() throws IOException {
+    public void testClearAddAndInitProfiles() throws IOException {
+        // Prepare english and german language profiles
+        ProfilingWriter enWriter = new ProfilingWriter();
+        writeTo("en", enWriter);
+        LanguageProfile enProfile = enWriter.getProfile();
+        ProfilingWriter deWriter = new ProfilingWriter();
+        writeTo("de", deWriter);
+        LanguageProfile deProfile = deWriter.getProfile();
+
+        // Out of the box profiles
+        LanguageIdentifier identifier = null;
+        identifier = new LanguageIdentifier(enProfile);
+        assertEquals("en", identifier.getLanguage());
+        assertTrue(identifier.isReasonablyCertain());
+
+        // No profiles
+        LanguageIdentifier.clearProfiles();
+        identifier = new LanguageIdentifier(enProfile);
+        assertFalse(identifier.isReasonablyCertain());
+
+        // Only English profile
+        LanguageIdentifier.addProfile("en", enProfile);
+        identifier = new LanguageIdentifier(enProfile);
+        assertEquals("en", identifier.getLanguage());
+        assertTrue(identifier.isReasonablyCertain());
+
+        // English and German profiles loaded explicitly from initProfiles method
+        HashMap<String, LanguageProfile> profilesMap = new HashMap<String, LanguageProfile>();
+        profilesMap.put("en", enProfile);
+        profilesMap.put("de", deProfile);
+        LanguageIdentifier.initProfiles(profilesMap);
+        identifier = new LanguageIdentifier(enProfile);
+        assertEquals("en", identifier.getLanguage());
+        assertTrue(identifier.isReasonablyCertain());
+        identifier = new LanguageIdentifier(deProfile);
+        assertEquals("de", identifier.getLanguage());
+        assertTrue(identifier.isReasonablyCertain());
+  }
+
+  public void testMixedLanguages() throws IOException {
         for (String language : languages) {
             for (String other : languages) {
                 if (!language.equals(other)) {
                     ProfilingWriter writer = new ProfilingWriter();
                     writeTo(language, writer);
                     writeTo(other, writer);
-                    LanguageIdentifier identifier =
-                        new LanguageIdentifier(writer.getProfile());
+                    LanguageIdentifier identifier = null;
+                    identifier = new LanguageIdentifier(writer.getProfile());
                     assertFalse(identifier.isReasonablyCertain());
                 }
             }