You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by wk...@apache.org on 2011/07/12 15:18:47 UTC

svn commit: r1145588 - in /incubator/stanbol/trunk/enhancer/engines/langid: ./ src/main/java/org/apache/stanbol/enhancer/engines/langid/ src/main/resources/ src/test/java/org/apache/stanbol/enhancer/engines/langid/core/

Author: wkasper
Date: Tue Jul 12 13:18:46 2011
New Revision: 1145588

URL: http://svn.apache.org/viewvc?rev=1145588&view=rev
Log:
Stanbol-61: Use Apache Tika language identifier instead of TextCat

Added:
    incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/LangIdTest.java
      - copied, changed from r1141894, incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/TextCatTest.java
Removed:
    incubator/stanbol/trunk/enhancer/engines/langid/src/main/resources/languageLabelsMap.txt
    incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/TextCatTest.java
Modified:
    incubator/stanbol/trunk/enhancer/engines/langid/README.md
    incubator/stanbol/trunk/enhancer/engines/langid/pom.xml
    incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java

Modified: incubator/stanbol/trunk/enhancer/engines/langid/README.md
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langid/README.md?rev=1145588&r1=1145587&r2=1145588&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langid/README.md (original)
+++ incubator/stanbol/trunk/enhancer/engines/langid/README.md Tue Jul 12 13:18:46 2011
@@ -4,7 +4,7 @@ The **LangId** engine determines the lan
 
 ## Technical Description
 
-The provided engine is based on the [TextCat library](http://textcat.sourceforge.net/).
+The provided engine is based on the language identifier of [Apache Tika](http://tika.apache.org/).
 The text to be checked must be provided in plain text format in one of two forms:
 
 * a plain text content item
@@ -28,6 +28,7 @@ By default the language identifier disti
 
 * German: de
 * English: en
+* Estonian: et
 * French: fr
 * Spanish: es
 * Italian: it
@@ -36,13 +37,16 @@ By default the language identifier disti
 * Dutch: nl
 * Norwegian: no
 * Finnish: fi
-* Albanian: sq
-* Slovak (ASCII): sk
-* Slovenian (ASCII): sl
+* Greek: el
 * Danish: da
 * Hungarian: hu
+* Icelandic: is
+* Lithuanian: lt
+* Portuguese: pt
+* Russian: ru
+* Thai: th
 
-Tools for creating new or additional language models are provided by the underlying TextCat system as documented at [http://textcat.sourceforge.net](http://textcat.sourceforge.net/).
+Additional language models can be created as Tika language profiles (see the `org.apache.tika.language.LanguageProfile` class).
 
 ## Configuration options
 
@@ -53,13 +57,6 @@ Tools for creating new or additional lan
     text. Otherwise only a substring of the specified length taken from the
     middle of the text will be used. The default value is 400 characters.
 
-* <pre><code>org.apache.stanbol.enhancer.engines.langid.model-configuration-file</pre></code>
-
-    the name of a file that defines which statistical language models are
-    used and the mappings from statistical model names to language labels
-    that will appear as the value in the enhancement structure. By default
-    the resource file *languageLabelsMap.txt* is used.
-
 ## Usage
 
 Assuming that the Stanbol endpoint with the full launcher is running at

Modified: incubator/stanbol/trunk/enhancer/engines/langid/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langid/pom.xml?rev=1145588&r1=1145587&r2=1145588&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langid/pom.xml (original)
+++ incubator/stanbol/trunk/enhancer/engines/langid/pom.xml Tue Jul 12 13:18:46 2011
@@ -86,8 +86,8 @@
     </dependency>
 
     <dependency>
-      <groupId>net.sourceforge</groupId>
-      <artifactId>textcat</artifactId>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-core</artifactId>
     </dependency>
 
     <dependency>
@@ -114,19 +114,4 @@
     </dependency>
   </dependencies>
 
-  <repositories>
-    <repository>
-      <releases>
-        <enabled>true</enabled>
-        <updatePolicy>never</updatePolicy>
-      </releases>
-      <snapshots>
-        <enabled>false</enabled>
-      </snapshots>
-      <id>dfki-lt-repo</id>
-      <name>DFKI LT Repository</name>
-      <url>http://www.dfki.de/~kasper/IKS/maven</url>
-    </repository>
-  </repositories>
-
 </project>

Modified: incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java?rev=1145588&r1=1145587&r2=1145588&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java Tue Jul 12 13:18:46 2011
@@ -20,12 +20,10 @@ import static org.apache.stanbol.enhance
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
 
 import java.io.IOException;
-import java.io.InputStream;
 import java.util.Collections;
 import java.util.Dictionary;
 import java.util.Iterator;
 import java.util.Map;
-import java.util.Properties;
 
 import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.Triple;
@@ -42,7 +40,7 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
-import org.knallgrau.utils.textcat.TextCategorizer;
+import org.apache.tika.language.LanguageIdentifier;
 import org.osgi.service.component.ComponentContext;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -64,13 +62,6 @@ public class LangIdEnhancementEngine imp
     @Property
     public static final String PROBE_LENGTH_PROP = "org.apache.stanbol.enhancer.engines.langid.probe-length";
 
-    /**
-     * this allows to specify the path to a configuration file that specifies
-     * the language models and how they map to language labels.
-     */
-    @Property
-    public static final String MODEL_CONFIGURATION_FILE_PROP
-            = "org.apache.stanbol.enhancer.engines.langid.model-configuration-file";
 
     /**
      * The default value for the Execution of this Engine. Currently set to
@@ -88,21 +79,12 @@ public class LangIdEnhancementEngine imp
      */
     private static final Logger log = LoggerFactory.getLogger(LangIdEnhancementEngine.class);
 
-    private static final String LANGUAGE_MAP_DEFAULT = "languageLabelsMap.txt";
-
-    private Properties languageLabelsMap = new Properties();
-
-    /**
-     * This contains the language identifier.
-     */
-    private TextCategorizer languageIdentifier;
-
-    private static final int PROBE_LENGTH_DEFAULT = 400;
+    private static final int PROBE_LENGTH_DEFAULT = 1000;
 
     /**
      * How much text should be used for testing: If the value is 0 or smaller,
      * the complete text will be used. Otherwise a text probe of the given length
-     * is taken from the middle of the text. The default length is 400 characters.
+     * is taken from the middle of the text. The default length is 1000 characters.
      */
     private int probeLength = PROBE_LENGTH_DEFAULT;
 
@@ -117,21 +99,8 @@ public class LangIdEnhancementEngine imp
             Dictionary<String, String> properties = ce.getProperties();
             String lengthVal = properties.get(PROBE_LENGTH_PROP);
             probeLength = lengthVal == null ? PROBE_LENGTH_DEFAULT : Integer.parseInt(lengthVal);
-            confFile = properties.get(MODEL_CONFIGURATION_FILE_PROP);
-        }
-        if (confFile != null) {
-            languageIdentifier = new TextCategorizer(confFile);
-            if (languageIdentifier == null) {
-                throw new IOException("Could not initialize from configuration file: " + confFile);
-            }
-        } else {
-            languageIdentifier = new TextCategorizer();
-            InputStream in = getClass().getClassLoader().getResourceAsStream(LANGUAGE_MAP_DEFAULT);
-            if (in != null) {
-                languageLabelsMap.load(in);
-                in.close();
-            }
         }
+        LanguageIdentifier.initProfiles();
     }
 
     /**
@@ -140,7 +109,7 @@ public class LangIdEnhancementEngine imp
      * @param ce the {@link ComponentContext}
      */
     protected void deactivate(@SuppressWarnings("unused") ComponentContext ce) {
-        languageIdentifier = null;
+      
     }
 
     public int canEnhance(ContentItem ci) throws EngineException {
@@ -182,8 +151,8 @@ public class LangIdEnhancementEngine imp
         if (checkLength > 0 && text.length() > checkLength) {
             text = text.substring(text.length() / 2 - checkLength / 2, text.length() / 2 + checkLength / 2);
         }
-        String language = languageIdentifier.categorize(text);
-        language = languageLabelsMap.getProperty(language, language);
+        LanguageIdentifier languageIdentifier = new LanguageIdentifier(text);
+        String language = languageIdentifier.getLanguage();
         log.info("language identified as " + language);
 
         // add language to metadata

Copied: incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/LangIdTest.java (from r1141894, incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/TextCatTest.java)
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/LangIdTest.java?p2=incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/LangIdTest.java&p1=incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/TextCatTest.java&r1=1141894&r2=1145588&rev=1145588&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/TextCatTest.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/LangIdTest.java Tue Jul 12 13:18:46 2011
@@ -23,33 +23,29 @@ import java.util.Properties;
 import org.apache.commons.io.IOUtils;
 import org.junit.BeforeClass;
 import org.junit.Test;
-import org.knallgrau.utils.textcat.TextCategorizer;
+import org.apache.tika.language.LanguageIdentifier;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 
 /**
- * {@link TextCatTest} is a test class for {@link TextCategorizer}.
+ * {@link LangIdTest} is a test class for {@link LanguageIdentifier}.
  *
  * @author Joerg Steffen, DFKI
  * @version $Id$
  */
-public class TextCatTest {
+public class LangIdTest {
 
     /**
      * This contains the text categorizer to test.
      */
-    private static TextCategorizer tc;
-    private static Properties langMap = new Properties();
-
+  
     /**
      * This initializes the text categorizer.
      */
     @BeforeClass
     public static void oneTimeSetUp() throws IOException {
-        tc = new TextCategorizer();
-        InputStream in = tc.getClass().getClassLoader().getResourceAsStream("languageLabelsMap.txt");
-        langMap.load(in);
+        LanguageIdentifier.initProfiles();
     }
 
     /**
@@ -58,7 +54,7 @@ public class TextCatTest {
      * @throws IOException if there is an error when reading the text
      */
     @Test
-    public void testTextCat() throws IOException {
+    public void testLangId() throws IOException {
         String testFileName = "en.txt";
 
         InputStream in = this.getClass().getClassLoader().getResourceAsStream(
@@ -66,8 +62,9 @@ public class TextCatTest {
         assertNotNull("failed to load resource " + testFileName, in);
 
         String text = IOUtils.toString(in);
-        String language = tc.categorize(text);
-        assertEquals("en", langMap.getProperty(language, language));
+        LanguageIdentifier tc = new LanguageIdentifier(text);
+        String language = tc.getLanguage();
+        assertEquals("en", language);
     }
 
 }