You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by wk...@apache.org on 2011/07/12 15:18:47 UTC
svn commit: r1145588 - in /incubator/stanbol/trunk/enhancer/engines/langid:
./ src/main/java/org/apache/stanbol/enhancer/engines/langid/
src/main/resources/
src/test/java/org/apache/stanbol/enhancer/engines/langid/core/
Author: wkasper
Date: Tue Jul 12 13:18:46 2011
New Revision: 1145588
URL: http://svn.apache.org/viewvc?rev=1145588&view=rev
Log:
Stanbol-61: Use Apache Tika language identifier instead of TextCat
Added:
incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/LangIdTest.java
- copied, changed from r1141894, incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/TextCatTest.java
Removed:
incubator/stanbol/trunk/enhancer/engines/langid/src/main/resources/languageLabelsMap.txt
incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/TextCatTest.java
Modified:
incubator/stanbol/trunk/enhancer/engines/langid/README.md
incubator/stanbol/trunk/enhancer/engines/langid/pom.xml
incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java
Modified: incubator/stanbol/trunk/enhancer/engines/langid/README.md
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langid/README.md?rev=1145588&r1=1145587&r2=1145588&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langid/README.md (original)
+++ incubator/stanbol/trunk/enhancer/engines/langid/README.md Tue Jul 12 13:18:46 2011
@@ -4,7 +4,7 @@ The **LangId** engine determines the lan
## Technical Description
-The provided engine is based on the [TextCat library](http://textcat.sourceforge.net/).
+The provided engine is based on the language identifier of [Apache Tika](http://tika.apache.org/).
The text to be checked must be provided in plain text format in one of two forms:
* a plain text content item
@@ -28,6 +28,7 @@ By default the language identifier disti
* German: de
* English: en
+* Estonian: et
* French: fr
* Spanish: es
* Italian: it
@@ -36,13 +37,16 @@ By default the language identifier disti
* Dutch: nl
* Norwegian: no
* Finnish: fi
-* Albanian: sq
-* Slovak (ASCII): sk
-* Slovenian (ASCII): sl
+* Greek: el
* Danish: da
* Hungarian: hu
+* Icelandic: is
+* Lithuanian: lt
+* Portuguese: pt
+* Russian: ru
+* Thai: th
-Tools for creating new or additional language models are provided by the underlying TextCat system as documented at [http://textcat.sourceforge.net](http://textcat.sourceforge.net/).
+Additional language models can be created as Tika `LanguageProfile` instances (`org.apache.tika.language.LanguageProfile`).
## Configuration options
@@ -53,13 +57,6 @@ Tools for creating new or additional lan
text. Otherwise only a substring of the specified length taken from the
middle of the text will be used. The default value is 400 characters.
-* <pre><code>org.apache.stanbol.enhancer.engines.langid.model-configuration-file</pre></code>
-
- the name of a file that defines which statistical language models are
- used and the mappings from statistical model names to language labels
- that will appear as the value in the enhancement structure. By default
- the resource file *languageLabelsMap.txt* is used.
-
## Usage
Assuming that the Stanbol endpoint with the full launcher is running at
Modified: incubator/stanbol/trunk/enhancer/engines/langid/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langid/pom.xml?rev=1145588&r1=1145587&r2=1145588&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langid/pom.xml (original)
+++ incubator/stanbol/trunk/enhancer/engines/langid/pom.xml Tue Jul 12 13:18:46 2011
@@ -86,8 +86,8 @@
</dependency>
<dependency>
- <groupId>net.sourceforge</groupId>
- <artifactId>textcat</artifactId>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
</dependency>
<dependency>
@@ -114,19 +114,4 @@
</dependency>
</dependencies>
- <repositories>
- <repository>
- <releases>
- <enabled>true</enabled>
- <updatePolicy>never</updatePolicy>
- </releases>
- <snapshots>
- <enabled>false</enabled>
- </snapshots>
- <id>dfki-lt-repo</id>
- <name>DFKI LT Repository</name>
- <url>http://www.dfki.de/~kasper/IKS/maven</url>
- </repository>
- </repositories>
-
</project>
Modified: incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java?rev=1145588&r1=1145587&r2=1145588&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java Tue Jul 12 13:18:46 2011
@@ -20,12 +20,10 @@ import static org.apache.stanbol.enhance
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
import java.io.IOException;
-import java.io.InputStream;
import java.util.Collections;
import java.util.Dictionary;
import java.util.Iterator;
import java.util.Map;
-import java.util.Properties;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.Triple;
@@ -42,7 +40,7 @@ import org.apache.stanbol.enhancer.servi
import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
-import org.knallgrau.utils.textcat.TextCategorizer;
+import org.apache.tika.language.LanguageIdentifier;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -64,13 +62,6 @@ public class LangIdEnhancementEngine imp
@Property
public static final String PROBE_LENGTH_PROP = "org.apache.stanbol.enhancer.engines.langid.probe-length";
- /**
- * this allows to specify the path to a configuration file that specifies
- * the language models and how they map to language labels.
- */
- @Property
- public static final String MODEL_CONFIGURATION_FILE_PROP
- = "org.apache.stanbol.enhancer.engines.langid.model-configuration-file";
/**
* The default value for the Execution of this Engine. Currently set to
@@ -88,21 +79,12 @@ public class LangIdEnhancementEngine imp
*/
private static final Logger log = LoggerFactory.getLogger(LangIdEnhancementEngine.class);
- private static final String LANGUAGE_MAP_DEFAULT = "languageLabelsMap.txt";
-
- private Properties languageLabelsMap = new Properties();
-
- /**
- * This contains the language identifier.
- */
- private TextCategorizer languageIdentifier;
-
- private static final int PROBE_LENGTH_DEFAULT = 400;
+ private static final int PROBE_LENGTH_DEFAULT = 1000;
/**
* How much text should be used for testing: If the value is 0 or smaller,
* the complete text will be used. Otherwise a text probe of the given length
- * is taken from the middle of the text. The default length is 400 characters.
+ * is taken from the middle of the text. The default length is 1000.
*/
private int probeLength = PROBE_LENGTH_DEFAULT;
@@ -117,21 +99,8 @@ public class LangIdEnhancementEngine imp
Dictionary<String, String> properties = ce.getProperties();
String lengthVal = properties.get(PROBE_LENGTH_PROP);
probeLength = lengthVal == null ? PROBE_LENGTH_DEFAULT : Integer.parseInt(lengthVal);
- confFile = properties.get(MODEL_CONFIGURATION_FILE_PROP);
- }
- if (confFile != null) {
- languageIdentifier = new TextCategorizer(confFile);
- if (languageIdentifier == null) {
- throw new IOException("Could not initialize from configuration file: " + confFile);
- }
- } else {
- languageIdentifier = new TextCategorizer();
- InputStream in = getClass().getClassLoader().getResourceAsStream(LANGUAGE_MAP_DEFAULT);
- if (in != null) {
- languageLabelsMap.load(in);
- in.close();
- }
}
+ LanguageIdentifier.initProfiles();
}
/**
@@ -140,7 +109,7 @@ public class LangIdEnhancementEngine imp
* @param ce the {@link ComponentContext}
*/
protected void deactivate(@SuppressWarnings("unused") ComponentContext ce) {
- languageIdentifier = null;
+
}
public int canEnhance(ContentItem ci) throws EngineException {
@@ -182,8 +151,8 @@ public class LangIdEnhancementEngine imp
if (checkLength > 0 && text.length() > checkLength) {
text = text.substring(text.length() / 2 - checkLength / 2, text.length() / 2 + checkLength / 2);
}
- String language = languageIdentifier.categorize(text);
- language = languageLabelsMap.getProperty(language, language);
+ LanguageIdentifier languageIdentifier = new LanguageIdentifier(text);
+ String language = languageIdentifier.getLanguage();
log.info("language identified as " + language);
// add language to metadata
Copied: incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/LangIdTest.java (from r1141894, incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/TextCatTest.java)
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/LangIdTest.java?p2=incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/LangIdTest.java&p1=incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/TextCatTest.java&r1=1141894&r2=1145588&rev=1145588&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/TextCatTest.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/langid/src/test/java/org/apache/stanbol/enhancer/engines/langid/core/LangIdTest.java Tue Jul 12 13:18:46 2011
@@ -23,33 +23,29 @@ import java.util.Properties;
import org.apache.commons.io.IOUtils;
import org.junit.BeforeClass;
import org.junit.Test;
-import org.knallgrau.utils.textcat.TextCategorizer;
+import org.apache.tika.language.LanguageIdentifier;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
/**
- * {@link TextCatTest} is a test class for {@link TextCategorizer}.
+ * {@link LangIdTest} is a test class for {@link LanguageIdentifier}.
*
* @author Joerg Steffen, DFKI
* @version $Id$
*/
-public class TextCatTest {
+public class LangIdTest {
/**
* This contains the text categorizer to test.
*/
- private static TextCategorizer tc;
- private static Properties langMap = new Properties();
-
+
/**
* This initializes the text categorizer.
*/
@BeforeClass
public static void oneTimeSetUp() throws IOException {
- tc = new TextCategorizer();
- InputStream in = tc.getClass().getClassLoader().getResourceAsStream("languageLabelsMap.txt");
- langMap.load(in);
+ LanguageIdentifier.initProfiles();
}
/**
@@ -58,7 +54,7 @@ public class TextCatTest {
* @throws IOException if there is an error when reading the text
*/
@Test
- public void testTextCat() throws IOException {
+ public void testLangId() throws IOException {
String testFileName = "en.txt";
InputStream in = this.getClass().getClassLoader().getResourceAsStream(
@@ -66,8 +62,9 @@ public class TextCatTest {
assertNotNull("failed to load resource " + testFileName, in);
String text = IOUtils.toString(in);
- String language = tc.categorize(text);
- assertEquals("en", langMap.getProperty(language, language));
+ LanguageIdentifier tc = new LanguageIdentifier(text);
+ String language = tc.getLanguage();
+ assertEquals("en", language);
}
}