You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/08/22 15:48:16 UTC
svn commit: r1376046 - in
/incubator/stanbol/trunk/enhancer/engines/langdetect/src:
main/java/org/apache/stanbol/enhancer/engines/langdetect/
main/resources/OSGI-INF/metatype/
test/java/org/apache/stanbol/enhancer/engines/langdetect/
Author: rwesten
Date: Wed Aug 22 13:48:16 2012
New Revision: 1376046
URL: http://svn.apache.org/viewvc?rev=1376046&view=rev
Log:
implementation of STANBOL-718: One can also configure the maximum number of suggested languages (default=3). Deactivated the probe-size feature, as the documentation of the language detection framework states that it already uses randomly distributed sub-samples of the parsed text to reduce noise (e.g. from Named Entities)
Modified:
incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java
incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/resources/OSGI-INF/metatype/metatype.properties
incubator/stanbol/trunk/enhancer/engines/langdetect/src/test/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEngineTest.java
Modified: incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java?rev=1376046&r1=1376045&r2=1376046&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java Wed Aug 22 13:48:16 2012
@@ -77,9 +77,14 @@ public class LanguageDetectionEnhancemen
/**
* a configurable value of the text segment length to check
*/
- @Property
+ @Property(intValue=LanguageDetectionEnhancementEngine.PROBE_LENGTH_DEFAULT)
public static final String PROBE_LENGTH_PROP = "org.apache.stanbol.enhancer.engines.langdetect.probe-length";
+ /**
+ * a configurable value of the maximum number of suggested languages
+ */
+ @Property(intValue=LanguageDetectionEnhancementEngine.DEFAULT_MAX_SUGGESTED_LANGUAGES)
+ public static final String MAX_SUGGESTED_PROP = "org.apache.stanbol.enhancer.engines.langdetect.max-suggested";
/**
* The default value for the Execution of this Engine. Currently set to
@@ -105,7 +110,19 @@ public class LanguageDetectionEnhancemen
*/
private static final Logger log = LoggerFactory.getLogger(LanguageDetectionEnhancementEngine.class);
- private static final int PROBE_LENGTH_DEFAULT = 1000;
+ /*
+ * NOTE: Checked the Documentation: The tool already supports the taking
+ * of several shorter samples randomly distributed over the parsed text
+ * to improve results and reduce noise. See
+ * http://code.google.com/p/language-detection/wiki/FrequentlyAskedQuestion
+ * "Each detected language differs for the same document" for a hint.
+ */
+ private static final int PROBE_LENGTH_DEFAULT = -1;
+
+ /**
+ * Default value for the maximum number of suggested Languages
+ */
+ private static final int DEFAULT_MAX_SUGGESTED_LANGUAGES = 3;
/**
* How much text should be used for testing: If the value is 0 or smaller,
@@ -114,6 +131,8 @@ public class LanguageDetectionEnhancemen
*/
private int probeLength = PROBE_LENGTH_DEFAULT;
+ private int maxSuggestedLanguages = DEFAULT_MAX_SUGGESTED_LANGUAGES;
+
/**
* The literal factory
*/
@@ -134,8 +153,34 @@ public class LanguageDetectionEnhancemen
if (ce != null) {
@SuppressWarnings("unchecked")
Dictionary<String, String> properties = ce.getProperties();
- String lengthVal = properties.get(PROBE_LENGTH_PROP);
- probeLength = lengthVal == null ? PROBE_LENGTH_DEFAULT : Integer.parseInt(lengthVal);
+ Object value = properties.get(PROBE_LENGTH_PROP);
+ if(value instanceof Number){
+ probeLength = ((Number)value).intValue();
+ } else if(value != null){
+ try {
+ probeLength = Integer.parseInt(value.toString());
+ } catch (NumberFormatException e) {
+ throw new ConfigurationException(PROBE_LENGTH_PROP,
+ "The parsed 'proble length' MUST be a valid Integer", e);
+ }
+ } else {
+ probeLength = PROBE_LENGTH_DEFAULT;
+ }
+ value = properties.get(MAX_SUGGESTED_PROP);
+ if(value instanceof Number){
+ maxSuggestedLanguages = ((Number)value).intValue();
+ } else if(value != null){
+ try {
+ maxSuggestedLanguages = Integer.parseInt(value.toString());
+ } catch (NumberFormatException e) {
+ throw new ConfigurationException(MAX_SUGGESTED_PROP,
+ "The parsed number of the maximum suggested lanugages "
+ + "MUST BE a valid Integer", e);
+ }
+ }
+ if(maxSuggestedLanguages < 1){
+ maxSuggestedLanguages = DEFAULT_MAX_SUGGESTED_LANGUAGES;
+ }
}
languageIdentifier = new LanguageIdentifier();
}
@@ -143,6 +188,8 @@ public class LanguageDetectionEnhancemen
protected void deactivate(ComponentContext ce) {
super.deactivate(ce);
this.languageIdentifier = null;
+ this.maxSuggestedLanguages = -1;
+ this.probeLength = -1;
}
public int canEnhance(ContentItem ci) throws EngineException {
@@ -190,16 +237,20 @@ public class LanguageDetectionEnhancemen
}
// add language to metadata
- if (languages.size() > 0) {
+ if (languages != null) {
MGraph g = ci.getMetadata();
ci.getLock().writeLock().lock();
- // add best hypothesis
- Language oneLang = languages.get(0);
try {
- UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
- g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(oneLang.lang)));
- g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(oneLang.prob)));
- g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
+ for(int i=0;i<maxSuggestedLanguages && i<languages.size();i++){
+ // add a hypothesis
+ Language hypothesis = languages.get(i);
+ UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
+ g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(hypothesis.lang)));
+ g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(hypothesis.prob)));
+ g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
+ g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE,
+ literalFactory.createTypedLiteral(hypothesis.prob)));
+ }
} finally {
ci.getLock().writeLock().unlock();
}
Modified: incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1376046&r1=1376045&r2=1376046&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/resources/OSGI-INF/metatype/metatype.properties Wed Aug 22 13:48:16 2012
@@ -30,3 +30,13 @@ org.apache.stanbol.enhancer.engines.lang
Enhancer Engine: Language Identification
org.apache.stanbol.enhancer.engines.langdetect.LanguageDetectionEnhancementEngine.description=Detects \
the Language for parsed Text.
+
+org.apache.stanbol.enhancer.engines.langdetect.max-suggested.name=Max Suggested Languages
+org.apache.stanbol.enhancer.engines.langdetect.max-suggested.description=This \
+Engine supports the suggestion of multiple languages with confidence values. This \
+allows to configure how much languages are suggested at a maximum (default: 3).
+
+org.apache.stanbol.enhancer.engines.langdetect.probe-length.name=Probe Length
+org.apache.stanbol.enhancer.engines.langdetect.probe-length.description= The \
+maximum number of characters used for language detection. Note that the used \
+library already supports random selection of text parts (default: -1 (deactivated))
Modified: incubator/stanbol/trunk/enhancer/engines/langdetect/src/test/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEngineTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langdetect/src/test/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEngineTest.java?rev=1376046&r1=1376045&r2=1376046&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langdetect/src/test/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEngineTest.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/langdetect/src/test/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEngineTest.java Wed Aug 22 13:48:16 2012
@@ -21,6 +21,7 @@ import static org.apache.stanbol.enhance
import static org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper.validateAllTextAnnotations;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.io.InputStream;
@@ -40,6 +41,7 @@ import org.apache.stanbol.enhancer.servi
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
+import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.osgi.service.cm.ConfigurationException;
@@ -122,10 +124,9 @@ public class LanguageDetectionEngineTest
expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(
langIdEngine.getClass().getName()));
int textAnnotationCount = validateAllTextAnnotations(ci.getMetadata(), text, expectedValues);
- assertEquals("A single TextAnnotation is expected", 1,textAnnotationCount);
- //even through this tests do not validate service quality but rather
- //the correct integration of the CELI service as EnhancementEngine
- //we expect the "en" is detected for the parsed text
+ assertTrue("A TextAnnotation is expected", textAnnotationCount > 0);
+ //even though this test does not validate detection quality
+ //we expect that "en" is detected as the best guess for the parsed text
assertEquals("The detected language for text '"+text+"' MUST BE 'en'",
"en",EnhancementEngineHelper.getLanguage(ci));