You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/08/22 15:48:16 UTC

svn commit: r1376046 - in /incubator/stanbol/trunk/enhancer/engines/langdetect/src: main/java/org/apache/stanbol/enhancer/engines/langdetect/ main/resources/OSGI-INF/metatype/ test/java/org/apache/stanbol/enhancer/engines/langdetect/

Author: rwesten
Date: Wed Aug 22 13:48:16 2012
New Revision: 1376046

URL: http://svn.apache.org/viewvc?rev=1376046&view=rev
Log:
implementation of STANBOL-718: One can also configure the maximum number of suggested languages (default=3). Deactivated the probe-size feature, as the documentation of the language detection framework states that it already uses randomly distributed sub-samples of the parsed text to reduce noise (e.g. from Named Entities)

Modified:
    incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java
    incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/resources/OSGI-INF/metatype/metatype.properties
    incubator/stanbol/trunk/enhancer/engines/langdetect/src/test/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEngineTest.java

Modified: incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java?rev=1376046&r1=1376045&r2=1376046&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java Wed Aug 22 13:48:16 2012
@@ -77,9 +77,14 @@ public class LanguageDetectionEnhancemen
     /**
      * a configurable value of the text segment length to check
      */
-    @Property
+    @Property(intValue=LanguageDetectionEnhancementEngine.PROBE_LENGTH_DEFAULT)
     public static final String PROBE_LENGTH_PROP = "org.apache.stanbol.enhancer.engines.langdetect.probe-length";
 
+    /**
+     * a configurable value of the maximum number of suggested languages
+     */
+    @Property(intValue=LanguageDetectionEnhancementEngine.DEFAULT_MAX_SUGGESTED_LANGUAGES)
+    public static final String MAX_SUGGESTED_PROP = "org.apache.stanbol.enhancer.engines.langdetect.max-suggested";
 
     /**
      * The default value for the Execution of this Engine. Currently set to
@@ -105,7 +110,19 @@ public class LanguageDetectionEnhancemen
      */
     private static final Logger log = LoggerFactory.getLogger(LanguageDetectionEnhancementEngine.class);
 
-    private static final int PROBE_LENGTH_DEFAULT = 1000;
+    /*
+     * NOTE: Checked the Documentation: The tool already supports the taking
+     * of several shorter samples randomly distributed over the parsed text
+     * to improve results and reduce noise. See
+     * http://code.google.com/p/language-detection/wiki/FrequentlyAskedQuestion
+     * "Each detected language differs for the same document" for a hint. 
+     */
+    private static final int PROBE_LENGTH_DEFAULT = -1;
+
+    /**
+     * Default value for the maximum number of suggested Languages
+     */
+    private static final int DEFAULT_MAX_SUGGESTED_LANGUAGES = 3;
 
     /**
      * How much text should be used for testing: If the value is 0 or smaller,
@@ -114,6 +131,8 @@ public class LanguageDetectionEnhancemen
      */
     private int probeLength = PROBE_LENGTH_DEFAULT;
     
+    private int maxSuggestedLanguages = DEFAULT_MAX_SUGGESTED_LANGUAGES;
+    
     /**
      * The literal factory
      */
@@ -134,8 +153,34 @@ public class LanguageDetectionEnhancemen
         if (ce != null) {
             @SuppressWarnings("unchecked")
             Dictionary<String, String> properties = ce.getProperties();
-            String lengthVal = properties.get(PROBE_LENGTH_PROP);
-            probeLength = lengthVal == null ? PROBE_LENGTH_DEFAULT : Integer.parseInt(lengthVal);
+            Object value = properties.get(PROBE_LENGTH_PROP);
+            if(value instanceof Number){
+                probeLength = ((Number)value).intValue();
+            } else if(value != null){
+                try {
+                    probeLength = Integer.parseInt(value.toString());
+                } catch (NumberFormatException e) {
+                    throw new ConfigurationException(PROBE_LENGTH_PROP, 
+                        "The parsed 'proble length' MUST be a valid Integer", e);
+                }
+            } else {
+                probeLength = PROBE_LENGTH_DEFAULT;
+            }
+            value = properties.get(MAX_SUGGESTED_PROP);
+            if(value instanceof Number){
+                maxSuggestedLanguages = ((Number)value).intValue();
+            } else if(value != null){
+                try {
+                    maxSuggestedLanguages = Integer.parseInt(value.toString());
+                } catch (NumberFormatException e) {
+                    throw new ConfigurationException(MAX_SUGGESTED_PROP, 
+                        "The parsed number of the maximum suggested lanugages "
+                        + "MUST BE a valid Integer", e);
+                }
+            }
+            if(maxSuggestedLanguages < 1){
+                maxSuggestedLanguages = DEFAULT_MAX_SUGGESTED_LANGUAGES;
+            }
         }
         languageIdentifier = new LanguageIdentifier();
     }
@@ -143,6 +188,8 @@ public class LanguageDetectionEnhancemen
     protected void deactivate(ComponentContext ce) {
         super.deactivate(ce);
         this.languageIdentifier = null;
+        this.maxSuggestedLanguages = -1;
+        this.probeLength = -1;
     }
 
     public int canEnhance(ContentItem ci) throws EngineException {
@@ -190,16 +237,20 @@ public class LanguageDetectionEnhancemen
         }
         
         // add language to metadata
-        if (languages.size() > 0) {
+        if (languages != null) {
             MGraph g = ci.getMetadata();
             ci.getLock().writeLock().lock();
-            // add best hypothesis
-            Language oneLang = languages.get(0);
             try {
-                UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
-                g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(oneLang.lang)));
-                g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(oneLang.prob)));
-                g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
+                for(int i=0;i<maxSuggestedLanguages && i<languages.size();i++){
+                    // add a hypothesis
+                    Language hypothesis = languages.get(i);
+                    UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
+                    g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(hypothesis.lang)));
+                    g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(hypothesis.prob)));
+                    g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
+                    g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, 
+                        literalFactory.createTypedLiteral(hypothesis.prob)));
+                }
             } finally {
                 ci.getLock().writeLock().unlock();
             }

Modified: incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1376046&r1=1376045&r2=1376046&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/resources/OSGI-INF/metatype/metatype.properties Wed Aug 22 13:48:16 2012
@@ -30,3 +30,13 @@ org.apache.stanbol.enhancer.engines.lang
 Enhancer Engine: Language Identification
 org.apache.stanbol.enhancer.engines.langdetect.LanguageDetectionEnhancementEngine.description=Detects \
 the Language for parsed Text.
+
+org.apache.stanbol.enhancer.engines.langdetect.max-suggested.name=Max Suggested Languages
+org.apache.stanbol.enhancer.engines.langdetect.max-suggested.description=This \
+Engine supports the suggestion of multiple languages with confidence values. This \
+allows to configure how much languages are suggested at a maximum (default: 3).
+
+org.apache.stanbol.enhancer.engines.langdetect.probe-length.name=Probe Length
+org.apache.stanbol.enhancer.engines.langdetect.probe-length.description= The \
+maximum number of characters used for language detection. Note that the used \
+library already supports random selection of text parts (default: -1 (deactivated))

Modified: incubator/stanbol/trunk/enhancer/engines/langdetect/src/test/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEngineTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langdetect/src/test/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEngineTest.java?rev=1376046&r1=1376045&r2=1376046&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langdetect/src/test/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEngineTest.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/langdetect/src/test/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEngineTest.java Wed Aug 22 13:48:16 2012
@@ -21,6 +21,7 @@ import static org.apache.stanbol.enhance
 import static org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper.validateAllTextAnnotations;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
 import java.io.InputStream;
@@ -40,6 +41,7 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
 import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
 import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
+import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.osgi.service.cm.ConfigurationException;
@@ -122,10 +124,9 @@ public class LanguageDetectionEngineTest
         expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(
             langIdEngine.getClass().getName()));
         int textAnnotationCount = validateAllTextAnnotations(ci.getMetadata(), text, expectedValues);
-        assertEquals("A single TextAnnotation is expected", 1,textAnnotationCount);
-        //even through this tests do not validate service quality but rather
-        //the correct integration of the CELI service as EnhancementEngine
-        //we expect the "en" is detected for the parsed text
+        assertTrue("A TextAnnotation is expected", textAnnotationCount > 0);
+        //even though these tests do not validate detection quality
+        //we expect that "en" is detected as the best guess for the parsed text
         assertEquals("The detected language for text '"+text+"' MUST BE 'en'",
             "en",EnhancementEngineHelper.getLanguage(ci));