You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/12/21 21:03:53 UTC

svn commit: r1425121 - in /stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene: ./ src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/ src/test/java/org/apache/stanbol/enhancer/engines/entityhubli...

Author: rwesten
Date: Fri Dec 21 20:03:52 2012
New Revision: 1425121

URL: http://svn.apache.org/viewvc?rev=1425121&view=rev
Log:
STANBOL-849: added support for configuring TokenFilterFactories. Now it is possible to use whole Analysing Chains as LabelTokenizer. Also added an example/unit test based on the smartcn analyzer (Chinese)

Added:
    stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/test/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/TokenizerAndTokenFIlterTest.java
Modified:
    stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/pom.xml
    stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/LuceneLabelTokenizer.java

Modified: stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/pom.xml?rev=1425121&r1=1425120&r2=1425121&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/pom.xml (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/pom.xml Fri Dec 21 20:03:52 2012
@@ -105,6 +105,20 @@
       <artifactId>slf4j-simple</artifactId>
       <scope>test</scope>      
     </dependency>
+    
+    <dependency> <!-- To test Chinese text tokenizing configuration -->
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-smartcn</artifactId>
+      <version>3.6.1</version>
+      <scope>test</scope>      
+    </dependency>
+    <dependency>
+      <groupId>org.apache.solr</groupId>
+      <artifactId>solr-analysis-extras</artifactId>
+      <scope>test</scope>      
+    </dependency>
+    
+    
   </dependencies>
 
 </project>

Modified: stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/LuceneLabelTokenizer.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/LuceneLabelTokenizer.java?rev=1425121&r1=1425120&r2=1425121&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/LuceneLabelTokenizer.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/LuceneLabelTokenizer.java Fri Dec 21 20:03:52 2012
@@ -4,6 +4,8 @@ package org.apache.stanbol.enhancer.engi
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
 
@@ -14,10 +16,11 @@ import org.apache.felix.scr.annotations.
 import org.apache.felix.scr.annotations.Properties;
 import org.apache.felix.scr.annotations.Property;
 import org.apache.felix.scr.annotations.Service;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.util.Attribute;
 import org.apache.lucene.util.Version;
+import org.apache.solr.analysis.TokenFilterFactory;
 import org.apache.solr.analysis.TokenizerFactory;
 import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
 import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
@@ -34,6 +37,7 @@ import org.slf4j.LoggerFactory;
     metatype=true)
 @Properties(value={
         @Property(name=LuceneLabelTokenizer.PROPERTY_TOKENIZER_FACTORY,value="{full-qualified-class-name}"),
+        @Property(name=LuceneLabelTokenizer.PROPERTY_TOKEN_FILTER_FACTORY,cardinality=Integer.MAX_VALUE,value=""),
         @Property(name=LabelTokenizer.SUPPORTED_LANUAGES,value="{lang1},{lang2},!{lang3},{*}"),
         @Property(name=Constants.SERVICE_RANKING,intValue=0)
 })
@@ -44,7 +48,9 @@ public class LuceneLabelTokenizer implem
     private static final String[] EMPTY = new String[]{};
     
     public static final String PROPERTY_TOKENIZER_FACTORY = "enhancer.engine.linking.labeltokenizer.lucene.tokenizerFactory";
+    public static final String PROPERTY_TOKEN_FILTER_FACTORY = "enhancer.engine.linking.labeltokenizer.lucene.tokenFilterFactory";
     private TokenizerFactory tokenizerFactory;
+    private List<TokenFilterFactory> filterFactories = new ArrayList<TokenFilterFactory>();
     private LanguageConfiguration langConf = new LanguageConfiguration(SUPPORTED_LANUAGES, new String[]{});
     
     @Activate
@@ -57,6 +63,7 @@ public class LuceneLabelTokenizer implem
         Class<?> tokenizerFactoryClass;
         try {
             tokenizerFactoryClass = getClass().getClassLoader().loadClass(value.toString());
+            log.info(" ... adding {}",tokenizerFactoryClass.getSimpleName());
         } catch (ClassNotFoundException e) {
             throw new ConfigurationException(PROPERTY_TOKENIZER_FACTORY, "Unable to load the "
                 + "class for the parsed name '"+value+"'!");
@@ -79,6 +86,56 @@ public class LuceneLabelTokenizer implem
             throw new ConfigurationException(PROPERTY_TOKENIZER_FACTORY, "The parsed class '"
                     + tokenizerFactoryClass +"' is not assignable to "+TokenizerFactory.class);
         }
+        Collection<String> values;
+        value = ctx.getProperties().get(PROPERTY_TOKEN_FILTER_FACTORY);
+        if(value == null){
+            values = Collections.emptyList();
+        } else if(value instanceof Collection<?>){
+            values = new ArrayList<String>(((Collection<?>)value).size());
+            for(Object v : (Collection<Object>)value){
+                if(v != null && !v.toString().isEmpty()){
+                    values.add(v.toString());
+                }
+            }
+        } else if(value instanceof String[]){
+            values = Arrays.asList((String[])value);
+        } else if(value instanceof String){
+            values = Collections.singleton((String)value);
+        } else {
+            throw new ConfigurationException(PROPERTY_TOKEN_FILTER_FACTORY, "The type '"
+                + value.getClass()+"' of the parsed value is not supported (supported are "
+                + "Collections, String[] and String values)!");
+        }
+        for(String filterClassName : values){
+            Class<?> tokenFilterFactoryClass;
+            try {
+                tokenFilterFactoryClass = getClass().getClassLoader().loadClass(filterClassName);
+                log.info(" ... adding {}",tokenFilterFactoryClass.getSimpleName());
+            } catch (ClassNotFoundException e) {
+                throw new ConfigurationException(PROPERTY_TOKEN_FILTER_FACTORY, "Unable to load the "
+                    + "class for the parsed name '"+filterClassName+"'!");
+            }
+            Object filterFactoryObject;
+            try {
+                filterFactoryObject = tokenFilterFactoryClass.newInstance();
+            } catch (InstantiationException e) {
+                throw new ConfigurationException(PROPERTY_TOKEN_FILTER_FACTORY, "Unable to instantiate the "
+                        + "class '"+tokenFilterFactoryClass+"'!", e);
+            } catch (IllegalAccessException e) {
+                throw new ConfigurationException(PROPERTY_TOKEN_FILTER_FACTORY, "Unable to instantiate the "
+                        + "class '"+tokenFilterFactoryClass+"'!", e);
+            }
+            
+            if(filterFactoryObject instanceof TokenFilterFactory){
+                TokenFilterFactory tff = (TokenFilterFactory)filterFactoryObject;
+                tff.init(Collections.singletonMap("luceneMatchVersion", Version.LUCENE_36.toString()));
+                filterFactories.add(tff);
+            } else {
+                throw new ConfigurationException(PROPERTY_TOKEN_FILTER_FACTORY, "The parsed class '"
+                        + tokenFilterFactoryClass +"' is not assignable to "+TokenFilterFactory.class);
+            }
+            
+        }
         //init the language configuration
         value = ctx.getProperties().get(LabelTokenizer.SUPPORTED_LANUAGES);
         if(value == null){
@@ -104,7 +161,11 @@ public class LuceneLabelTokenizer implem
             if(label.isEmpty()){
                 return EMPTY;
             }
-            Tokenizer tokenizer = tokenizerFactory.create(new StringReader(label));
+            //build the analysing chain
+            TokenStream tokenizer = tokenizerFactory.create(new StringReader(label));
+            for(TokenFilterFactory filterFactory : filterFactories){
+                tokenizer = filterFactory.create(tokenizer); 
+            }
             List<String> tokens = new ArrayList<String>(8);
             try {
                 while(tokenizer.incrementToken()){

Added: stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/test/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/TokenizerAndTokenFIlterTest.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/test/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/TokenizerAndTokenFIlterTest.java?rev=1425121&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/test/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/TokenizerAndTokenFIlterTest.java (added)
+++ stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/test/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/TokenizerAndTokenFIlterTest.java Fri Dec 21 20:03:52 2012
@@ -0,0 +1,82 @@
+package org.apache.stanbol.enhancer.engines.entityhublinking.labeltokenizer.lucene;
+
+import java.util.Arrays;
+import java.util.Dictionary;
+import java.util.Hashtable;
+import java.util.List;
+
+import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+
+public class TokenizerAndTokenFIlterTest {
+
+    private static final Object TOKENIZER_FACTORY_CLASS = "org.apache.solr.analysis.SmartChineseSentenceTokenizerFactory";
+    private static final String[] TOKEN_FILTER_FACTORY_CLASSES = new String[]{
+        "org.apache.solr.analysis.SmartChineseWordTokenFilterFactory"
+    };
+    private static LuceneLabelTokenizer luceneLabelTokenizer;
+
+    @BeforeClass
+    public static void init() throws ConfigurationException {
+        Dictionary<String,Object> config = new Hashtable<String,Object>();
+        config.put(LuceneLabelTokenizer.PROPERTY_TOKENIZER_FACTORY, TOKENIZER_FACTORY_CLASS);
+        config.put(LuceneLabelTokenizer.PROPERTY_TOKEN_FILTER_FACTORY,TOKEN_FILTER_FACTORY_CLASSES);
+        config.put(LabelTokenizer.SUPPORTED_LANUAGES, "zh");
+        ComponentContext cc = new MockComponentContext(config);
+        luceneLabelTokenizer = new LuceneLabelTokenizer();
+        luceneLabelTokenizer.activate(cc);
+    }
+    
+    @Test(expected=IllegalArgumentException.class)
+    public void testNullLabel(){
+        luceneLabelTokenizer.tokenize(null, "zh");
+    }
+    @Test
+    public void testNullLanguate(){
+        Assert.assertNull(luceneLabelTokenizer.tokenize("test", null));
+    }
+    @Test
+    public void testUnsupportedLanguage(){
+        Assert.assertNull(luceneLabelTokenizer.tokenize("test", "de"));
+    }
+    @Test
+    public void testLuceneLabelTokenizer(){
+        //As I have no idea of Chinese, these tests only validate results I
+        //was getting when testing. So this ensures only that the behaviour
+        //does not change
+        //BBC
+        String label = "英国广播公司";
+        String[] expected = new String[]{"英国","广播","公司"};
+        String[] tokens = luceneLabelTokenizer.tokenize(label, "zh");
+        Assert.assertNotNull(tokens);
+        Assert.assertArrayEquals(expected, tokens);
+        //Yellow Sea (one word??)
+        label = "黄海";
+        expected = new String[]{"黄海"};
+        tokens = luceneLabelTokenizer.tokenize(label, "zh");
+        Assert.assertNotNull(tokens);
+        Assert.assertArrayEquals(expected, tokens);
+        //Barack Obama
+        label = "贝拉克·奥巴马";
+        expected = new String[]{"贝","拉","克","·","奥","巴马"};
+        tokens = luceneLabelTokenizer.tokenize(label, "zh");
+        Assert.assertNotNull(tokens);
+        Assert.assertArrayEquals(expected, tokens);
+    }
+    @Test
+    public void testEmptyLabel(){
+        String[] tokens = luceneLabelTokenizer.tokenize("", "zh");
+        Assert.assertNotNull(tokens);
+        Assert.assertTrue(tokens.length == 0);
+    }
+    
+    @AfterClass
+    public static void close(){
+        luceneLabelTokenizer.deactivate(null);
+    }
+}