You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/12/21 21:03:53 UTC
svn commit: r1425121 - in
/stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene: ./
src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/
src/test/java/org/apache/stanbol/enhancer/engines/entityhubli...
Author: rwesten
Date: Fri Dec 21 20:03:52 2012
New Revision: 1425121
URL: http://svn.apache.org/viewvc?rev=1425121&view=rev
Log:
STANBOL-849: added support for configuring TokenFilterFactories. Now it is possible to use whole Analysing Chains as LabelTokenizer. Also added an example/unit test based on the smartcn analyzer (Chinese)
Added:
stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/test/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/TokenizerAndTokenFIlterTest.java
Modified:
stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/pom.xml
stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/LuceneLabelTokenizer.java
Modified: stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/pom.xml?rev=1425121&r1=1425120&r2=1425121&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/pom.xml (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/pom.xml Fri Dec 21 20:03:52 2012
@@ -105,6 +105,20 @@
<artifactId>slf4j-simple</artifactId>
<scope>test</scope>
</dependency>
+
+ <dependency> <!-- To test ch text tokenizing conf -->
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-smartcn</artifactId>
+ <version>3.6.1</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.solr</groupId>
+ <artifactId>solr-analysis-extras</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+
</dependencies>
</project>
Modified: stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/LuceneLabelTokenizer.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/LuceneLabelTokenizer.java?rev=1425121&r1=1425120&r2=1425121&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/LuceneLabelTokenizer.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/LuceneLabelTokenizer.java Fri Dec 21 20:03:52 2012
@@ -4,6 +4,8 @@ package org.apache.stanbol.enhancer.engi
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
import java.util.Collections;
import java.util.List;
@@ -14,10 +16,11 @@ import org.apache.felix.scr.annotations.
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Service;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.Version;
+import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerFactory;
import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
@@ -34,6 +37,7 @@ import org.slf4j.LoggerFactory;
metatype=true)
@Properties(value={
@Property(name=LuceneLabelTokenizer.PROPERTY_TOKENIZER_FACTORY,value="{full-qualified-class-name}"),
+ @Property(name=LuceneLabelTokenizer.PROPERTY_TOKEN_FILTER_FACTORY,cardinality=Integer.MAX_VALUE,value=""),
@Property(name=LabelTokenizer.SUPPORTED_LANUAGES,value="{lang1},{lang2},!{lang3},{*}"),
@Property(name=Constants.SERVICE_RANKING,intValue=0)
})
@@ -44,7 +48,9 @@ public class LuceneLabelTokenizer implem
private static final String[] EMPTY = new String[]{};
public static final String PROPERTY_TOKENIZER_FACTORY = "enhancer.engine.linking.labeltokenizer.lucene.tokenizerFactory";
+ public static final String PROPERTY_TOKEN_FILTER_FACTORY = "enhancer.engine.linking.labeltokenizer.lucene.tokenFilterFactory";
private TokenizerFactory tokenizerFactory;
+ private List<TokenFilterFactory> filterFactories = new ArrayList<TokenFilterFactory>();
private LanguageConfiguration langConf = new LanguageConfiguration(SUPPORTED_LANUAGES, new String[]{});
@Activate
@@ -57,6 +63,7 @@ public class LuceneLabelTokenizer implem
Class<?> tokenizerFactoryClass;
try {
tokenizerFactoryClass = getClass().getClassLoader().loadClass(value.toString());
+ log.info(" ... adding {}",tokenizerFactoryClass.getSimpleName());
} catch (ClassNotFoundException e) {
throw new ConfigurationException(PROPERTY_TOKENIZER_FACTORY, "Unable to load the "
+ "class for the parsed name '"+value+"'!");
@@ -79,6 +86,56 @@ public class LuceneLabelTokenizer implem
throw new ConfigurationException(PROPERTY_TOKENIZER_FACTORY, "The parsed class '"
+ tokenizerFactoryClass +"' is not assignable to "+TokenizerFactory.class);
}
+ Collection<String> values;
+ value = ctx.getProperties().get(PROPERTY_TOKEN_FILTER_FACTORY);
+ if(value == null){
+ values = Collections.emptyList();
+ } else if(value instanceof Collection<?>){
+ values = new ArrayList<String>(((Collection<?>)value).size());
+ for(Object v : (Collection<Object>)value){
+ if(v != null && !v.toString().isEmpty()){
+ values.add(v.toString());
+ }
+ }
+ } else if(value instanceof String[]){
+ values = Arrays.asList((String[])value);
+ } else if(value instanceof String){
+ values = Collections.singleton((String)value);
+ } else {
+ throw new ConfigurationException(PROPERTY_TOKEN_FILTER_FACTORY, "The type '"
+ + value.getClass()+"' of the parsed value is not supported (supported are "
+ + "Collections, String[] and String values)!");
+ }
+ for(String filterClassName : values){
+ Class<?> tokenFilterFactoryClass;
+ try {
+ tokenFilterFactoryClass = getClass().getClassLoader().loadClass(filterClassName);
+ log.info(" ... adding {}",tokenFilterFactoryClass.getSimpleName());
+ } catch (ClassNotFoundException e) {
+ throw new ConfigurationException(PROPERTY_TOKEN_FILTER_FACTORY, "Unable to load the "
+ + "class for the parsed name '"+filterClassName+"'!");
+ }
+ Object filterFactoryObject;
+ try {
+ filterFactoryObject = tokenFilterFactoryClass.newInstance();
+ } catch (InstantiationException e) {
+ throw new ConfigurationException(PROPERTY_TOKEN_FILTER_FACTORY, "Unable to instantiate the "
+ + "class '"+tokenFilterFactoryClass+"'!", e);
+ } catch (IllegalAccessException e) {
+ throw new ConfigurationException(PROPERTY_TOKEN_FILTER_FACTORY, "Unable to instantiate the "
+ + "class '"+tokenFilterFactoryClass+"'!", e);
+ }
+
+ if(filterFactoryObject instanceof TokenFilterFactory){
+ TokenFilterFactory tff = (TokenFilterFactory)filterFactoryObject;
+ tff.init(Collections.singletonMap("luceneMatchVersion", Version.LUCENE_36.toString()));
+ filterFactories.add(tff);
+ } else {
+ throw new ConfigurationException(PROPERTY_TOKEN_FILTER_FACTORY, "The parsed class '"
+ + tokenFilterFactoryClass +"' is not assignable to "+TokenFilterFactory.class);
+ }
+
+ }
//init the language configuration
value = ctx.getProperties().get(LabelTokenizer.SUPPORTED_LANUAGES);
if(value == null){
@@ -104,7 +161,11 @@ public class LuceneLabelTokenizer implem
if(label.isEmpty()){
return EMPTY;
}
- Tokenizer tokenizer = tokenizerFactory.create(new StringReader(label));
+ //build the analysing chain
+ TokenStream tokenizer = tokenizerFactory.create(new StringReader(label));
+ for(TokenFilterFactory filterFactory : filterFactories){
+ tokenizer = filterFactory.create(tokenizer);
+ }
List<String> tokens = new ArrayList<String>(8);
try {
while(tokenizer.incrementToken()){
Added: stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/test/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/TokenizerAndTokenFIlterTest.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/test/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/TokenizerAndTokenFIlterTest.java?rev=1425121&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/test/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/TokenizerAndTokenFIlterTest.java (added)
+++ stanbol/trunk/enhancement-engines/entitylinking/labeltokenizer-lucene/src/test/java/org/apache/stanbol/enhancer/engines/entityhublinking/labeltokenizer/lucene/TokenizerAndTokenFIlterTest.java Fri Dec 21 20:03:52 2012
@@ -0,0 +1,82 @@
+package org.apache.stanbol.enhancer.engines.entityhublinking.labeltokenizer.lucene;
+
+import java.util.Arrays;
+import java.util.Dictionary;
+import java.util.Hashtable;
+import java.util.List;
+
+import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+
+public class TokenizerAndTokenFIlterTest {
+
+ private static final Object TOKENIZER_FACTORY_CLASS = "org.apache.solr.analysis.SmartChineseSentenceTokenizerFactory";
+ private static final String[] TOKEN_FILTER_FACTORY_CLASSES = new String[]{
+ "org.apache.solr.analysis.SmartChineseWordTokenFilterFactory"
+ };
+ private static LuceneLabelTokenizer luceneLabelTokenizer;
+
+ @BeforeClass
+ public static void init() throws ConfigurationException {
+ Dictionary<String,Object> config = new Hashtable<String,Object>();
+ config.put(LuceneLabelTokenizer.PROPERTY_TOKENIZER_FACTORY, TOKENIZER_FACTORY_CLASS);
+ config.put(LuceneLabelTokenizer.PROPERTY_TOKEN_FILTER_FACTORY,TOKEN_FILTER_FACTORY_CLASSES);
+ config.put(LabelTokenizer.SUPPORTED_LANUAGES, "zh");
+ ComponentContext cc = new MockComponentContext(config);
+ luceneLabelTokenizer = new LuceneLabelTokenizer();
+ luceneLabelTokenizer.activate(cc);
+ }
+
+ @Test(expected=IllegalArgumentException.class)
+ public void testNullLabel(){
+ luceneLabelTokenizer.tokenize(null, "zh");
+ }
+ @Test
+ public void testNullLanguate(){
+ Assert.assertNull(luceneLabelTokenizer.tokenize("test", null));
+ }
+ @Test
+ public void testUnsupportedLanguage(){
+ Assert.assertNull(luceneLabelTokenizer.tokenize("test", "de"));
+ }
+ @Test
+ public void testLuceneLabelTokenizer(){
+ //As I have no idea of Chinese, these tests only validate the results I
+ //was getting when testing. So this merely ensures that the behaviour
+ //does not change
+ //BBC
+ String label = "英国广播公司";
+ String[] expected = new String[]{"英国","广播","公司"};
+ String[] tokens = luceneLabelTokenizer.tokenize(label, "zh");
+ Assert.assertNotNull(tokens);
+ Assert.assertArrayEquals(expected, tokens);
+ //Yellow Sea (one word??)
+ label = "黄海";
+ expected = new String[]{"黄海"};
+ tokens = luceneLabelTokenizer.tokenize(label, "zh");
+ Assert.assertNotNull(tokens);
+ Assert.assertArrayEquals(expected, tokens);
+ //Barack Obama
+ label = "贝拉克·奥巴马";
+ expected = new String[]{"贝","拉","克","·","奥","巴马"};
+ tokens = luceneLabelTokenizer.tokenize(label, "zh");
+ Assert.assertNotNull(tokens);
+ Assert.assertArrayEquals(expected, tokens);
+ }
+ @Test
+ public void testEmptyLabel(){
+ String[] tokens = luceneLabelTokenizer.tokenize("", "zh");
+ Assert.assertNotNull(tokens);
+ Assert.assertTrue(tokens.length == 0);
+ }
+
+ @AfterClass
+ public static void close(){
+ luceneLabelTokenizer.deactivate(null);
+ }
+}