You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/03/11 14:19:02 UTC

svn commit: r1455131 [3/7] - in /stanbol/branches/stanbol-solr4: commons/ commons/frameworkfragment/ commons/solr/core/ commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/ commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/uti...

Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/TestKuromojiNlpEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/TestKuromojiNlpEngine.java?rev=1455131&view=auto
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/TestKuromojiNlpEngine.java (added)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/TestKuromojiNlpEngine.java Mon Mar 11 13:18:59 2013
@@ -0,0 +1,138 @@
+package org.apache.stanbol.enhancer.engines.kuromoji.impl;
+
+import java.io.IOException;
+import java.util.Dictionary;
+import java.util.HashMap;
+import java.util.Hashtable;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.Resource;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.stanbol.commons.solr.utils.DataFileResourceLoader;
+import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
+import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Sentence;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
+import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
+import org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.osgi.service.cm.ConfigurationException;
+
+public class TestKuromojiNlpEngine {
+    public static final String FAKE_BUNDLE_SYMBOLIC_NAME = "FAKE_BUNDLE_SYMBOLIC_NAME";
+   
+    private static DataFileProvider dataFileProvider;
+
+    private static ContentItemFactory contentItemFactory;
+    
+    private static UriRef id = new UriRef("http://www.example.org/contentItem1");
+    /**
+     * Test text taken from the <a href ="http://ja.wikipedia.org/wiki/%E3%83%AD%E3%83%B3%E3%83%89%E3%83%B3">
+     * Japanese Wikipedia page for London</a>.
+     */
+    private static String text = "ロンドンはイングランドおよびイギリスの首都であり、イギリスや欧州"+
+            "連合域内で最大の都市圏を形成している。ロンドンはテムズ川河畔に位置し、2,000年前のローマ帝国"+
+            "によるロンディニウム創建が都市の起源である。ロンディニウム当時の街の中心部は、現在のシティ・"+
+            "オブ・ロンドン(シティ)に当たる地域にあった。シティの市街壁内の面積は約1平方マイルあり、"+
+            "中世以来その範囲はほぼ変わっていない。少なくとも19世紀以降、「ロンドン」の名称はシティの市"+
+            "街壁を越えて開発が進んだシティ周辺地域をも含めて用いられている。 ロンドンは市街地の大部分は"+
+            "コナベーションにより形成されている。ロンドンを管轄するリージョンであるグレーター・ロンドンでは"+
+            "、選挙で選出された大ロンドン市長とロンドン議会により統治が行われている。";
+
+    private KuromojiNlpEngine engine;
+    
+    private ContentItem contentItem;
+    
+    @BeforeClass // one-time set-up of the static fixtures used by setUpServices
+    public static void initDataFileProvicer(){ // NOTE(review): name is presumably a typo of "initDataFileProvider"
+        dataFileProvider = new ClasspathDataFileProvider(FAKE_BUNDLE_SYMBOLIC_NAME); // wrapped in a DataFileResourceLoader in setUpServices
+        contentItemFactory = InMemoryContentItemFactory.getInstance(); // used in setUpServices to create the ContentItem under test
+    }
+    
+    @Before // creates and activates a fresh engine and a fresh Japanese ContentItem
+    public void setUpServices() throws IOException , ConfigurationException {
+        engine = new KuromojiNlpEngine();
+        //set the fields by hand that would otherwise be injected by the container
+        engine.parentResourceLoader = new DataFileResourceLoader(dataFileProvider);
+        engine.analysedTextFactory = AnalysedTextFactory.getDefaultInstance();
+        Dictionary<String,Object> config = new Hashtable<String,Object>();
+        config.put(EnhancementEngine.PROPERTY_NAME, "gosen-nlp"); // NOTE(review): name looks copied from another engine's test - confirm whether "kuromoji-nlp" was intended
+        engine.activate(new MockComponentContext(config)); // activate with a mock context that carries the configuration built above
+        contentItem = contentItemFactory.createContentItem(id, new StringSource(text));
+        //add a dc:language annotation to the metadata stating that the text is Japanese ("ja")
+        contentItem.getMetadata().add(new TripleImpl(id, Properties.DC_LANGUAGE, 
+            new PlainLiteralImpl("ja")));
+    }
+    
+    @Test // regression test: runs the engine on the Japanese sample text and compares against recorded counts
+    public void testEngine() throws EngineException {
+        LiteralFactory lf = LiteralFactory.getInstance();
+        Assert.assertEquals(EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(contentItem)); // engine must answer ENHANCE_ASYNC for this item
+        engine.computeEnhancements(contentItem);
+        //assert the results: expected property values handed to the TextAnnotation validation below
+        Map<UriRef,Resource> expected = new HashMap<UriRef,Resource>();
+        expected.put(Properties.DC_CREATOR, lf.createTypedLiteral(engine.getClass().getName()));
+        expected.put(Properties.ENHANCER_EXTRACTED_FROM,contentItem.getUri());
+        Assert.assertEquals(16, EnhancementStructureHelper.validateAllTextAnnotations( // 16 = recorded return value; presumably the number of validated TextAnnotations
+            contentItem.getMetadata(), text, expected));
+        AnalysedText at = AnalysedTextUtils.getAnalysedText(contentItem);
+        Assert.assertNotNull(at); // the engine must have attached an AnalysedText to the content item
+        List<Sentence> sentences = AnalysedTextUtils.asList(at.getSentences());
+        Assert.assertNotNull(sentences);
+        Assert.assertEquals(7, sentences.size()); // also keeps the index into the 7-element arrays below in range
+        //TODO: the values in the following arrays were recorded from the first run
+        // of the engine, so they can only detect changes in the results. They can
+        // not validate that the tokenization and NER detections are correct, as
+        // the author does not speak Japanese.
+        int[] expectedChunks = new int[]{ 5, 3, 1, 0, 1, 2, 4}; // expected number of chunks per sentence, in sentence order
+        int[]  expectedTokens = new int[]{ 25, 25, 25, 24, 33, 17, 32}; // expected number of tokens per sentence, in sentence order
+        int sentIndex = 0;
+        for(Sentence sent : sentences){
+            List<Chunk> sentenceNer = AnalysedTextUtils.asList(sent.getChunks());
+            Assert.assertEquals(expectedChunks[sentIndex], sentenceNer.size());
+            for(Chunk chunk : sentenceNer){
+                Value<NerTag> nerValue = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION);
+                Assert.assertNotNull(nerValue); // every chunk must carry a NER annotation ...
+                Assert.assertNotNull(nerValue.value().getType()); // ... whose tag has a non-null type
+            }
+            List<Token> tokens = AnalysedTextUtils.asList(sent.getTokens());
+           Assert.assertEquals(expectedTokens[sentIndex], tokens.size());
+            for(Token token : tokens){
+                Value<PosTag> posValue = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
+                Assert.assertNotNull(posValue); // every token must carry a POS annotation
+            }
+            sentIndex++;
+        }
+    }
+    
+
+    @After // deactivates and releases the engine created in setUpServices
+    public void cleanUpServices(){
+        if(engine != null){ // nothing to deactivate if no engine was created
+            engine.deactivate(null); // this test passes no ComponentContext on deactivation
+        }
+        engine = null; // drop the reference to the deactivated engine
+    }
+    
+}

Modified: stanbol/branches/stanbol-solr4/enhancement-engines/paoding-token/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/paoding-token/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/paoding-token/pom.xml (original)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/paoding-token/pom.xml Mon Mar 11 13:18:59 2013
@@ -95,6 +95,17 @@
       <artifactId>org.apache.stanbol.commons.solr.extras.paoding</artifactId>
       <version>0.11.0</version>
     </dependency>
+    <!-- paoding does not support Solr 4 yet, so the Lucene 3.6.1 artifacts are declared explicitly -->
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-core</artifactId>
+      <version>3.6.1</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-analyzers</artifactId>
+      <version>3.6.1</version>
+    </dependency>
     <dependency>
       <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.enhancer.nlp</artifactId>

Modified: stanbol/branches/stanbol-solr4/enhancement-engines/paoding-token/src/main/java/org/apache/stanbol/enhancer/engines/paoding/token/PaodingTokenizerEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/paoding-token/src/main/java/org/apache/stanbol/enhancer/engines/paoding/token/PaodingTokenizerEngine.java?rev=1455131&r1=1455130&r2=1455131&view=diff
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/paoding-token/src/main/java/org/apache/stanbol/enhancer/engines/paoding/token/PaodingTokenizerEngine.java (original)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/paoding-token/src/main/java/org/apache/stanbol/enhancer/engines/paoding/token/PaodingTokenizerEngine.java Mon Mar 11 13:18:59 2013
@@ -154,6 +154,7 @@ public class PaodingTokenizerEngine exte
         TokenStream ts = pa.tokenStream("dummy", new CharSequenceReader(at.getText()));
         int lastEnd = 0;
         try {
+        	ts.reset();
             while(ts.incrementToken()){
                 OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
                 //when tokenizing labels we need to preserve all chars

Modified: stanbol/branches/stanbol-solr4/enhancement-engines/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/pom.xml (original)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/pom.xml Mon Mar 11 13:18:59 2013
@@ -64,6 +64,9 @@
     <module>paoding-token</module> <!-- tokenizing -->
     <module>nlp2rdf</module> <!-- converts AnalyzedText ContentPart to RDF -->
 
+    <!-- Japanese NLP processing -->
+    <module>kuromoji-nlp</module>
+
     <!-- RESTful NLP analyser service engine-->
     <module>restful-nlp</module> <!-- see STANBOL-893 -->
     <module>restful-langident</module> <!-- see STANBOL-895 -->

Modified: stanbol/branches/stanbol-solr4/enhancement-engines/sentiment-wordclassifier/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/sentiment-wordclassifier/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/sentiment-wordclassifier/pom.xml (original)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/sentiment-wordclassifier/pom.xml Mon Mar 11 13:18:59 2013
@@ -108,11 +108,15 @@
       <artifactId>org.apache.stanbol.enhancer.nlp</artifactId>
       <version>0.10.0</version>
     </dependency>
-    <dependency>
+    <dependency> <!-- for tracking and loading sentiment wordlists -->
       <groupId>org.apache.stanbol</groupId>
-      <artifactId>org.apache.stanbol.commons.solr.core</artifactId>
+      <artifactId>org.apache.stanbol.commons.stanboltools.datafileprovider</artifactId>
       <version>0.11.0</version>
     </dependency>
+    <dependency><!-- for stemming English words -->
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-analyzers-common</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.apache.felix</groupId>
       <artifactId>org.apache.felix.scr.annotations</artifactId>

Modified: stanbol/branches/stanbol-solr4/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java?rev=1455131&r1=1455130&r2=1455131&view=diff
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java (original)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java Mon Mar 11 13:18:59 2013
@@ -166,7 +166,7 @@ public class SentiWordNet {
         private ReadWriteLock lock = new ReentrantReadWriteLock();
         private Map<String,Double> wordMap = new TreeMap<String,Double>();
 
-        private EnglishMinimalStemmer stemmer = new EnglishMinimalStemmer();
+        private org.apache.lucene.analysis.en.EnglishMinimalStemmer stemmer = new EnglishMinimalStemmer();
 
         protected SentiWordNetClassifierEN() {}
 

Modified: stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/pom.xml (original)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/pom.xml Mon Mar 11 13:18:59 2013
@@ -86,7 +86,7 @@
     <dependency>
       <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.commons.solr.extras.smartcn</artifactId>
-      <version>0.11.0</version>
+      <version>0.12.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.stanbol</groupId>

Modified: stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/src/main/java/org/apache/stanbol/enhancer/engines/smartcn/impl/SmartcnSentenceEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/src/main/java/org/apache/stanbol/enhancer/engines/smartcn/impl/SmartcnSentenceEngine.java?rev=1455131&r1=1455130&r2=1455131&view=diff
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/src/main/java/org/apache/stanbol/enhancer/engines/smartcn/impl/SmartcnSentenceEngine.java (original)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/src/main/java/org/apache/stanbol/enhancer/engines/smartcn/impl/SmartcnSentenceEngine.java Mon Mar 11 13:18:59 2013
@@ -20,6 +20,7 @@ import static org.apache.stanbol.enhance
 import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText;
 
 import java.io.IOException;
+import java.io.StringReader;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -152,6 +153,7 @@ public class SmartcnSentenceEngine exten
         //first the sentences
         TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
         try {
+        	sentences.reset();
             while(sentences.incrementToken()){
                 OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
                 Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
@@ -203,6 +205,7 @@ public class SmartcnSentenceEngine exten
         private Sentence sentence = null;
 
         protected AnalyzedTextSentenceTokenizer(AnalysedText at) {
+            super(new StringReader(at.getText().toString()));
             this.at = at;
             sentences = at.getSentences();
         }

Modified: stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/src/main/java/org/apache/stanbol/enhancer/engines/smartcn/impl/SmartcnTokenizerEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/src/main/java/org/apache/stanbol/enhancer/engines/smartcn/impl/SmartcnTokenizerEngine.java?rev=1455131&r1=1455130&r2=1455131&view=diff
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/src/main/java/org/apache/stanbol/enhancer/engines/smartcn/impl/SmartcnTokenizerEngine.java (original)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/src/main/java/org/apache/stanbol/enhancer/engines/smartcn/impl/SmartcnTokenizerEngine.java Mon Mar 11 13:18:59 2013
@@ -20,6 +20,7 @@ import static org.apache.stanbol.enhance
 import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText;
 
 import java.io.IOException;
+import java.io.StringReader;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -170,6 +171,7 @@ public class SmartcnTokenizerEngine exte
         //now the tokens
         TokenStream tokens = new WordTokenFilter(new AnalyzedTextSentenceTokenizer(at));
         try {
+        	tokens.reset();
             while(tokens.incrementToken()){
                 OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class);
                 Token t = at.addToken(offset.startOffset(), offset.endOffset());
@@ -219,6 +221,7 @@ public class SmartcnTokenizerEngine exte
         private Sentence sentence = null;
 
         protected AnalyzedTextSentenceTokenizer(AnalysedText at) {
+            super(new StringReader(at.getText().toString()));
             this.at = at;
             sentences = at.getSentences();
         }

Modified: stanbol/branches/stanbol-solr4/enhancement-engines/topic/engine/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/topic/engine/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/topic/engine/pom.xml (original)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/topic/engine/pom.xml Mon Mar 11 13:18:59 2013
@@ -218,7 +218,6 @@
     <dependency>
       <groupId>org.apache.httpcomponents</groupId>
       <artifactId>httpcore-osgi</artifactId>
-      <version>4.0.1</version>
       <scope>test</scope>
     </dependency>
     <dependency>
@@ -294,15 +293,16 @@
       <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
       <version>0.10.0</version>
     </dependency>
-    <dependency>
+    <!-- not declared explicitly: already a transitive dependency of commons.solr.managed
+      <dependency>
       <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.commons.solr.core</artifactId>
-      <version>0.11.0</version>
-    </dependency>
+      <version>0.12.0-SNAPSHOT</version>
+    </dependency> -->
     <dependency>
       <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.commons.solr.managed</artifactId>
-      <version>0.11.0</version>
+      <version>0.12.0-SNAPSHOT</version>
     </dependency>    
   </dependencies>
 

Modified: stanbol/branches/stanbol-solr4/entityhub/generic/test/src/main/java/org/apache/stanbol/entityhub/test/it/AssertEntityhubJson.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/entityhub/generic/test/src/main/java/org/apache/stanbol/entityhub/test/it/AssertEntityhubJson.java?rev=1455131&r1=1455130&r2=1455131&view=diff
==============================================================================
--- stanbol/branches/stanbol-solr4/entityhub/generic/test/src/main/java/org/apache/stanbol/entityhub/test/it/AssertEntityhubJson.java (original)
+++ stanbol/branches/stanbol-solr4/entityhub/generic/test/src/main/java/org/apache/stanbol/entityhub/test/it/AssertEntityhubJson.java Mon Mar 11 13:18:59 2013
@@ -63,6 +63,7 @@ public class AssertEntityhubJson {
      * {@link QueryTestCase#getExpectedStatus()} is a 2xx status code.
      */
     public static void assertQueryResults(RequestExecutor re, QueryTestCase test) throws JSONException{
+    	log.debug("Assert Query Results for test {}",test.getContent());
         re.assertStatus(test.getExpectedStatus());
         re.assertContentType("application/json"); //currently only application/json is supported
         if(!test.expectsSuccess()){

Modified: stanbol/branches/stanbol-solr4/entityhub/generic/test/src/main/java/org/apache/stanbol/entityhub/test/it/EntityhubTestBase.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/entityhub/generic/test/src/main/java/org/apache/stanbol/entityhub/test/it/EntityhubTestBase.java?rev=1455131&r1=1455130&r2=1455131&view=diff
==============================================================================
--- stanbol/branches/stanbol-solr4/entityhub/generic/test/src/main/java/org/apache/stanbol/entityhub/test/it/EntityhubTestBase.java (original)
+++ stanbol/branches/stanbol-solr4/entityhub/generic/test/src/main/java/org/apache/stanbol/entityhub/test/it/EntityhubTestBase.java Mon Mar 11 13:18:59 2013
@@ -97,6 +97,14 @@ public abstract class EntityhubTestBase 
                             referencedSite));
                     }
                 }
+                //request a non-existing entity from every site: this ensures that all sites are initialized
+                for(String referencedSite : referencedSites){
+	                re = executor.execute(
+	                        builder.buildGetRequest("/entityhub/site/"+referencedSite +
+	                        		"/entity?id=urn:does:not:exist:f82js95xsig39s.23987")
+	                        .withHeader("Accept", "application/json"));
+	                re.assertStatus(404);
+                }
                 log.info("Entityhub services checked, all present");
                 return true;
             }