You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/03/11 14:19:02 UTC
svn commit: r1455131 [2/7] - in /stanbol/branches/stanbol-solr4: commons/ commons/frameworkfragment/ commons/solr/core/ commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/ commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/uti...

Modified: stanbol/branches/stanbol-solr4/enhancement-engines/entitylinking/labeltokenizer-smartcn/src/main/resources/config/org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.lucene.LuceneLabelTokenizer-smartcn.config
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/entitylinking/labeltokenizer-smartcn/src/main/resources/config/org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.lucene.LuceneLabelTokenizer-smartcn.config?rev=1455131&r1=1455130&r2=1455131&view=diff
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/entitylinking/labeltokenizer-smartcn/src/main/resources/config/org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.lucene.LuceneLabelTokenizer-smartcn.config (original)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/entitylinking/labeltokenizer-smartcn/src/main/resources/config/org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.lucene.LuceneLabelTokenizer-smartcn.config Mon Mar 11 13:18:59 2013
@@ -1,5 +1,5 @@
 enhancer.engines.entitylinking.labeltokenizer.languages=["zh"]
 enhancer.engine.linking.labeltokenizer.lucene.charFilterFactory=""
-enhancer.engine.linking.labeltokenizer.lucene.tokenizerFactory="org.apache.solr.analysis.SmartChineseSentenceTokenizerFactory"
-enhancer.engine.linking.labeltokenizer.lucene.tokenFilterFactory=["org.apache.solr.analysis.SmartChineseWordTokenFilterFactory"]
+enhancer.engine.linking.labeltokenizer.lucene.tokenizerFactory="org.apache.lucene.analysis.cn.smart.SmartChineseSentenceTokenizerFactory"
+enhancer.engine.linking.labeltokenizer.lucene.tokenFilterFactory=["org.apache.lucene.analysis.cn.smart.SmartChineseWordTokenFilterFactory"]
 service.ranking=I"100"

Modified: stanbol/branches/stanbol-solr4/enhancement-engines/entitylinking/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/entitylinking/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/entitylinking/pom.xml (original)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/entitylinking/pom.xml Mon Mar 11 13:18:59 2013
@@ -56,6 +56,8 @@
     <!-- Chinese Label Tokenizer -->
     <module>labeltokenizer-smartcn</module> <!-- config for the lucene label tokenizer -->
     <module>labeltokenizer-paoding</module> <!-- implementation based on paoding -->
+    <!-- Japanese -->
+    <module>labeltokenizer-kuromoji</module> <!-- configuration based on kuromoji-->
   </modules>
 
   <profiles>

Modified: stanbol/branches/stanbol-solr4/enhancement-engines/entitytagging/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/entitytagging/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/entitytagging/pom.xml (original)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/entitytagging/pom.xml Mon Mar 11 13:18:59 2013
@@ -151,7 +151,7 @@
      <dependency>
       <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.entityhub.yard.solr</artifactId>
-      <version>0.11.0</version>
+      <version>0.12.0-SNAPSHOT</version>
       <scope>test</scope>
     </dependency>
      <dependency>

Propchange: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Mar 11 13:18:59 2013
@@ -0,0 +1,7 @@
+.settings
+
+.classpath
+
+.project
+
+target

Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/pom.xml?rev=1455131&view=auto
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/pom.xml (added)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/pom.xml Mon Mar 11 13:18:59 2013
@@ -0,0 +1,127 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  You under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <!-- Maven build descriptor for the Kuromoji NLP enhancement engine; the artifact is packaged as a bundle (see packaging below). -->
+  <parent>
+    <groupId>org.apache.stanbol</groupId>
+    <artifactId>apache-stanbol-enhancement-engines</artifactId>
+    <version>0.10.1-SNAPSHOT</version>
+    <relativePath>..</relativePath>
+  </parent>
+
+  <groupId>org.apache.stanbol</groupId>
+  <artifactId>org.apache.stanbol.enhancer.engines.kuromoji.nlp</artifactId>
+  <version>0.10.1-SNAPSHOT</version>
+  <packaging>bundle</packaging>
+
+  <name>Apache Stanbol Enhancement Engine : Kuromoji NLP </name>
+  <description>NLP processing based on the Lucene Kuromoji module</description>
+
+  <inceptionYear>2013</inceptionYear>
+
+  <scm> <!-- NOTE(review): the URLs below point at trunk, while this module is added on the stanbol-solr4 branch - confirm this is intended -->
+    <connection>
+      scm:svn:http://svn.apache.org/repos/asf/stanbol/trunk/enhancement-engines/kuromoji-nlp
+    </connection>
+    <developerConnection>
+      scm:svn:https://svn.apache.org/repos/asf/stanbol/trunk/enhancement-engines/kuromoji-nlp
+    </developerConnection>
+    <url>http://stanbol.apache.org/</url>
+  </scm>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions> <!-- manifest instructions: explicit version ranges for the two enhancer servicesapi packages, all other imports via '*' -->
+            <Import-Package>
+              org.apache.stanbol.enhancer.servicesapi; provide:=true; version="[0.10,0.12)",
+              org.apache.stanbol.enhancer.servicesapi.impl; provide:=true; version="[0.10,0.12)",
+              *
+            </Import-Package>
+            <Private-Package>
+              org.apache.stanbol.enhancer.engines.kuromoji.impl
+            </Private-Package>
+          </instructions> <!-- NOTE(review): only the '.impl' package is listed as private, but Constants.java declares package org.apache.stanbol.enhancer.engines.kuromoji - verify that package ends up in the bundle -->
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.rat</groupId>
+        <artifactId>apache-rat-plugin</artifactId>
+        <configuration>
+          <excludes>
+            <!-- Excluded from the RAT license check; 'AL20' presumably stands for Apache License 2.0 - confirm -->
+            <exclude>src/license/THIRD-PARTY.properties</exclude>
+          </excludes>
+        </configuration>
+      </plugin>
+      <plugin> <!-- no version or configuration here; presumably both are inherited from a parent POM - confirm -->
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-scr-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
+      <version>0.10.0</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.commons.solr.core</artifactId>
+      <version>0.12.0-SNAPSHOT</version>
+    </dependency>
+    <dependency> <!-- no version element: it has to be supplied by dependency management of a parent POM (same for the other version-less entries below) -->
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-analyzers-kuromoji</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.nlp</artifactId>
+      <version>0.10.0</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.felix</groupId>
+      <artifactId>org.apache.felix.scr.annotations</artifactId>
+    </dependency>
+    <!-- test-scoped dependencies. NOTE(review): enhancer.test/core use 0.11.0-SNAPSHOT while servicesapi/nlp above use the 0.10.0 release - confirm the mix is intended -->
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.test</artifactId>
+      <version>0.11.0-SNAPSHOT</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.core</artifactId>
+      <version>0.11.0-SNAPSHOT</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-simple</artifactId>
+      <scope>test</scope>
+    </dependency>    
+  </dependencies>
+
+</project>

Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/Constants.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/Constants.java?rev=1455131&view=auto
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/Constants.java (added)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/Constants.java Mon Mar 11 13:18:59 2013
@@ -0,0 +1,526 @@
+package org.apache.stanbol.enhancer.engines.kuromoji;
+
+import org.apache.lucene.analysis.ja.util.ToStringUtil;
+import org.apache.stanbol.enhancer.nlp.model.tag.TagSet;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
+
+/**
+ * Defines mappings of the String tags used by Kuromoji to the vocabulary used
+ * by the Stanbol NLP processing module
+ * @author Rupert Westenthaler
+ */
+public class Constants {
+
+    
+    /**
+     * set of part of speech tags as defined in the {@link ToStringUtil} class.
+     * Descriptions are taken from the 
+     * <a href="http://lucene-gosen.googlecode.com/svn/trunk/example/stoptags_ja.txt">
+     * Gosen Pos Tag Documentation</a> as the Tag Set used by Kuromoji does 
+     * exactly match those used by Gosen.
+     */
+    public static final TagSet<PosTag> POS_TAG_SET = new TagSet<PosTag>("Kuromoji Japanese", "ja");
+    /**
+     * NerTags for the Kuromoji part of speech tags that denote named entities (persons, organisations and places)
+     */
+    public static final TagSet<NerTag> NER_TAG_SET = new TagSet<NerTag>("Kuromoji Japanese", "ja");
+    
+    static {
+         /**
+         *  noun: unclassified nouns
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©",LexicalCategory.Noun));
+        /**
+         *  noun-common: Common nouns or nouns where the sub-classification is undefined
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-ä¸è¬",Pos.CommonNoun));
+        /**
+         *  noun-proper: Proper nouns where the sub-classification is undefined 
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©",Pos.ProperNoun));
+         /**
+         *  noun-proper-misc: miscellaneous proper nouns
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-ä¸è¬",Pos.ProperNoun));
+         /**
+         *  noun-proper-person: Personal names where the sub-classification is undefined
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-äººå",Pos.ProperNoun));
+        NER_TAG_SET.addTag(new NerTag("åè©-åºæåè©-äººå",OntologicalClasses.DBPEDIA_PERSON));
+         /**
+         *  noun-proper-person-misc: names that cannot be divided into surname and 
+         *  given name; foreign names; names where the surname or given name is unknown.
+         *  e.g. ãå¸ã®æ¹
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-äººå-ä¸è¬",Pos.ProperNoun));
+        NER_TAG_SET.addTag(new NerTag("åè©-åºæåè©-äººå-ä¸è¬",OntologicalClasses.DBPEDIA_PERSON));
+         /**
+         *  noun-proper-person-surname: Mainly Japanese surnames.
+         *  e.g. å±±ç°
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-äººå-å§",Pos.ProperNoun));
+        NER_TAG_SET.addTag(new NerTag("åè©-åºæåè©-äººå-å§",OntologicalClasses.DBPEDIA_PERSON));
+         /**
+         *  noun-proper-person-given_name: Mainly Japanese given names.
+         *  e.g. å¤ªé
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-äººå-å",Pos.ProperNoun));
+        NER_TAG_SET.addTag(new NerTag("åè©-åºæåè©-äººå-å",OntologicalClasses.DBPEDIA_PERSON));
+         /**
+         *  noun-proper-organization: Names representing organizations.
+         *  e.g. éç£ç, NHK
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-çµç¹",Pos.ProperNoun));
+        NER_TAG_SET.addTag(new NerTag("åè©-åºæåè©-çµç¹",OntologicalClasses.DBPEDIA_ORGANISATION));
+         /**
+         *  noun-proper-place: Place names where the sub-classification is undefined
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-å°å",Pos.ProperNoun));
+        NER_TAG_SET.addTag(new NerTag("åè©-åºæåè©-å°å",OntologicalClasses.DBPEDIA_PLACE));
+         /**
+         *  noun-proper-place-misc: Place names excluding countries.
+         *  e.g. ã¢ã¸ã¢, ãã«ã»ãã, äº¬é½
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-å°å-ä¸è¬",Pos.ProperNoun));
+        NER_TAG_SET.addTag(new NerTag("åè©-åºæåè©-å°å-ä¸è¬",OntologicalClasses.DBPEDIA_PLACE));
+         /**
+         *  noun-proper-place-country: Country names. 
+         *  e.g. æ¥æ¬, ãªã¼ã¹ãã©ãªã¢
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-å°å-å½",Pos.ProperNoun));
+        NER_TAG_SET.addTag(new NerTag("åè©-åºæåè©-å°å-å½",OntologicalClasses.DBPEDIA_PLACE));
+         /**
+         *  noun-pronoun: Pronouns where the sub-classification is undefined
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-ä»£åè©",Pos.Pronoun));
+         /**
+         *  noun-pronoun-misc: miscellaneous pronouns: 
+         *  e.g. ãã, ãã, ããã¤, ããªã, ãã¡ãã¡, ããã¤, ã©ãã, ãªã«, ã¿ãªãã, ã¿ããª, ãããã, ãããã
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-ä»£åè©-ä¸è¬",Pos.Pronoun));
+         /**
+         *  noun-pronoun-contraction: Spoken language contraction made by combining a 
+         *  pronoun and the particle 'wa'.
+         *  e.g. ããã, ããã, ãããã, ããã, ãããã 
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-ä»£åè©-ç¸®ç´",Pos.Pronoun,Pos.Participle));
+         /**
+         *  noun-adverbial: Temporal nouns such as names of days or months that behave 
+         *  like adverbs. Nouns that represent amount or ratios and can be used adverbially,
+         *  e.g. éæ, ä¸æ, åå¾, å°é
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-å¯è©å¯è½",LexicalCategory.Adverb,Pos.CommonNoun));
+         /**
+         *  noun-verbal: Nouns that take arguments with case and can appear followed by 
+         *  'suru' and related verbs (ãã, ã§ãã, ãªãã, ãã ãã)
+         *  e.g. ã¤ã³ããã, æç, æªå, æªæ¦è¦é, ä¸å®å¿, ä¸åã
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-ãµå¤æ¥ç¶",Pos.VerbalNoun));
+         /**
+         *  noun-adjective-base: The base form of adjectives, words that appear before ãª ("na")
+         *  e.g. å¥åº·, å®æ, é§ç®, ã ã
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-å½¢å®¹åè©èªå¹¹",LexicalCategory.Adjective,Pos.CommonNoun));
+         /**
+         *  noun-numeric: Arabic numbers, Chinese numerals, and counters like ä½ (å), æ°.
+         *  e.g. 0, 1, 2, ä½, æ°, å¹¾
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-æ°",Pos.CardinalNumber));
+         /**
+         *  noun-affix: noun affixes where the sub-classification is undefined
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-éèªç«",LexicalCategory.Noun));
+         /**
+         *  noun-affix-misc: Of adnominalizers, the case-marker ã® ("no"), and words that 
+         *  attach to the base form of inflectional words, words that cannot be classified 
+         *  into any of the other categories below. This category includes indefinite nouns.
+         *  e.g. ããã¤ã, æ, ãã, ç²æ, æ°, ããã, å«ã, ãã, ç, ãã¨, äº, ãã¨, æ¯, ãã ã, æ¬¡ç¬¬, 
+         *       é , ãã, æçº, ã¤ãã§, åºã§, ã¤ãã, ç©ãã, ç¹, ã©ãã, ã®, ã¯ã, ç, ã¯ãã¿, å¼¾ã¿, 
+         *       æå, ãµã, ãµã, æ¯ã, ã»ã, æ¹, æ¨, ãã®, ç©, è, ãã, æ, ããã, æä»¥, ãã, è¨³,
+         *       ãã, å²ã, å², ã-å£èª/, ãã-å£èª/
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-éèªç«-ä¸è¬",LexicalCategory.Noun));
+         /**
+         *  noun-affix-adverbial: noun affixes that can behave as adverbs.
+         *  e.g. ããã , é, ããã, æãå¥, ãã¨, å¾, ä½ã, ä»¥å¤, ä»¥é, ä»¥å¾, ä»¥ä¸, ä»¥å, ä¸æ¹, ãã, 
+         *       ä¸, ãã¡, å, ãã, æã, ããã, éã, ãã, ã£ãã, çµæ, ãã, é , ãã, é, æä¸, ããªã, 
+         *       æä¸, ããã, èªä½, ãã³, åº¦, ãã, çº, ã¤ã©, é½åº¦, ã¨ãã, éã, ã¨ã, æ, ã¨ãã, æ, 
+         *       ã¨ãã, éç«¯, ãªã, ä¸, ã®ã¡, å¾, ã°ãã, å ´å, æ¥, ã¶ã, å, ã»ã, ä», ã¾ã, å, ã¾ã¾, 
+         *       å, ä¾, ã¿ãã, ç¢å
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-éèªç«-å¯è©å¯è½",LexicalCategory.Noun,LexicalCategory.Adverb));
+         /**
+         *  noun-affix-aux: noun affixes treated as å©åè© ("auxiliary verb") in school grammars 
+         *  with the stem ãã(ã ) ("you(da)").
+         *  e.g.  ãã, ãã, æ§ (ãã)
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-éèªç«-å©åè©èªå¹¹",Pos.VerbalNoun,Pos.AuxiliaryVerb));
+         /**  
+         *  noun-affix-adjective-base: noun affixes that can connect to the indeclinable
+         *  connection form ãª (aux "da").
+         *  e.g. ã¿ãã, ãµã
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-éèªç«-å½¢å®¹åè©èªå¹¹",LexicalCategory.Noun,LexicalCategory.Adjective));
+         /**
+         *  noun-special: special nouns where the sub-classification is undefined.
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-ç¹æ®",LexicalCategory.Noun));
+         /**
+         *  noun-special-aux: The ããã  ("souda") stem form that is used for reporting news, is 
+         *  treated as å©åè© ("auxiliary verb") in school grammars, and attach to the base 
+         *  form of inflectional words.
+         *  e.g. ãã
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-ç¹æ®-å©åè©èªå¹¹",LexicalCategory.Noun));
+         /**
+         *  noun-suffix: noun suffixes where the sub-classification is undefined.
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾",LexicalCategory.Noun));
+         /**
+         *  noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect 
+         *  to ã¬ã« or ã¿ã¤ and can combine into compound nouns, words that cannot be classified into
+         *  any of the other categories below. In general, this category is more inclusive than 
+         *  æ¥å°¾èª ("suffix") and is usually the last element in a compound noun.
+         *  e.g. ãã, ãã, æ¹, ç²æ (ãã), ããã, ãã¿, æ°å³, ããã¿, (ï½ãã) ã, æ¬¡ç¬¬, æ¸ (ã) ã¿,
+         *       ãã, (ã§ã)ã£ã, æ, è¦³, æ§, å¦, é¡, é¢, ç¨
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-ä¸è¬",LexicalCategory.Noun));
+         /**
+         *  noun-suffix-person: Suffixes that form nouns and attach to person names more often
+         *  than other nouns.
+         *  e.g. å, æ§, è
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-äººå",LexicalCategory.Noun));
+        NER_TAG_SET.addTag(new NerTag("åè©-æ¥å°¾-äººå",OntologicalClasses.DBPEDIA_PERSON));
+         /**
+         *  noun-suffix-place: Suffixes that form nouns and attach to place names more often 
+         *  than other nouns.
+         *  e.g. çº, å¸, ç
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-å°å",LexicalCategory.Noun));
+        NER_TAG_SET.addTag(new NerTag("åè©-æ¥å°¾-å°å",OntologicalClasses.DBPEDIA_PLACE));
+         /**
+         *  noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that 
+         *  can appear before ã¹ã« ("suru").
+         *  e.g. å, è¦, åã, å¥ã, è½ã¡, è²·ã
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-ãµå¤æ¥ç¶",Pos.VerbalNoun));
+         /**
+         *  noun-suffix-aux: The stem form of ããã  (æ§æ) that is used to indicate conditions, 
+         *  is treated as å©åè© ("auxiliary verb") in school grammars, and attach to the 
+         *  conjunctive form of inflectional words.
+         *  e.g. ãã
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-å©åè©èªå¹¹",Pos.VerbalNoun,Pos.AuxiliaryVerb));
+         /**
+         *  noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive 
+         *  form of inflectional words and appear before the copula ã  ("da").
+         *  e.g. ç, ã, ãã¡
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-å½¢å®¹åè©èªå¹¹",LexicalCategory.Noun,LexicalCategory.Adjective));
+         /**
+         *  noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs.
+         *  e.g. å¾ (ã), ä»¥å¾, ä»¥é, ä»¥å, åå¾, ä¸, æ«, ä¸, æ (ã)
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-å¯è©å¯è½",LexicalCategory.Noun,LexicalCategory.Adverb));
+         /**
+         *  noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category 
+         *  is more inclusive than å©æ°è© ("classifier") and includes common nouns that attach 
+         *  to numbers.
+         *  e.g. å, ã¤, æ¬, å, ãã¼ã»ã³ã, cm, kg, ã«æ, ãå½, åºç», æé, æå
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-å©æ°è©",Pos.UnitNoun));
+         /**
+         *  noun-suffix-special: Special suffixes that mainly attach to inflecting words.
+         *  e.g. (æ¥½ã) ã, (èã) æ¹
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-ç¹æ®",Pos.CommonNoun));
+         /**
+         *  noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words 
+         *  together.
+         *  e.g. (æ¥æ¬) å¯¾ (ã¢ã¡ãªã«), å¯¾ (ã¢ã¡ãªã«), (3) å¯¾ (5), (å¥³åª) å¼ (ä¸»å©¦)
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-æ¥ç¶è©ç",LexicalCategory.Conjuction,Pos.CommonNoun));
+         /**
+         *  noun-verbal_aux: Nouns that attach to the conjunctive particle ã¦ ("te") and are 
+         *  semantically verb-like.
+         *  e.g. ããã, ãè¦§, å¾¡è¦§, é æ´
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-åè©éèªç«ç",Pos.VerbalNoun,Pos.AuxiliaryVerb));
+         /**
+         *  noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry, 
+         *  dialects, English, etc. Currently, the only entry for åè© å¼ç¨æåå ("noun quotation") 
+         *  is ããã ("iwaku").
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-å¼ç¨æåå",LexicalCategory.Noun));
+         /**
+         *  noun-nai_adjective: Words that appear before the auxiliary verb ãªã ("nai") and
+         *  behave like an adjective.
+         *  e.g. ç³ãè¨³, ä»æ¹, ã¨ãã§ã, éã
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-ãã¤å½¢å®¹è©èªå¹¹",LexicalCategory.Noun,LexicalCategory.Adjective));
+         /**
+         *  prefix: unclassified prefixes
+         */
+        POS_TAG_SET.addTag(new PosTag("æ¥é è©"));
+         /**
+         *  prefix-nominal: Prefixes that attach to nouns (including adjective stem forms) 
+         *  excluding numerical expressions.
+         *  e.g. ã (æ°´), æ (æ°), å (ç¤¾), æ (ï½æ°), é« (åè³ª), ã (è¦äº), ã (ç«æ´¾)
+         */
+        POS_TAG_SET.addTag(new PosTag("æ¥é è©-åè©æ¥ç¶",LexicalCategory.Noun));
+         /**
+         *  prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb
+         *  in conjunctive form followed by ãªã/ãªãã/ãã ãã.
+         *  e.g. ã (èªã¿ãªãã), ã (åº§ã)
+         */
+        POS_TAG_SET.addTag(new PosTag("æ¥é è©-åè©æ¥ç¶",LexicalCategory.Verb));
+         /**
+         *  prefix-adjectival: Prefixes that attach to adjectives.
+         *  e.g. ã (å¯ãã§ããã), ãã« (ã§ãã)
+         */
+        POS_TAG_SET.addTag(new PosTag("æ¥é è©-å½¢å®¹è©æ¥ç¶",LexicalCategory.Adjective));
+         /**
+         *  prefix-numerical: Prefixes that attach to numerical expressions.
+         *  e.g. ç´, ããã, æ¯æ
+         */
+        POS_TAG_SET.addTag(new PosTag("æ¥é è©-æ°æ¥ç¶",Pos.Numeral));
+         /**
+         *  verb: unclassified verbs
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©",LexicalCategory.Verb));
+         /**
+         *  verb-main:
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-èªç«",Pos.MainVerb));
+         /**
+         *  verb-auxiliary:
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-éèªç«",Pos.AuxiliaryVerb));
+         /**
+         *  verb-suffix:
+         */
+        POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾",LexicalCategory.Verb));
+         /**
+         *  adjective: unclassified adjectives
+         */
+        POS_TAG_SET.addTag(new PosTag("å½¢å®¹è©",LexicalCategory.Adjective));
+         /**
+         *  adjective-main:
+         */
+        POS_TAG_SET.addTag(new PosTag("å½¢å®¹è©-èªç«",LexicalCategory.Adjective));
+         /**
+         *  adjective-auxiliary:
+         */
+        POS_TAG_SET.addTag(new PosTag("å½¢å®¹è©-éèªç«",LexicalCategory.Adjective));
+         /**
+         *  adjective-suffix:
+         */
+        POS_TAG_SET.addTag(new PosTag("å½¢å®¹è©-æ¥å°¾",LexicalCategory.Adjective));
+         /**
+         *  adverb: unclassified adverbs
+         */
+        POS_TAG_SET.addTag(new PosTag("å¯è©",LexicalCategory.Adverb));
+         /**
+         *  adverb-misc: Words that can be segmented into one unit and where adnominal 
+         *  modification is not possible.
+         *  e.g. ãããããã, å¤å
+         */
+        POS_TAG_SET.addTag(new PosTag("å¯è©-ä¸è¬",LexicalCategory.Adverb));
+         /**
+         *  adverb-particle_conjunction: Adverbs that can be followed by ã®, ã¯, ã«, 
+         *  ãª, ãã, ã , etc.
+         *  e.g. ãããªã«, ãããªã«, ãããªã«, ãªã«ã, ãªãã§ã
+         */
+        POS_TAG_SET.addTag(new PosTag("å¯è©-å©è©é¡æ¥ç¶",LexicalCategory.Adverb,Pos.CoordinationParticle));
+         /**
+         *  adnominal: Words that only have noun-modifying forms.
+         *  e.g. ãã®, ãã®, ãã®, ã©ã®, ãããã, ãªãããã®, ä½ããã®, ããããª, ãããã, ãããã, ãããã, 
+         *       ã©ããã, ãããª, ãããª, ãããª, ã©ããª, å¤§ããª, å°ããª, ããããª, ã»ãã®, ãããã, 
+         *       ã(, ã) ãã (ãã¨ãªãã)ã, å¾®ããã, å ããã, åãªã, ãããªã, æãããåã, äº¡ã
+         */
+        POS_TAG_SET.addTag(new PosTag("é£ä½è©",LexicalCategory.Adjective));
+         /**
+         *  conjunction: Conjunctions that can occur independently.
+         *  e.g. ã, ããã©ã, ããã¦, ããã, ããã©ããã
+         */
+        POS_TAG_SET.addTag(new PosTag("æ¥ç¶è©",LexicalCategory.Conjuction));
+         /**
+         *  particle: unclassified particles.
+         */
+        POS_TAG_SET.addTag(new PosTag("å©è©",Pos.Particle));
+         /**
+         *  particle-case: case particles where the subclassification is undefined.
+         */
+        POS_TAG_SET.addTag(new PosTag("å©è©-æ ¼å©è©",Pos.Particle));
+         /**
+         *  particle-case-misc: Case particles.
+         *  e.g. ãã, ã, ã§, ã¨, ã«, ã¸, ãã, ã, ã®, ã«ã¦
+         */
+        POS_TAG_SET.addTag(new PosTag("å©è©-æ ¼å©è©-ä¸è¬",Pos.Particle));
+         /**
+         *  particle-case-quote: the "to" that appears after nouns, a person's speech, 
+         *  quotation marks, expressions of decisions from a meeting, reasons, judgements,
+         *  conjectures, etc.
+         *  e.g. ( ã ) ã¨ (è¿°ã¹ã.), ( ã§ãã) ã¨ (ãã¦å·è¡ç¶äº...)
+         */
+        POS_TAG_SET.addTag(new PosTag("å©è©-æ ¼å©è©-å¼ç¨",Pos.Particle));
+         /**
+         *  particle-case-compound: Compounds of particles and verbs that mainly behave 
+         *  like case particles.
+         *  e.g. ã¨ãã, ã¨ãã£ã, ã¨ããã, ã¨ãã¦, ã¨ã¨ãã«, ã¨å±ã«, ã§ãã£ã¦, ã«ããã£ã¦, ã«å½ãã£ã¦, ã«å½ã£ã¦,
+         *       ã«ããã, ã«å½ãã, ã«å½ã, ã«å½ãã, ã«ããã, ã«ããã¦, ã«æ¼ãã¦,ã«æ¼ã¦, ã«ããã, ã«æ¼ãã, 
+         *       ã«ãã, ã«ããã¦, ã«ããã, ã«é¢ã, ã«ãããã¦, ã«é¢ãã¦, ã«ãããã, ã«é¢ãã, ã«éã, 
+         *       ã«éãã¦, ã«ãããã, ã«å¾ã, ã«å¾ã, ã«ãããã£ã¦, ã«å¾ã£ã¦, ã«ããã, ã«å¯¾ã, ã«ãããã¦, 
+         *       ã«å¯¾ãã¦, ã«ãããã, ã«å¯¾ãã, ã«ã¤ãã¦, ã«ã¤ã, ã«ã¤ã, ã«ã¤ãã¦, ã«ã¤ã, ã«ã¤ãã¦, ã«ã¨ã£ã¦,
+         *       ã«ã¨ã, ã«ã¾ã¤ãã, ã«ãã£ã¦, ã«ä¾ã£ã¦, ã«å ã£ã¦, ã«ãã, ã«ä¾ã, ã«å ã, ã«ãã, ã«ä¾ã, ã«å ã, 
+         *       ã«ããã£ã¦, ã«ããã, ããã£ã¦, ãä»¥ã£ã¦, ãéã, ãéãã¦, ãéãã¦, ãããã£ã¦, ãããã, ãããã,
+         *       ã£ã¦-å£èª/, ã¡ãã-é¢è¥¿å¼ãã¨ããã/, (ä½) ã¦ãã (äºº)-å£èª/, ã£ã¦ãã-å£èª/, ã¨ããµ, ã¨ãããµ
+         */
+        POS_TAG_SET.addTag(new PosTag("å©è©-æ ¼å©è©-é£èª",Pos.Particle));
+         /**
+         *  particle-conjunctive:
+         *  e.g. ãã, ããã«ã¯, ã, ããã©, ããã©ã, ãã©, ã, ã¤ã¤, ã¦, ã§, ã¨, ã¨ããã, ã©ããã, ã¨ã, ã©ã, 
+         *       ãªãã, ãªã, ã®ã§, ã®ã«, ã°, ãã®ã®, ã ( ãã), ãããªã, (ããã) ãã(ãããªã)-å£èª/, 
+         *       (è¡ã£) ã¡ã(ãããªã)-å£èª/, (è¨ã£) ãã£ã¦ (ãããããªã)-å£èª/, (ããããªã)ã£ãã£ã¦ (å¹³æ°)-å£èª/
+         */
+        POS_TAG_SET.addTag(new PosTag("å©è©-æ¥ç¶å©è©",Pos.ConjunctionPhrase,Pos.Particle));
+         /**
+         *  particle-dependency:
+         *  e.g. ãã, ãã, ãã, ãã, ã¯, ã, ã
+         */
+        POS_TAG_SET.addTag(new PosTag("å©è©-ä¿å©è©",Pos.Particle));
+         /**
+         *  particle-adverbial:
+         *  e.g. ãã¦ã, ãã, ããã, ä½, ããã, ãã, (å¦æ ¡) ãã(ãããæµè¡ã£ã¦ãã)-å£èª/, 
+         *       (ãã)ããã (ãããªã)-å£èª/, ãã¤, (ç§) ãªã, ãªã©, (ç§) ãªã (ã«), (åç) ãªãã (å¤§å«ã)-å£èª/,
+         *       (ç§) ãªãã, (åç) ãªãã¦ (å¤§å«ã)-å£èª/, ã®ã¿, ã ã, (ç§) ã ã£ã¦-å£èª/, ã ã«, 
+         *       (å½¼)ã£ãã-å£èª/, (ãè¶) ã§ã (ããã), ç (ã¨ã), (ä»å¾) ã¨ã, ã°ãã, ã°ã£ã-å£èª/, ã°ã£ãã-å£èª/,
+         *       ã»ã©, ç¨, ã¾ã§, è¿, (èª°) ã (ã)([å©è©-æ ¼å©è©] ããã³ [å©è©-ä¿å©è©] ã®åã«ä½ç½®ããããã)
+         */
+        POS_TAG_SET.addTag(new PosTag("å©è©-å¯å©è©",Pos.AdverbialParticiple));
+         /**
+         *  particle-interjective: particles with interjective grammatical roles.
+         *  e.g. (æ¾å³¶) ã
+         */
+        POS_TAG_SET.addTag(new PosTag("å©è©-éæå©è©",Pos.Interjection,Pos.Particle));
+         /**
+         *  particle-coordinate:
+         *  e.g. ã¨, ãã, ã ã®, ã ã, ã¨ã, ãªã, ã, ãã
+         */
+        POS_TAG_SET.addTag(new PosTag("å©è©-ä¸¦ç«å©è©",Pos.CoordinationParticle));
+         /**
+         *  particle-final:
+         *  e.g. ãã, ããã, ã, ã, (ã )ã£ã-å£èª/, (ã¨ã¾ã£ã¦ã) ã§-æ¹è¨/, ãª, ã, ãªã-å£èª/, ã, ã, ã, 
+         *       ãã-å£èª/, ãã-å£èª/, ãã-æ¹è¨/, ã®, ã®ã-å£èª/, ã, ã, ã¨, ãã-å£èª/, ã, ãã-å£èª/
+         */
+        POS_TAG_SET.addTag(new PosTag("å©è©-çµå©è©",Pos.Particle));
+         /**
+         *  particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is 
+         *  adverbial, conjunctive, or sentence final. For example:
+         *       (a) ãA ã B ãã. Ex:ã(å½åã§éç¨ãã) ã,(æµ·å¤ã§éç¨ãã) ã (.)ã
+         *       (b) Inside an adverb phrase. Ex:ã(å¹¸ãã¨ãã) ã (, æ»èã¯ããªãã£ã.)ã
+         *           ã(ç¥ããå±ãããã) ã (, è©¦é¨ã«åæ ¼ãã.)ã
+         *       (c) ããã®ããã«ã. Ex:ã(ä½ããªãã£ã) ã (ã®ããã«æ¯ãèã£ã.)ã
+         *  e.g. ã
+         */
+        POS_TAG_SET.addTag(new PosTag("å©è©-å¯å©è©ï¼ä¸¦ç«å©è©ï¼çµå©è©",Pos.AdverbialParticiple,Pos.ConjunctionPhrase));
+         /**
+         *  particle-adnominalizer: The "no" that attaches to nouns and modifies 
+         *  non-inflectional words.
+         */
+        POS_TAG_SET.addTag(new PosTag("å©è©-é£ä½å",Pos.Particle));
+         /**
+         *  particle-adverbializer: The "ni" and "to" that appear following nouns and adverbs 
+         *  that are giongo, giseigo, or gitaigo.
+         *  e.g. ã«, ã¨
+         */
+        POS_TAG_SET.addTag(new PosTag("å©è©-å¯è©å",Pos.Particle));
+         /**
+         *  particle-special: A particle that does not fit into one of the above classifications. 
+         *  This includes particles that are used in Tanka, Haiku, and other poetry.
+         *  e.g. ããª, ãã, ( ããã ãã) ã«, (ããã) ã«ã(ãããã), (ä¿º) ã (å®¶)
+         */
+        POS_TAG_SET.addTag(new PosTag("å©è©-ç¹æ®",Pos.Participle));
+         /**
+         *  auxiliary-verb:
+         */
+        POS_TAG_SET.addTag(new PosTag("å©åè©",Pos.AuxiliaryVerb));
+         /**
+         *  interjection: Greetings and other exclamations.
+         *  e.g. ãã¯ãã, ãã¯ãããããã¾ã, ããã«ã¡ã¯, ããã°ãã¯, ãããã¨ã, ã©ãããããã¨ã, ãããã¨ããããã¾ã, 
+         *       ããã ãã¾ã, ãã¡ãããã¾, ãããªã, ããããªã, ã¯ã, ããã, ããã, ããããªãã
+         */
+        POS_TAG_SET.addTag(new PosTag("æåè©",Pos.Interjection));
+         /**
+         *  symbol: unclassified Symbols.
+         */
+        POS_TAG_SET.addTag(new PosTag("è¨å·",Pos.Symbol));
+         /**
+         *  symbol-misc: A general symbol not in one of the categories below.
+         *  e.g. [ââ@$ãâ+]
+         */
+        POS_TAG_SET.addTag(new PosTag("è¨å·-ä¸è¬",Pos.Symbol));
+        /**
+        *  symbol-period: Periods and full stops.
+        *  e.g. [.ï¼ã]
+        */
+       POS_TAG_SET.addTag(new PosTag("è¨å·-å¥ç¹",Pos.Point));
+         /**
+         *  symbol-comma: Commas
+         *  e.g. [,ã]
+         */
+        POS_TAG_SET.addTag(new PosTag("è¨å·-èªç¹",Pos.Comma));
+         /**
+         *  symbol-space: Full-width whitespace.
+         */
+        POS_TAG_SET.addTag(new PosTag("è¨å·-ç©ºç½",Pos.Symbol));
+         /**
+         *  symbol-open_bracket:
+         *  e.g. [({ââãã]
+         */
+        POS_TAG_SET.addTag(new PosTag("è¨å·-æ¬å¼§é",Pos.OpenBracket));
+         /**
+         *  symbol-close_bracket:
+         *  e.g. [)}ââããã]
+         */
+        POS_TAG_SET.addTag(new PosTag("è¨å·-æ¬å¼§é",Pos.CloseBracket));
+         /**
+         *  symbol-alphabetic:
+         */
+        POS_TAG_SET.addTag(new PosTag("è¨å·-ã¢ã«ãã¡ããã",Pos.Symbol));
+         /**
+         *  other: unclassified other
+         */
+        POS_TAG_SET.addTag(new PosTag("ãã®ä»",Pos.Foreign));
+         /**
+         *  other-interjection: Words that are hard to classify as noun-suffixes or 
+         *  sentence-final particles.
+         *  e.g. (ã )ã¡
+         */
+        POS_TAG_SET.addTag(new PosTag("ãã®ä»-éæ",LexicalCategory.Noun));
+         /**
+         *  filler: Aizuchi that occurs during a conversation or sounds inserted as filler.
+         *  e.g. ãã®, ããã¨, ãã¨
+         */
+        POS_TAG_SET.addTag(new PosTag("ãã£ã©ã¼"));
+         /**
+         * * * * *
+         *  non-verbal: non-verbal sound.
+         */
+        POS_TAG_SET.addTag(new PosTag("éè¨èªé³"));
+         /**
+         *  fragment:
+         */
+        POS_TAG_SET.addTag(new PosTag("èªæç"));
+         /**
+         * * * * *
+         *  unknown: unknown part of speech.
+         */
+        POS_TAG_SET.addTag(new PosTag("æªç¥èª",Pos.Foreign));
+    }
+}

Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/KuromojiNlpEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/KuromojiNlpEngine.java?rev=1455131&view=auto
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/KuromojiNlpEngine.java (added)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/KuromojiNlpEngine.java Mon Mar 11 13:18:59 2013
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2012 Sebastian Schaffert
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.apache.stanbol.enhancer.engines.kuromoji.impl;
+
+import static org.apache.stanbol.enhancer.engines.kuromoji.Constants.NER_TAG_SET;
+import static org.apache.stanbol.enhancer.engines.kuromoji.Constants.POS_TAG_SET;
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.MORPHO_ANNOTATION;
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.io.input.CharSequenceReader;
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.ConfigurationPolicy;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.ReferenceCardinality;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ja.JapaneseBaseFormFilterFactory;
+import org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilterFactory;
+import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilterFactory;
+import org.apache.lucene.analysis.ja.JapaneseTokenizerFactory;
+import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
+import org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute;
+import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.Version;
+import org.apache.sling.installer.core.impl.OsgiInstallerImpl;
+import org.apache.stanbol.commons.solr.utils.StanbolResourceLoader;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
+import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Sentence;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
+import org.osgi.framework.Constants;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Sentence detection, tokenization, POS tagging and named entity detection
+ * for Japanese text based on the Solr/Lucene Kuromoji analysers.
+ * 
+ * @author Rupert Westenthaler
+ */
+
+@Component(immediate = true, metatype = true, 
+    policy = ConfigurationPolicy.OPTIONAL) //create a default instance with the default configuration
+@Service
+@Properties(value={
+        @Property(name= EnhancementEngine.PROPERTY_NAME,value="kuromoji-token"),
+        @Property(name=Constants.SERVICE_RANKING,intValue=0) //give the default instance a ranking < 0
+})
+public class KuromojiNlpEngine extends AbstractEnhancementEngine<IOException,RuntimeException> implements ServiceProperties {
+
+    /** Lucene compatibility version handed to every analysis factory of this engine */
+    private static final Version LUCENE_VERSION = Version.LUCENE_41;
+    /** Mode configured for the Kuromoji tokenizer (alternatives: normal, extended) */
+    private static final String TOKENIZER_MODE = "search";
+    /** Read-only properties returned by {@link #getServiceProperties()} */
+    private static final Map<String,Object> SERVICE_PROPERTIES;
+    /** Init parameters of the {@link JapaneseTokenizerFactory} */
+    private static final Map<String,String> TOKENIZER_FACTORY_CONFIG = new HashMap<String,String>();
+    /** Init parameters of the {@link JapaneseBaseFormFilterFactory} */
+    private static final Map<String, String> BASE_FORM_FILTER_CONFIG = new HashMap<String,String>();
+    /** Init parameters of the {@link JapanesePartOfSpeechStopFilterFactory} */
+    private static final Map<String, String> POS_FILTER_CONFIG = new HashMap<String,String>();
+    /** Init parameters of the {@link JapaneseKatakanaStemFilterFactory} */
+    private static final Map<String, String> STEMM_FILTER_CONFIG = new HashMap<String,String>();
+    static {
+        final String matchVersion = LUCENE_VERSION.toString();
+
+        //tokenizer: punctuation is NOT discarded as we want tokens for it
+        TOKENIZER_FACTORY_CONFIG.put("luceneMatchVersion", matchVersion);
+        TOKENIZER_FACTORY_CONFIG.put("mode",TOKENIZER_MODE);
+        TOKENIZER_FACTORY_CONFIG.put("discardPunctuation", "false");
+
+        //base form filter
+        BASE_FORM_FILTER_CONFIG.put("luceneMatchVersion", matchVersion);
+
+        //part-of-speech stop filter: the tags are read from 'nostoptags.txt'
+        POS_FILTER_CONFIG.put("luceneMatchVersion", matchVersion);
+        POS_FILTER_CONFIG.put("tags", "nostoptags.txt");
+        POS_FILTER_CONFIG.put("enablePositionIncrements","true");
+
+        //katakana stemming
+        STEMM_FILTER_CONFIG.put("luceneMatchVersion", matchVersion);
+        STEMM_FILTER_CONFIG.put("minimumLength","4");
+
+        //service properties: ordering and NLP role of this engine
+        Map<String,Object> serviceProps = new HashMap<String,Object>();
+        serviceProps.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING,
+            ServiceProperties.ORDERING_NLP_TOKENIZING);
+        serviceProps.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE,
+            NlpProcessingRole.Tokenizing);
+        SERVICE_PROPERTIES = Collections.unmodifiableMap(serviceProps);
+    }
+
+
+    private static Logger log = LoggerFactory.getLogger(KuromojiNlpEngine.class);
+    
+    @Reference(cardinality=ReferenceCardinality.OPTIONAL_UNARY)
+    protected ResourceLoader parentResourceLoader;
+
+    protected ResourceLoader resourceLoader;
+
+    //private MappingCharFilterFactory charFilterFactory;
+    private TokenizerFactory tokenizerFactory;
+    
+    private List<TokenFilterFactory> filterFactories = new ArrayList<TokenFilterFactory>();
+    
+    @Reference
+    protected AnalysedTextFactory analysedTextFactory;
+    
+    protected LiteralFactory lf = LiteralFactory.getInstance();
+    /**
+
+     * holds {@link PosTag}s that are not contained in the 
+     * {@link org.apache.stanbol.enhancer.engines.kuromoji.Constants#POS_TAG_SET}
+     */
+    private Map<String,PosTag> adhocTags = new HashMap<String,PosTag>();
+    
+    /**
+     * Indicate if this engine can enhance supplied ContentItem, and if it
+     * suggests enhancing it synchronously or asynchronously. The
+     * {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager} can force sync/async mode if desired, it is
+     * just a suggestion from the engine.
+     * <p/>
+     * Returns ENHANCE_ASYNC in case there is a text/plain content part and a tagger for the language identified for
+     * the content item, CANNOT_ENHANCE otherwise.
+     *
+     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
+     *          if the introspecting process of the content item
+     *          fails
+     */
+    @Override
+    public int canEnhance(ContentItem ci) throws EngineException {
+        // check if content is present
+        Map.Entry<UriRef,Blob> entry = NlpEngineHelper.getPlainText(this, ci, false);
+        if(entry == null || entry.getValue() == null) {
+            return CANNOT_ENHANCE;
+        }
+
+        String language = getLanguage(this,ci,false);
+        if("ja".equals(language) || (language != null && language.startsWith("ja-"))) {
+            log.trace(" > can enhance ContentItem {} with language {}",ci,language);
+            return ENHANCE_ASYNC;
+        } else {
+            return CANNOT_ENHANCE;
+        }
+    }
+
+    /**
+     * Analyses the text of the parsed ContentItem with the Kuromoji analyzer
+     * chain. The results are stored in the {@link AnalysedText} content part
+     * and in the metadata of the ContentItem.
+     * <p/>
+     * For every token emitted by the chain a {@link Token} with a
+     * {@link NlpAnnotations#POS_ANNOTATION} is added. If a base form is
+     * available a {@link NlpAnnotations#MORPHO_ANNOTATION} is added as well.
+     * Tokens tagged with {@link Pos#Point} close the current {@link Sentence}.
+     * Consecutive tokens that are mapped to the same named entity type by the
+     * <code>NER_TAG_SET</code> are merged to a {@link Chunk} with a
+     * {@link NlpAnnotations#NER_ANNOTATION}. Finally a text enhancement is
+     * written to the metadata for every detected named entity. The write lock
+     * of the ContentItem is only held while those triples are added.
+     *
+     * @param ci the ContentItem to process
+     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
+     *          if reading the tokens from the analyzer chain fails
+     * @throws IllegalStateException if the language of the ContentItem is
+     *          neither <code>ja</code> nor starts with <code>ja-</code>
+     */
+    @Override
+    public void computeEnhancements(ContentItem ci) throws EngineException {
+        final AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
+
+        String language = getLanguage(this,ci,false);
+        if(!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
+            throw new IllegalStateException("The detected language is NOT 'ja'! "
+                + "As this is also checked within the #canEnhance(..) method this "
+                + "indicates an Bug in the used EnhancementJobManager implementation. "
+                + "Please report this on the dev@stanbol.apache.org or create an "
+                + "JIRA issue about this.");
+        }
+        //start with the Tokenizer
+        TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
+        //build the analyzing chain by adding all TokenFilters
+        for(TokenFilterFactory filterFactory : filterFactories){
+            tokenStream = filterFactory.create(tokenStream);
+        }
+
+        //Try to extract sentences based on POS tags ...
+        int sentStartOffset = -1;
+        //NER data
+        List<NerData> nerList = new ArrayList<NerData>();
+        int nerSentIndex = 0; //the next index where the NerData.context need to be set
+        NerData ner = null;
+        try {
+            //The attribute instances are shared by all tokens of the stream.
+            //So it is sufficient to look them up once.
+            final OffsetAttribute offset = tokenStream.addAttribute(OffsetAttribute.class);
+            final PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
+            final BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
+            tokenStream.reset(); //required with Solr 4
+            while (tokenStream.incrementToken()){
+                Token token = at.addToken(offset.startOffset(), offset.endOffset());
+                //Get the POS attribute and init the PosTag
+                final String pos = posAttr.getPartOfSpeech();
+                PosTag posTag = POS_TAG_SET.getTag(pos);
+                if(posTag == null){
+                    posTag = adhocTags.get(pos);
+                    if(posTag == null){
+                        posTag = new PosTag(pos);
+                        adhocTags.put(pos, posTag);
+                        log.warn(" ... missing PosTag mapping for {}",pos);
+                    }
+                }
+                //Sentence detection by POS tag
+                if(sentStartOffset < 0){ //the last token was a sentence ending
+                    sentStartOffset = offset.startOffset();
+                }
+                if(posTag.hasPos(Pos.Point)) {
+                    //NOTE(review): the sentence ends at the START offset of the
+                    //sentence-final token, so the punctuation itself is not part
+                    //of the sentence span - confirm that this is intended
+                    Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
+                    //add the sentence as context to the NerData instances
+                    while(nerSentIndex < nerList.size()){
+                        nerList.get(nerSentIndex).context = sent.getSpan();
+                        nerSentIndex++;
+                    }
+                    sentStartOffset = -1;
+                }
+                //POS
+                token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
+                //NER
+                NerTag nerTag = NER_TAG_SET.getTag(pos);
+                if(ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))){
+                    //write NER annotation
+                    Chunk chunk = at.addChunk(ner.start, ner.end);
+                    chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
+                    //NOTE that the fise:TextAnnotation are written later based on the nerList
+                    //clean up
+                    ner = null;
+                }
+                if(nerTag != null){
+                    if(ner == null){
+                        ner = new NerData(nerTag, offset.startOffset());
+                        nerList.add(ner);
+                    }
+                    ner.end = offset.endOffset();
+                }
+                //Morpho: only added if the analyzer provides a base form
+                String baseForm = baseFormAttr.getBaseForm();
+                if(baseForm != null){
+                    MorphoFeatures morpho = new MorphoFeatures(baseForm);
+                    morpho.addPos(posTag); //and add the posTag
+                    token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
+                }
+            }
+            //The text may end within a named entity. In this case the NER
+            //annotation of the pending entity was not yet written by the loop.
+            if(ner != null){
+                Chunk chunk = at.addChunk(ner.start, ner.end);
+                chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
+            }
+            //we still need to write the last sentence
+            //(sentStartOffset is < 0 if no token was processed)
+            Sentence lastSent = null;
+            if(sentStartOffset >= 0 && offset.endOffset() > sentStartOffset){
+                lastSent = at.addSentence(sentStartOffset, offset.endOffset());
+            }
+            //and set the context off remaining named entities
+            while(nerSentIndex < nerList.size()){
+                if(lastSent != null){
+                    nerList.get(nerSentIndex).context = lastSent.getSpan();
+                } else { //no sentence detected
+                    nerList.get(nerSentIndex).context = at.getSpan();
+                }
+                nerSentIndex++;
+            }
+        } catch (IOException e) {
+            throw new EngineException(this, ci, "Exception while reading from "
+                + "AnalyzedText contentpart",e);
+        } finally {
+            try {
+                tokenStream.close();
+            } catch (IOException e) {/* ignore */}
+        }
+        //finally write the NER annotations to the metadata of the ContentItem
+        final MGraph metadata = ci.getMetadata();
+        ci.getLock().writeLock().lock();
+        try {
+            Language lang = new Language("ja");
+            for(NerData nerData : nerList){
+                UriRef ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
+                metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(
+                    at.getSpan().substring(nerData.start, nerData.end),lang)));
+                metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
+                metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
+                metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
+                metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, 
+                    new PlainLiteralImpl(nerData.context, lang)));
+            }
+        } finally{
+            ci.getLock().writeLock().unlock();
+        }
+    }
+
+    /**
+     * Returns the unmodifiable service properties of this engine: the
+     * ordering and the NLP processing role set by the static initializer.
+     */
+    @Override
+    public Map<String,Object> getServiceProperties() {
+        return SERVICE_PROPERTIES;
+    }
+    /**
+     * Activates the engine and initialises the Kuromoji analyzer chain: the
+     * {@link JapaneseTokenizerFactory} followed by the base form, the
+     * part-of-speech stop and the katakana stem filter factories. The
+     * tokenizer and the part-of-speech stop filter are informed about the
+     * {@link StanbolResourceLoader} created for the (optional) parent loader.
+     * <p/>
+     * The chain is built with local variables and only assigned to the
+     * fields after all components were initialised. This way a failed
+     * activation does not leave a partly initialised chain behind and a
+     * repeated activation does not add the filters a second time.
+     *
+     * @param ce the {@link org.osgi.service.component.ComponentContext}
+     * @throws ConfigurationException declared by this method; presumably
+     *          raised by the superclass for an invalid configuration - TODO confirm
+     * @throws IOException if informing a factory about the resource loader fails
+     */
+    @Activate
+    protected void activate(ComponentContext ce) throws ConfigurationException, IOException {
+        log.info("activating kuromoji NLP engine");
+        super.activate(ce);
+        //init the Solr ResourceLoader used for initialising the components
+        resourceLoader = new StanbolResourceLoader(parentResourceLoader);
+        TokenizerFactory tokenizer = new JapaneseTokenizerFactory();
+        tokenizer.init(TOKENIZER_FACTORY_CONFIG);
+        tokenizer.setLuceneMatchVersion(LUCENE_VERSION);
+        ((ResourceLoaderAware) tokenizer).inform(resourceLoader);
+
+        List<TokenFilterFactory> filters = new ArrayList<TokenFilterFactory>();
+        //base form filter
+        TokenFilterFactory baseFormFilterFactory =  new JapaneseBaseFormFilterFactory();
+        baseFormFilterFactory.init(BASE_FORM_FILTER_CONFIG);
+        baseFormFilterFactory.setLuceneMatchVersion(LUCENE_VERSION);
+        filters.add(baseFormFilterFactory);
+        //POS filter
+        TokenFilterFactory posFilterFactory = new JapanesePartOfSpeechStopFilterFactory();
+        posFilterFactory.init(POS_FILTER_CONFIG);
+        posFilterFactory.setLuceneMatchVersion(LUCENE_VERSION);
+        ((ResourceLoaderAware) posFilterFactory).inform(resourceLoader);
+        filters.add(posFilterFactory);
+        //Stemming
+        TokenFilterFactory stemmFilterFactory = new JapaneseKatakanaStemFilterFactory();
+        stemmFilterFactory.init(STEMM_FILTER_CONFIG);
+        stemmFilterFactory.setLuceneMatchVersion(LUCENE_VERSION);
+        filters.add(stemmFilterFactory);
+
+        //publish the chain only after all components are initialised
+        tokenizerFactory = tokenizer;
+        filterFactories = filters;
+    }
+    
+    /**
+     * Releases the tokenizer and token filter factories of the analyzer
+     * chain and then deactivates the superclass.
+     *
+     * @param context the {@link org.osgi.service.component.ComponentContext}
+     */
+    @Deactivate
+    protected void deactivate(ComponentContext context) {
+        //drop the filters first, then the tokenizer they were applied to
+        filterFactories.clear();
+        filterFactories = null;
+        tokenizerFactory = null;
+        super.deactivate(context);
+    }
+
+    /**
+     * Internal helper {@link Tokenizer} that emits one token for each
+     * {@link Sentence} already present in the parsed {@link AnalysedText}.
+     * The term of a token is the span of the sentence, the offsets are the
+     * start and end of the sentence and the type is <code>"sentence"</code>.
+     * @author Rupert Westenthaler
+     *
+     */
+    protected final class AnalyzedTextSentenceTokenizer extends Tokenizer {
+        private final AnalysedText at;
+        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+        private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+        private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+        /** iterator over the sentences that still need to be emitted */
+        private Iterator<Sentence> sentences;
+        /** the sentence emitted by the last call to {@link #incrementToken()} */
+        private Sentence sentence = null;
+
+        /**
+         * @param at the analysed text providing the text and the sentences
+         */
+        protected AnalyzedTextSentenceTokenizer(AnalysedText at) {
+            super(new StringReader(at.getText().toString()));
+            this.at = at;
+            sentences = at.getSentences();
+        }
+
+        /**
+         * Advances to the next sentence.
+         * @return <code>true</code> if a sentence token was emitted or
+         * <code>false</code> if there are no more sentences
+         */
+        @Override
+        public boolean incrementToken() throws IOException {
+            if(!sentences.hasNext()){
+                return false;
+            }
+            //A Tokenizer needs to clear all attributes before it populates
+            //the next token. Otherwise attributes added by the consumers of
+            //this stream would keep the state of the previous token.
+            clearAttributes();
+            sentence = sentences.next();
+            termAtt.setEmpty().append(sentence.getSpan());
+            offsetAtt.setOffset(sentence.getStart(),sentence.getEnd());
+            typeAtt.setType("sentence");
+            return true;
+        }
+
+        @Override
+        public void end() throws IOException {
+          // set final offset
+          offsetAtt.setOffset(at.getEnd(), at.getEnd());
+        }
+        /**
+         * Restarts the iteration over the sentences of the analysed text and
+         * resets the attributes of this tokenizer.
+         */
+        @Override
+        public void reset() throws IOException {
+            super.reset();
+            sentences = at.getSentences();
+            termAtt.setEmpty();
+            offsetAtt.setOffset(0, 0);
+            typeAtt.setType(null);
+        }
+    }
+}

Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/NerData.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/NerData.java?rev=1455131&view=auto
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/NerData.java (added)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/NerData.java Mon Mar 11 13:18:59 2013
@@ -0,0 +1,25 @@
+package org.apache.stanbol.enhancer.engines.kuromoji.impl;
+
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+
+/**
+ * Mutable holder for a named entity detected while processing a text.
+ * Instances are used as intermediate representation of NER annotations so
+ * that the write lock on the {@link ContentItem} does not need to be
+ * obtained for each detected entity.
+ * @author Rupert Westenthaler
+ *
+ */
+class NerData {
+
+    /** the tag of the named entity */
+    protected final NerTag tag;
+    /** the start offset of the named entity */
+    protected final int start;
+    /** the end offset of the named entity; not set by the constructor */
+    protected int end;
+    /** the text used as context of the named entity; not set by the constructor */
+    protected String context;
+
+    /**
+     * @param ner the tag of the named entity
+     * @param start the start offset of the named entity
+     */
+    protected NerData(NerTag ner, int start){
+        this.start = start;
+        this.tag = ner;
+    }
+
+}
\ No newline at end of file

Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1455131&view=auto
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/resources/OSGI-INF/metatype/metatype.properties (added)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/resources/OSGI-INF/metatype/metatype.properties Mon Mar 11 13:18:59 2013
@@ -0,0 +1,31 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+#
+
+# name and description of the component: the keys need to use the fully
+# qualified name of the component class
+org.apache.stanbol.enhancer.engines.kuromoji.impl.KuromojiNlpEngine.name=Apache \
+Stanbol Enhancer Engine: Kuromoji NLP
+org.apache.stanbol.enhancer.engines.kuromoji.impl.KuromojiNlpEngine.description=Enhancement \
+Engine that detects sentences, tokenizes, POS tags and detects named entities in \
+Japanese text by using the Solr/Lucene Kuromoji analyzers.
+
+
+stanbol.enhancer.engine.name.name=Name
+stanbol.enhancer.engine.name.description=The name of the enhancement engine as \
+used in the RESTful interface '/engine/<name>'
+service.ranking.name=Ranking
+service.ranking.description=If two enhancement engines with the same name are active the \
+one with the higher ranking will be used to process parsed content items.

Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/resources/nostoptags.txt
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/resources/nostoptags.txt?rev=1455131&view=auto
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/resources/nostoptags.txt (added)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/resources/nostoptags.txt Mon Mar 11 13:18:59 2013
@@ -0,0 +1 @@
+# this file is loaded by the POS Filter Factory
\ No newline at end of file

Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/ClasspathDataFileProvider.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/ClasspathDataFileProvider.java?rev=1455131&view=auto
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/ClasspathDataFileProvider.java (added)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/ClasspathDataFileProvider.java Mon Mar 11 13:18:59 2013
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.kuromoji.impl;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Map;
+
+import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** {@link DataFileProvider} for tests that serves data files from classpath resources. */
+public class ClasspathDataFileProvider implements DataFileProvider {
+
+    private final Logger log = LoggerFactory.getLogger(getClass());
+    /*
+     * NOTE: This path needs to be the same as path configured for the 
+     *   'org.apache.stanbol:org.apache.stanbol.commons.solr.extras.gosen'
+     *   bundle
+     */
+    public static final String RESOURCE_BASE_PATH = "datafiles/";
+
+    private final String symbolicName;
+
+    ClasspathDataFileProvider(String bundleSymbolicName) {
+        symbolicName = bundleSymbolicName;
+    }
+
+    @Override
+    public InputStream getInputStream(String bundleSymbolicName,
+            String filename, Map<String, String> comments) 
+    throws IOException {
+        final URL dataFile = getDataFile(bundleSymbolicName, filename);
+
+        // Returning null is fine - if we don't have the data file, another
+        // provider might supply it
+        return dataFile != null ? dataFile.openStream() : null;
+    }
+    @Override
+    public boolean isAvailable(String bundleSymbolicName, String filename, Map<String,String> comments) {
+        return getDataFile(bundleSymbolicName, filename) != null;
+    }
+    /**
+     * Looks up a data file on the classpath below {@link #RESOURCE_BASE_PATH}.
+     * @param bundleSymbolicName the requested bundle, or <code>null</code> to skip the bundle check
+     * @param filename the name of the data file, appended to the base path
+     * @return the URL of the resource, or <code>null</code> if the request is ignored or nothing is found
+     */
+    private URL getDataFile(String bundleSymbolicName, String filename) {
+        //A request naming a bundle is only served if it names the bundle this provider was
+        //created for; equals(..) is called on the non-null argument so a null symbolicName cannot NPE
+        if(bundleSymbolicName != null && !bundleSymbolicName.equals(symbolicName)) {
+            log.debug("Requested bundleSymbolicName {} does not match mine ({}), request ignored",
+                    bundleSymbolicName, symbolicName);
+            return null;
+        }
+
+        // resolve the file against the class loader of this class (null if there is no such resource)
+        final String resourcePath = RESOURCE_BASE_PATH + filename;
+        final URL dataFile = getClass().getClassLoader().getResource(resourcePath);
+        return dataFile;
+    }
+}

Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/MockComponentContext.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/MockComponentContext.java?rev=1455131&view=auto
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/MockComponentContext.java (added)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/MockComponentContext.java Mon Mar 11 13:18:59 2013
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.kuromoji.impl;
+
+import java.io.File;
+import java.io.InputStream;
+import java.util.Dictionary;
+import java.util.Hashtable;
+
+import org.osgi.framework.Bundle;
+import org.osgi.framework.BundleContext;
+import org.osgi.framework.BundleException;
+import org.osgi.framework.BundleListener;
+import org.osgi.framework.Filter;
+import org.osgi.framework.FrameworkListener;
+import org.osgi.framework.InvalidSyntaxException;
+import org.osgi.framework.ServiceListener;
+import org.osgi.framework.ServiceReference;
+import org.osgi.framework.ServiceRegistration;
+import org.osgi.service.component.ComponentContext;
+import org.osgi.service.component.ComponentInstance;
+
+/** {@link ComponentContext} stub: serves the properties and a System-property backed {@link BundleContext}; all else is unsupported. */
+public class MockComponentContext implements ComponentContext {
+
+    /** Message of the exception thrown by every operation this mock does not implement. */
+    private static final String NOT_MOCKED = "Mock implementation";
+    protected final Dictionary<String, Object> properties;
+    protected final BundleContext bundleContext = new MockBundleContext();
+
+    public MockComponentContext(Dictionary<String, Object> properties) {
+        this.properties = properties;
+    }
+
+    public MockComponentContext() {
+        this(new Hashtable<String, Object>());
+    }
+
+    public Dictionary<String, Object> getProperties() {
+        return properties;
+    }
+
+    public BundleContext getBundleContext() {
+        return bundleContext;
+    }
+
+    public ComponentInstance getComponentInstance() {
+        throw new UnsupportedOperationException(NOT_MOCKED);
+    }
+
+    public ServiceReference getServiceReference() {
+        throw new UnsupportedOperationException(NOT_MOCKED);
+    }
+
+    public Bundle getUsingBundle() {
+        throw new UnsupportedOperationException(NOT_MOCKED);
+    }
+
+    public void enableComponent(String name) {
+        throw new UnsupportedOperationException(NOT_MOCKED);
+    }
+
+    public void disableComponent(String name) {
+        throw new UnsupportedOperationException(NOT_MOCKED);
+    }
+
+    public Object locateService(String name) {
+        throw new UnsupportedOperationException(NOT_MOCKED);
+    }
+
+    public Object locateService(String name, ServiceReference reference) {
+        throw new UnsupportedOperationException(NOT_MOCKED);
+    }
+
+    public Object[] locateServices(String name) {
+        throw new UnsupportedOperationException(NOT_MOCKED);
+    }
+
+    private static final class MockBundleContext implements BundleContext {
+        /** Used by the Engine to read System properties */
+        @Override
+        public String getProperty(String key) {
+            return System.getProperty(key);
+        }
+
+        @Override
+        public Bundle getBundle() {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public Bundle installBundle(String location) throws BundleException {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public Bundle installBundle(String location, InputStream input) throws BundleException {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public Bundle getBundle(long id) {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public Bundle[] getBundles() {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public void addServiceListener(ServiceListener listener, String filter) throws InvalidSyntaxException {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public void addServiceListener(ServiceListener listener) {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public void removeServiceListener(ServiceListener listener) {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public void addBundleListener(BundleListener listener) {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public void removeBundleListener(BundleListener listener) {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public void addFrameworkListener(FrameworkListener listener) {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public void removeFrameworkListener(FrameworkListener listener) {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public ServiceRegistration registerService(String[] clazzes, Object service, Dictionary properties) {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public ServiceRegistration registerService(String clazz, Object service, Dictionary properties) {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public ServiceReference[] getServiceReferences(String clazz, String filter) throws InvalidSyntaxException {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public ServiceReference[] getAllServiceReferences(String clazz, String filter) throws InvalidSyntaxException {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public ServiceReference getServiceReference(String clazz) {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public Object getService(ServiceReference reference) {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public boolean ungetService(ServiceReference reference) {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public File getDataFile(String filename) {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+
+        @Override
+        public Filter createFilter(String filter) throws InvalidSyntaxException {
+            throw new UnsupportedOperationException(NOT_MOCKED);
+        }
+    }
+}