You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/03/11 14:19:02 UTC
svn commit: r1455131 [2/7] - in /stanbol/branches/stanbol-solr4: commons/
commons/frameworkfragment/ commons/solr/core/
commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/
commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/uti...
Modified: stanbol/branches/stanbol-solr4/enhancement-engines/entitylinking/labeltokenizer-smartcn/src/main/resources/config/org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.lucene.LuceneLabelTokenizer-smartcn.config
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/entitylinking/labeltokenizer-smartcn/src/main/resources/config/org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.lucene.LuceneLabelTokenizer-smartcn.config?rev=1455131&r1=1455130&r2=1455131&view=diff
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/entitylinking/labeltokenizer-smartcn/src/main/resources/config/org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.lucene.LuceneLabelTokenizer-smartcn.config (original)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/entitylinking/labeltokenizer-smartcn/src/main/resources/config/org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.lucene.LuceneLabelTokenizer-smartcn.config Mon Mar 11 13:18:59 2013
@@ -1,5 +1,5 @@
enhancer.engines.entitylinking.labeltokenizer.languages=["zh"]
enhancer.engine.linking.labeltokenizer.lucene.charFilterFactory=""
-enhancer.engine.linking.labeltokenizer.lucene.tokenizerFactory="org.apache.solr.analysis.SmartChineseSentenceTokenizerFactory"
-enhancer.engine.linking.labeltokenizer.lucene.tokenFilterFactory=["org.apache.solr.analysis.SmartChineseWordTokenFilterFactory"]
+enhancer.engine.linking.labeltokenizer.lucene.tokenizerFactory="org.apache.lucene.analysis.cn.smart.SmartChineseSentenceTokenizerFactory"
+enhancer.engine.linking.labeltokenizer.lucene.tokenFilterFactory=["org.apache.lucene.analysis.cn.smart.SmartChineseWordTokenFilterFactory"]
service.ranking=I"100"
Modified: stanbol/branches/stanbol-solr4/enhancement-engines/entitylinking/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/entitylinking/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/entitylinking/pom.xml (original)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/entitylinking/pom.xml Mon Mar 11 13:18:59 2013
@@ -56,6 +56,8 @@
<!-- Chinese Label Tokenizer -->
<module>labeltokenizer-smartcn</module> <!-- config for the lucene label tokenizer -->
<module>labeltokenizer-paoding</module> <!-- implementation based on paoding -->
+ <!-- Japanese -->
+ <module>labeltokenizer-kuromoji</module> <!-- configuration based on kuromoji-->
</modules>
<profiles>
Modified: stanbol/branches/stanbol-solr4/enhancement-engines/entitytagging/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/entitytagging/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/entitytagging/pom.xml (original)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/entitytagging/pom.xml Mon Mar 11 13:18:59 2013
@@ -151,7 +151,7 @@
<dependency>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.entityhub.yard.solr</artifactId>
- <version>0.11.0</version>
+ <version>0.12.0-SNAPSHOT</version>
<scope>test</scope>
</dependency>
<dependency>
Propchange: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Mar 11 13:18:59 2013
@@ -0,0 +1,7 @@
+.settings
+
+.classpath
+
+.project
+
+target
Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/pom.xml?rev=1455131&view=auto
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/pom.xml (added)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/pom.xml Mon Mar 11 13:18:59 2013
@@ -0,0 +1,127 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ You under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>apache-stanbol-enhancement-engines</artifactId>
+ <version>0.10.1-SNAPSHOT</version>
+ <relativePath>..</relativePath>
+ </parent>
+
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.kuromoji.nlp</artifactId>
+ <version>0.10.1-SNAPSHOT</version>
+ <packaging>bundle</packaging>
+
+ <name>Apache Stanbol Enhancement Engine : Kuromoji NLP </name>
+ <description>NLP processing based on the Lucene Kuromoji module</description>
+
+ <inceptionYear>2013</inceptionYear>
+
+ <scm>
+ <connection>
+ scm:svn:http://svn.apache.org/repos/asf/stanbol/trunk/enhancement-engines/kuromoji-nlp
+ </connection>
+ <developerConnection>
+ scm:svn:https://svn.apache.org/repos/asf/stanbol/trunk/enhancement-engines/kuromoji-nlp
+ </developerConnection>
+ <url>http://stanbol.apache.org/</url>
+ </scm>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Import-Package>
+ org.apache.stanbol.enhancer.servicesapi; provide:=true; version="[0.10,0.12)",
+ org.apache.stanbol.enhancer.servicesapi.impl; provide:=true; version="[0.10,0.12)",
+ *
+ </Import-Package>
+ <Private-Package>
+ org.apache.stanbol.enhancer.engines.kuromoji.impl
+ </Private-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.rat</groupId>
+ <artifactId>apache-rat-plugin</artifactId>
+ <configuration>
+ <excludes>
+ <!-- AL20 License -->
+ <exclude>src/license/THIRD-PARTY.properties</exclude>
+ </excludes>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-scr-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
+ <version>0.10.0</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.commons.solr.core</artifactId>
+ <version>0.12.0-SNAPSHOT</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers-kuromoji</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.nlp</artifactId>
+ <version>0.10.0</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.scr.annotations</artifactId>
+ </dependency>
+ <!-- for tests -->
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.test</artifactId>
+ <version>0.11.0-SNAPSHOT</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.core</artifactId>
+ <version>0.11.0-SNAPSHOT</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-simple</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+</project>
Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/Constants.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/Constants.java?rev=1455131&view=auto
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/Constants.java (added)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/Constants.java Mon Mar 11 13:18:59 2013
@@ -0,0 +1,526 @@
+package org.apache.stanbol.enhancer.engines.kuromoji;
+
+import org.apache.lucene.analysis.ja.util.ToStringUtil;
+import org.apache.stanbol.enhancer.nlp.model.tag.TagSet;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
+
+/**
+ * Defines mappings of the String tags used by Kuromoji to the vocabulary used
+ * by the Stanbol NLP processing module
+ * @author Rupert Westenthaler
+ */
+public class Constants {
+
+
+ /**
+ * Set of part-of-speech tags as defined in the {@link ToStringUtil} class.
+ * Descriptions are taken from the
+ * <a href="http://lucene-gosen.googlecode.com/svn/trunk/example/stoptags_ja.txt">
+ * Gosen Pos Tag Documentation</a>, as the tag set used by Kuromoji
+ * exactly matches the one used by Gosen.
+ */
+ public static final TagSet<PosTag> POS_TAG_SET = new TagSet<PosTag>("Kuromoji Japanese", "ja");
+ /**
+ * NerTags for those Kuromoji part-of-speech tags that denote named entities (persons, organisations and places)
+ */
+ public static final TagSet<NerTag> NER_TAG_SET = new TagSet<NerTag>("Kuromoji Japanese", "ja");
+
+ static {
+ /**
+ * noun: unclassified nouns
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©",LexicalCategory.Noun));
+ /**
+ * noun-common: Common nouns or nouns where the sub-classification is undefined
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-ä¸è¬",Pos.CommonNoun));
+ /**
+ * noun-proper: Proper nouns where the sub-classification is undefined
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©",Pos.ProperNoun));
+ /**
+ * noun-proper-misc: miscellaneous proper nouns
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-ä¸è¬",Pos.ProperNoun));
+ /**
+ * noun-proper-person: Personal names where the sub-classification is undefined
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-人å",Pos.ProperNoun));
+ NER_TAG_SET.addTag(new NerTag("åè©-åºæåè©-人å",OntologicalClasses.DBPEDIA_PERSON));
+ /**
+ * noun-proper-person-misc: names that cannot be divided into surname and
+ * given name; foreign names; names where the surname or given name is unknown.
+ * e.g. ãå¸ã®æ¹
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-人å-ä¸è¬",Pos.ProperNoun));
+ NER_TAG_SET.addTag(new NerTag("åè©-åºæåè©-人å-ä¸è¬",OntologicalClasses.DBPEDIA_PERSON));
+ /**
+ * noun-proper-person-surname: Mainly Japanese surnames.
+ * e.g. 山田
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-人å-å§",Pos.ProperNoun));
+ NER_TAG_SET.addTag(new NerTag("åè©-åºæåè©-人å-å§",OntologicalClasses.DBPEDIA_PERSON));
+ /**
+ * noun-proper-person-given_name: Mainly Japanese given names.
+ * e.g. 太郎
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-人å-å",Pos.ProperNoun));
+ NER_TAG_SET.addTag(new NerTag("åè©-åºæåè©-人å-å",OntologicalClasses.DBPEDIA_PERSON));
+ /**
+ * noun-proper-organization: Names representing organizations.
+ * e.g. 通産省, NHK
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-çµç¹",Pos.ProperNoun));
+ NER_TAG_SET.addTag(new NerTag("åè©-åºæåè©-çµç¹",OntologicalClasses.DBPEDIA_ORGANISATION));
+ /**
+ * noun-proper-place: Place names where the sub-classification is undefined
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-å°å",Pos.ProperNoun));
+ NER_TAG_SET.addTag(new NerTag("åè©-åºæåè©-å°å",OntologicalClasses.DBPEDIA_PLACE));
+ /**
+ * noun-proper-place-misc: Place names excluding countries.
+ * e.g. アジア, バルセロナ, 京都
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-å°å-ä¸è¬",Pos.ProperNoun));
+ NER_TAG_SET.addTag(new NerTag("åè©-åºæåè©-å°å-ä¸è¬",OntologicalClasses.DBPEDIA_PLACE));
+ /**
+ * noun-proper-place-country: Country names.
+ * e.g. 日本, オーストラリア
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-åºæåè©-å°å-å½",Pos.ProperNoun));
+ NER_TAG_SET.addTag(new NerTag("åè©-åºæåè©-å°å-å½",OntologicalClasses.DBPEDIA_PLACE));
+ /**
+ * noun-pronoun: Pronouns where the sub-classification is undefined
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-代åè©",Pos.Pronoun));
+ /**
+ * noun-pronoun-misc: miscellaneous pronouns:
+ * e.g. ãã, ãã, ããã¤, ããªã, ãã¡ãã¡, ããã¤, ã©ãã, ãªã«, ã¿ãªãã, ã¿ããª, ãããã, ãããã
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-代åè©-ä¸è¬",Pos.Pronoun));
+ /**
+ * noun-pronoun-contraction: Spoken language contraction made by combining a
+ * pronoun and the particle 'wa'.
+ * e.g. ããã, ããã, ãããã, ããã, ãããã
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-代åè©-縮ç´",Pos.Pronoun,Pos.Participle));
+ /**
+ * noun-adverbial: Temporal nouns such as names of days or months that behave
+ * like adverbs. Nouns that represent amount or ratios and can be used adverbially,
+ * e.g. éæ, ä¸æ, åå¾, å°é
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-å¯è©å¯è½",LexicalCategory.Adverb,Pos.CommonNoun));
+ /**
+ * noun-verbal: Nouns that take arguments with case and can appear followed by
+ * 'suru' and related verbs (ãã, ã§ãã, ãªãã, ãã ãã)
+ * e.g. ã¤ã³ããã, æç, æªå, æªæ¦è¦é, ä¸å®å¿, ä¸åã
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-ãµå¤æ¥ç¶",Pos.VerbalNoun));
+ /**
+ * noun-adjective-base: The base form of adjectives, words that appear before 㪠("na")
+ * e.g. å¥åº·, å®æ, é§ç®, ã ã
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-形容åè©èªå¹¹",LexicalCategory.Adjective,Pos.CommonNoun));
+ /**
+ * noun-numeric: Arabic numbers, Chinese numerals, and counters like ä½ (å), æ°.
+ * e.g. 0, 1, 2, ä½, æ°, å¹¾
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-æ°",Pos.CardinalNumber));
+ /**
+ * noun-affix: noun affixes where the sub-classification is undefined
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-éèªç«",LexicalCategory.Noun));
+ /**
+ * noun-affix-misc: Of adnominalizers, the case-marker ã® ("no"), and words that
+ * attach to the base form of inflectional words, words that cannot be classified
+ * into any of the other categories below. This category includes indefinite nouns.
+ * e.g. ããã¤ã, æ, ãã, ç²æ, æ°, ããã, å«ã, ãã, ç, ãã¨, äº, ãã¨, æ¯, ãã ã, 次第,
+ * é , ãã, æçº, ã¤ãã§, åºã§, ã¤ãã, ç©ãã, ç¹, ã©ãã, ã®, ã¯ã, ç, ã¯ãã¿, å¼¾ã¿,
+ * æå, ãµã, ãµã, æ¯ã, ã»ã, æ¹, æ¨, ãã®, ç©, è
, ãã, æ
, ããã, æ以, ãã, 訳,
+ * ãã, å²ã, å², ã-å£èª/, ãã-å£èª/
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-éèªç«-ä¸è¬",LexicalCategory.Noun));
+ /**
+ * noun-affix-adverbial: noun affixes that can behave as adverbs.
+ * e.g. ããã , é, ããã, æãå¥, ãã¨, å¾, ä½ã, 以å¤, 以é, 以å¾, 以ä¸, 以å, ä¸æ¹, ãã,
+ * ä¸, ãã¡, å
, ãã, æã, ããã, éã, ãã, ã£ãã, çµæ, ãã, é , ãã, é, æä¸, ããªã,
+ * æä¸, ããã, èªä½, ãã³, 度, ãã, çº, ã¤ã©, é½åº¦, ã¨ãã, éã, ã¨ã, æ, ã¨ãã, æ,
+ * ã¨ãã, é端, ãªã, ä¸, ã®ã¡, å¾, ã°ãã, å ´å, æ¥, ã¶ã, å, ã»ã, ä», ã¾ã, å, ã¾ã¾,
+ * å, ä¾, ã¿ãã, ç¢å
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-éèªç«-å¯è©å¯è½",LexicalCategory.Noun,LexicalCategory.Adverb));
+ /**
+ * noun-affix-aux: noun affixes treated as å©åè© ("auxiliary verb") in school grammars
+ * with the stem ãã(ã ) ("you(da)").
+ * e.g. ãã, ãã, æ§ (ãã)
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-éèªç«-å©åè©èªå¹¹",Pos.VerbalNoun,Pos.AuxiliaryVerb));
+ /**
+ * noun-affix-adjective-base: noun affixes that can connect to the indeclinable
+ * connection form 㪠(aux "da").
+ * e.g. ã¿ãã, ãµã
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-éèªç«-形容åè©èªå¹¹",LexicalCategory.Noun,LexicalCategory.Adjective));
+ /**
+ * noun-special: special nouns where the sub-classification is undefined.
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-ç¹æ®",LexicalCategory.Noun));
+ /**
+ * noun-special-aux: The ããã ("souda") stem form that is used for reporting news, is
+ * treated as å©åè© ("auxiliary verb") in school grammars, and attach to the base
+ * form of inflectional words.
+ * e.g. ãã
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-ç¹æ®-å©åè©èªå¹¹",LexicalCategory.Noun));
+ /**
+ * noun-suffix: noun suffixes where the sub-classification is undefined.
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾",LexicalCategory.Noun));
+ /**
+ * noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect
+ * to ã¬ã« or ã¿ã¤ and can combine into compound nouns, words that cannot be classified into
+ * any of the other categories below. In general, this category is more inclusive than
+ * æ¥å°¾èª ("suffix") and is usually the last element in a compound noun.
+ * e.g. ãã, ãã, æ¹, ç²æ (ãã), ããã, ãã¿, æ°å³, ããã¿, (ï½ãã) ã, 次第, æ¸ (ã) ã¿,
+ * ãã, (ã§ã)ã£ã, æ, 観, æ§, å¦, é¡, é¢, ç¨
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-ä¸è¬",LexicalCategory.Noun));
+ /**
+ * noun-suffix-person: Suffixes that form nouns and attach to person names more often
+ * than other nouns.
+ * e.g. å, æ§, è
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-人å",LexicalCategory.Noun));
+ NER_TAG_SET.addTag(new NerTag("åè©-æ¥å°¾-人å",OntologicalClasses.DBPEDIA_PERSON));
+ /**
+ * noun-suffix-place: Suffixes that form nouns and attach to place names more often
+ * than other nouns.
+ * e.g. çº, å¸, ç
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-å°å",LexicalCategory.Noun));
+ NER_TAG_SET.addTag(new NerTag("åè©-æ¥å°¾-å°å",OntologicalClasses.DBPEDIA_PLACE));
+ /**
+ * noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that
+ * can appear before ã¹ã« ("suru").
+ * e.g. å, è¦, åã, å
¥ã, è½ã¡, è²·ã
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-ãµå¤æ¥ç¶",Pos.VerbalNoun));
+ /**
+ * noun-suffix-aux: The stem form of ããã (æ§æ
) that is used to indicate conditions,
+ * is treated as å©åè© ("auxiliary verb") in school grammars, and attach to the
+ * conjunctive form of inflectional words.
+ * e.g. ãã
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-å©åè©èªå¹¹",Pos.VerbalNoun,Pos.AuxiliaryVerb));
+ /**
+ * noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive
+ * form of inflectional words and appear before the copula ã ("da").
+ * e.g. ç, ã, ãã¡
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-形容åè©èªå¹¹",LexicalCategory.Noun,LexicalCategory.Adjective));
+ /**
+ * noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs.
+ * e.g. å¾ (ã), 以å¾, 以é, 以å, åå¾, ä¸, æ«, ä¸, æ (ã)
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-å¯è©å¯è½",LexicalCategory.Noun,LexicalCategory.Adverb));
+ /**
+ * noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category
+ * is more inclusive than å©æ°è© ("classifier") and includes common nouns that attach
+ * to numbers.
+ * e.g. å, ã¤, æ¬, å, ãã¼ã»ã³ã, cm, kg, ã«æ, ãå½, åºç», æé, æå
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-å©æ°è©",Pos.UnitNoun));
+ /**
+ * noun-suffix-special: Special suffixes that mainly attach to inflecting words.
+ * e.g. (楽ã) ã, (èã) æ¹
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾-ç¹æ®",Pos.CommonNoun));
+ /**
+ * noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words
+ * together.
+ * e.g. (æ¥æ¬) 対 (ã¢ã¡ãªã«), 対 (ã¢ã¡ãªã«), (3) 対 (5), (女åª) å
¼ (主婦)
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-æ¥ç¶è©ç",LexicalCategory.Conjuction,Pos.CommonNoun));
+ /**
+ * noun-verbal_aux: Nouns that attach to the conjunctive particle 㦠("te") and are
+ * semantically verb-like.
+ * e.g. ããã, ã覧, 御覧, é æ´
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-åè©éèªç«ç",Pos.VerbalNoun,Pos.AuxiliaryVerb));
+ /**
+ * noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry,
+ * dialects, English, etc. Currently, the only entry for åè© å¼ç¨æåå ("noun quotation")
+ * is ããã ("iwaku").
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-å¼ç¨æåå",LexicalCategory.Noun));
+ /**
+ * noun-nai_adjective: Words that appear before the auxiliary verb ãªã ("nai") and
+ * behave like an adjective.
+ * e.g. ç³ã訳, ä»æ¹, ã¨ãã§ã, éã
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-ãã¤å½¢å®¹è©èªå¹¹",LexicalCategory.Noun,LexicalCategory.Adjective));
+ /**
+ * prefix: unclassified prefixes
+ */
+ POS_TAG_SET.addTag(new PosTag("æ¥é è©"));
+ /**
+ * prefix-nominal: Prefixes that attach to nouns (including adjective stem forms)
+ * excluding numerical expressions.
+ * e.g. ã (æ°´), æ (æ°), å (社), æ
(ï½æ°), é« (å質), ã (è¦äº), ã (ç«æ´¾)
+ */
+ POS_TAG_SET.addTag(new PosTag("æ¥é è©-åè©æ¥ç¶",LexicalCategory.Noun));
+ /**
+ * prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb
+ * in conjunctive form followed by ãªã/ãªãã/ãã ãã.
+ * e.g. ã (èªã¿ãªãã), ã (座ã)
+ */
+ POS_TAG_SET.addTag(new PosTag("æ¥é è©-åè©æ¥ç¶",LexicalCategory.Verb));
+ /**
+ * prefix-adjectival: Prefixes that attach to adjectives.
+ * e.g. ã (å¯ãã§ããã), ãã« (ã§ãã)
+ */
+ POS_TAG_SET.addTag(new PosTag("æ¥é è©-形容è©æ¥ç¶",LexicalCategory.Adjective));
+ /**
+ * prefix-numerical: Prefixes that attach to numerical expressions.
+ * e.g. ç´, ããã, æ¯æ
+ */
+ POS_TAG_SET.addTag(new PosTag("æ¥é è©-æ°æ¥ç¶",Pos.Numeral));
+ /**
+ * verb: unclassified verbs
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©",LexicalCategory.Verb));
+ /**
+ * verb-main:
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-èªç«",Pos.MainVerb));
+ /**
+ * verb-auxiliary:
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-éèªç«",Pos.AuxiliaryVerb));
+ /**
+ * verb-suffix:
+ */
+ POS_TAG_SET.addTag(new PosTag("åè©-æ¥å°¾",LexicalCategory.Verb));
+ /**
+ * adjective: unclassified adjectives
+ */
+ POS_TAG_SET.addTag(new PosTag("形容è©",LexicalCategory.Adjective));
+ /**
+ * adjective-main:
+ */
+ POS_TAG_SET.addTag(new PosTag("形容è©-èªç«",LexicalCategory.Adjective));
+ /**
+ * adjective-auxiliary:
+ */
+ POS_TAG_SET.addTag(new PosTag("形容è©-éèªç«",LexicalCategory.Adjective));
+ /**
+ * adjective-suffix:
+ */
+ POS_TAG_SET.addTag(new PosTag("形容è©-æ¥å°¾",LexicalCategory.Adjective));
+ /**
+ * adverb: unclassified adverbs
+ */
+ POS_TAG_SET.addTag(new PosTag("å¯è©",LexicalCategory.Adverb));
+ /**
+ * adverb-misc: Words that can be segmented into one unit and where adnominal
+ * modification is not possible.
+ * e.g. ãããããã, å¤å
+ */
+ POS_TAG_SET.addTag(new PosTag("å¯è©-ä¸è¬",LexicalCategory.Adverb));
+ /**
+ * adverb-particle_conjunction: Adverbs that can be followed by ã®, ã¯, ã«,
+ * ãª, ãã, ã , etc.
+ * e.g. ãããªã«, ãããªã«, ãããªã«, ãªã«ã, ãªãã§ã
+ */
+ POS_TAG_SET.addTag(new PosTag("å¯è©-å©è©é¡æ¥ç¶",LexicalCategory.Adverb,Pos.CoordinationParticle));
+ /**
+ * adnominal: Words that only have noun-modifying forms.
+ * e.g. ãã®, ãã®, ãã®, ã©ã®, ãããã, ãªãããã®, ä½ããã®, ããããª, ãããã, ãããã, ãããã,
+ * ã©ããã, ãããª, ãããª, ãããª, ã©ããª, 大ããª, å°ããª, ããããª, ã»ãã®, ãããã,
+ * ã(, ã) ãã (ãã¨ãªãã)ã, å¾®ã
ãã, å ã
ãã, åãªã, ãããªã, æãããåã, 亡ã
+ */
+ POS_TAG_SET.addTag(new PosTag("é£ä½è©",LexicalCategory.Adjective));
+ /**
+ * conjunction: Conjunctions that can occur independently.
+ * e.g. ã, ããã©ã, ããã¦, ããã, ããã©ããã
+ */
+ POS_TAG_SET.addTag(new PosTag("æ¥ç¶è©",LexicalCategory.Conjuction));
+ /**
+ * particle: unclassified particles.
+ */
+ POS_TAG_SET.addTag(new PosTag("å©è©",Pos.Particle));
+ /**
+ * particle-case: case particles where the subclassification is undefined.
+ */
+ POS_TAG_SET.addTag(new PosTag("å©è©-æ ¼å©è©",Pos.Particle));
+ /**
+ * particle-case-misc: Case particles.
+ * e.g. ãã, ã, ã§, ã¨, ã«, ã¸, ãã, ã, ã®, ã«ã¦
+ */
+ POS_TAG_SET.addTag(new PosTag("å©è©-æ ¼å©è©-ä¸è¬",Pos.Particle));
+ /**
+ * particle-case-quote: the "to" that appears after nouns, a person's speech,
+ * quotation marks, expressions of decisions from a meeting, reasons, judgements,
+ * conjectures, etc.
+ * e.g. ( ã ) 㨠(è¿°ã¹ã.), ( ã§ãã) 㨠(ãã¦å·è¡ç¶äº...)
+ */
+ POS_TAG_SET.addTag(new PosTag("å©è©-æ ¼å©è©-å¼ç¨",Pos.Particle));
+ /**
+ * particle-case-compound: Compounds of particles and verbs that mainly behave
+ * like case particles.
+ * e.g. ã¨ãã, ã¨ãã£ã, ã¨ããã, ã¨ãã¦, ã¨ã¨ãã«, ã¨å
±ã«, ã§ãã£ã¦, ã«ããã£ã¦, ã«å½ãã£ã¦, ã«å½ã£ã¦,
+ * ã«ããã, ã«å½ãã, ã«å½ã, ã«å½ãã, ã«ããã, ã«ããã¦, ã«æ¼ãã¦,ã«æ¼ã¦, ã«ããã, ã«æ¼ãã,
+ * ã«ãã, ã«ããã¦, ã«ããã, ã«é¢ã, ã«ãããã¦, ã«é¢ãã¦, ã«ãããã, ã«é¢ãã, ã«éã,
+ * ã«éãã¦, ã«ãããã, ã«å¾ã, ã«å¾ã, ã«ãããã£ã¦, ã«å¾ã£ã¦, ã«ããã, ã«å¯¾ã, ã«ãããã¦,
+ * ã«å¯¾ãã¦, ã«ãããã, ã«å¯¾ãã, ã«ã¤ãã¦, ã«ã¤ã, ã«ã¤ã, ã«ã¤ãã¦, ã«ã¤ã, ã«ã¤ãã¦, ã«ã¨ã£ã¦,
+ * ã«ã¨ã, ã«ã¾ã¤ãã, ã«ãã£ã¦, ã«ä¾ã£ã¦, ã«å ã£ã¦, ã«ãã, ã«ä¾ã, ã«å ã, ã«ãã, ã«ä¾ã, ã«å ã,
+ * ã«ããã£ã¦, ã«ããã, ããã£ã¦, ã以ã£ã¦, ãéã, ãéãã¦, ãéãã¦, ãããã£ã¦, ãããã, ãããã,
+ * ã£ã¦-å£èª/, ã¡ã
ã-é¢è¥¿å¼ãã¨ããã/, (ä½) ã¦ãã (人)-å£èª/, ã£ã¦ãã-å£èª/, ã¨ããµ, ã¨ãããµ
+ */
+ POS_TAG_SET.addTag(new PosTag("å©è©-æ ¼å©è©-é£èª",Pos.Particle));
+ /**
+ * particle-conjunctive:
+ * e.g. ãã, ããã«ã¯, ã, ããã©, ããã©ã, ãã©, ã, ã¤ã¤, ã¦, ã§, ã¨, ã¨ããã, ã©ããã, ã¨ã, ã©ã,
+ * ãªãã, ãªã, ã®ã§, ã®ã«, ã°, ãã®ã®, ã ( ãã), ãããªã, (ããã) ãã(ãããªã)-å£èª/,
+ * (è¡ã£) ã¡ã(ãããªã)-å£èª/, (è¨ã£) ãã£ã¦ (ãããããªã)-å£èª/, (ããããªã)ã£ãã£ã¦ (å¹³æ°)-å£èª/
+ */
+ POS_TAG_SET.addTag(new PosTag("å©è©-æ¥ç¶å©è©",Pos.ConjunctionPhrase,Pos.Particle));
+ /**
+ * particle-dependency:
+ * e.g. ãã, ãã, ãã, ãã, ã¯, ã, ã
+ */
+ POS_TAG_SET.addTag(new PosTag("å©è©-ä¿å©è©",Pos.Particle));
+ /**
+ * particle-adverbial:
+ * e.g. ãã¦ã, ãã, ããã, ä½, ããã, ãã, (å¦æ ¡) ãã(ãããæµè¡ã£ã¦ãã)-å£èª/,
+ * (ãã)ããã (ãããªã)-å£èª/, ãã¤, (ç§) ãªã, ãªã©, (ç§) ãªã (ã«), (å
ç) ãªãã (大å«ã)-å£èª/,
+ * (ç§) ãªãã, (å
ç) ãªã㦠(大å«ã)-å£èª/, ã®ã¿, ã ã, (ç§) ã ã£ã¦-å£èª/, ã ã«,
+ * (å½¼)ã£ãã-å£èª/, (ãè¶) ã§ã (ããã), ç (ã¨ã), (ä»å¾) ã¨ã, ã°ãã, ã°ã£ã-å£èª/, ã°ã£ãã-å£èª/,
+ * ã»ã©, ç¨, ã¾ã§, è¿, (誰) ã (ã)([å©è©-æ ¼å©è©] ããã³ [å©è©-ä¿å©è©] ã®åã«ä½ç½®ããããã)
+ */
+ POS_TAG_SET.addTag(new PosTag("å©è©-å¯å©è©",Pos.AdverbialParticiple));
+ /**
+ * particle-interjective: particles with interjective grammatical roles.
+ * e.g. (æ¾å³¶) ã
+ */
+ POS_TAG_SET.addTag(new PosTag("å©è©-éæå©è©",Pos.Interjection,Pos.Particle));
+ /**
+ * particle-coordinate:
+ * e.g. ã¨, ãã, ã ã®, ã ã, ã¨ã, ãªã, ã, ãã
+ */
+ POS_TAG_SET.addTag(new PosTag("å©è©-並ç«å©è©",Pos.CoordinationParticle));
+ /**
+ * particle-final:
+ * e.g. ãã, ããã, ã, ã, (ã )ã£ã-å£èª/, (ã¨ã¾ã£ã¦ã) ã§-æ¹è¨/, ãª, ã, ãªã-å£èª/, ã, ã, ã,
+ * ãã-å£èª/, ãã-å£èª/, ãã-æ¹è¨/, ã®, ã®ã-å£èª/, ã, ã, ã¨, ãã-å£èª/, ã, ãã-å£èª/
+ */
+ POS_TAG_SET.addTag(new PosTag("å©è©-çµå©è©",Pos.Particle));
+ /**
+ * particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is
+ * adverbial, conjunctive, or sentence final. For example:
+ * (a) ãA ã B ãã. Ex:ã(å½å
ã§éç¨ãã) ã,(æµ·å¤ã§éç¨ãã) ã (.)ã
+ * (b) Inside an adverb phrase. Ex:ã(幸ãã¨ãã) ã (, æ»è
ã¯ããªãã£ã.)ã
+ * ã(ç¥ããå±ãããã) ã (, 試é¨ã«åæ ¼ãã.)ã
+ * (c) ããã®ããã«ã. Ex:ã(ä½ããªãã£ã) ã (ã®ããã«æ¯ãèã£ã.)ã
+ * e.g. ã
+ */
+ POS_TAG_SET.addTag(new PosTag("å©è©-å¯å©è©ï¼ä¸¦ç«å©è©ï¼çµå©è©",Pos.AdverbialParticiple,Pos.ConjunctionPhrase));
+ /**
+ * particle-adnominalizer: The "no" that attaches to nouns and modifies
+ * non-inflectional words.
+ */
+ POS_TAG_SET.addTag(new PosTag("å©è©-é£ä½å",Pos.Particle));
+ /**
+ * particle-adverbializer: The "ni" and "to" that appear following nouns and adverbs
+ * that are giongo, giseigo, or gitaigo.
+ * e.g. ã«, ã¨
+ */
+ POS_TAG_SET.addTag(new PosTag("å©è©-å¯è©å",Pos.Particle));
+ /**
+ * particle-special: A particle that does not fit into one of the above classifications.
+ * This includes particles that are used in Tanka, Haiku, and other poetry.
+ * e.g. ããª, ãã, ( ããã ãã) ã«, (ããã) ã«ã(ãããã), (俺) ã (家)
+ */
+ POS_TAG_SET.addTag(new PosTag("å©è©-ç¹æ®",Pos.Participle));
+ /**
+ * auxiliary-verb:
+ */
+ POS_TAG_SET.addTag(new PosTag("å©åè©",Pos.AuxiliaryVerb));
+ /**
+ * interjection: Greetings and other exclamations.
+ * e.g. ãã¯ãã, ãã¯ãããããã¾ã, ããã«ã¡ã¯, ããã°ãã¯, ãããã¨ã, ã©ãããããã¨ã, ãããã¨ããããã¾ã,
+ * ããã ãã¾ã, ãã¡ãããã¾, ãããªã, ããããªã, ã¯ã, ããã, ããã, ããããªãã
+ */
+ POS_TAG_SET.addTag(new PosTag("æåè©",Pos.Interjection));
+ /**
+ * symbol: unclassified Symbols.
+ */
+ POS_TAG_SET.addTag(new PosTag("è¨å·",Pos.Symbol));
+ /**
+ * symbol-misc: A general symbol not in one of the categories below.
+ * e.g. [ââ@$ãâ+]
+ */
+ POS_TAG_SET.addTag(new PosTag("è¨å·-ä¸è¬",Pos.Symbol));
+ /**
+ * symbol-period: Periods and full stops.
+ * e.g. [.ï¼ã]
+ */
+ POS_TAG_SET.addTag(new PosTag("è¨å·-å¥ç¹",Pos.Point));
+ /**
+ * symbol-comma: Commas
+ * e.g. [,ã]
+ */
+ POS_TAG_SET.addTag(new PosTag("è¨å·-èªç¹",Pos.Comma));
+ /**
+ * symbol-space: Full-width whitespace.
+ */
+ POS_TAG_SET.addTag(new PosTag("è¨å·-空ç½",Pos.Symbol));
+ /**
+ * symbol-open_bracket:
+ * e.g. [({ââãã]
+ */
+ POS_TAG_SET.addTag(new PosTag("è¨å·-æ¬å¼§é",Pos.OpenBracket));
+ /**
+ * symbol-close_bracket:
+ * e.g. [)}ââããã]
+ */
+ POS_TAG_SET.addTag(new PosTag("è¨å·-æ¬å¼§é",Pos.CloseBracket));
+ /**
+ * symbol-alphabetic:
+ */
+ POS_TAG_SET.addTag(new PosTag("è¨å·-ã¢ã«ãã¡ããã",Pos.Symbol));
+ /**
+ * other: unclassified other
+ */
+ POS_TAG_SET.addTag(new PosTag("ãã®ä»",Pos.Foreign));
+ /**
+ * other-interjection: Words that are hard to classify as noun-suffixes or
+ * sentence-final particles.
+ * e.g. (ã )ã¡
+ */
+ POS_TAG_SET.addTag(new PosTag("ãã®ä»-éæ",LexicalCategory.Noun));
+ /**
+ * filler: Aizuchi that occurs during a conversation or sounds inserted as filler.
+ * e.g. ãã®, ããã¨, ãã¨
+ */
+ POS_TAG_SET.addTag(new PosTag("ãã£ã©ã¼"));
+ /**
+ * * * * *
+ * non-verbal: non-verbal sound.
+ */
+ POS_TAG_SET.addTag(new PosTag("éè¨èªé³"));
+ /**
+ * fragment:
+ */
+ POS_TAG_SET.addTag(new PosTag("èªæç"));
+ /**
+ * * * * *
+ * unknown: unknown part of speech.
+ */
+ POS_TAG_SET.addTag(new PosTag("æªç¥èª",Pos.Foreign));
+ }
+}
Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/KuromojiNlpEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/KuromojiNlpEngine.java?rev=1455131&view=auto
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/KuromojiNlpEngine.java (added)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/KuromojiNlpEngine.java Mon Mar 11 13:18:59 2013
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2012 Sebastian Schaffert
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.enhancer.engines.kuromoji.impl;
+
+import static org.apache.stanbol.enhancer.engines.kuromoji.Constants.NER_TAG_SET;
+import static org.apache.stanbol.enhancer.engines.kuromoji.Constants.POS_TAG_SET;
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.MORPHO_ANNOTATION;
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.io.input.CharSequenceReader;
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.ConfigurationPolicy;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.ReferenceCardinality;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ja.JapaneseBaseFormFilterFactory;
+import org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilterFactory;
+import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilterFactory;
+import org.apache.lucene.analysis.ja.JapaneseTokenizerFactory;
+import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
+import org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute;
+import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.Version;
+import org.apache.sling.installer.core.impl.OsgiInstallerImpl;
+import org.apache.stanbol.commons.solr.utils.StanbolResourceLoader;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
+import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Sentence;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
+import org.osgi.framework.Constants;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Tokenizer, sentence detection, POS tagging and named entity detection for
+ * Japanese text based on the Solr/Lucene Kuromoji analysers.
+ * 
+ * @author Rupert Westenthaler
+ */
+
+@Component(immediate = true, metatype = true,
+ policy = ConfigurationPolicy.OPTIONAL) //create a default instance with the default configuration
+@Service
+@Properties(value={
+ @Property(name= EnhancementEngine.PROPERTY_NAME,value="kuromoji-token"),
+ @Property(name=Constants.SERVICE_RANKING,intValue=0) //give the default instance a ranking < 0
+})
+public class KuromojiNlpEngine extends AbstractEnhancementEngine<IOException,RuntimeException> implements ServiceProperties {
+
+ private static final Version LUCENE_VERSION = Version.LUCENE_41;
+ private static final String TOKENIZER_MODE = "search"; //alternative modes: normal, extended
+ private static final Map<String,Object> SERVICE_PROPERTIES;
+ private static final Map<String,String> TOKENIZER_FACTORY_CONFIG = new HashMap<String,String>();
+ private static final Map<String, String> BASE_FORM_FILTER_CONFIG = new HashMap<String,String>();
+ private static final Map<String, String> POS_FILTER_CONFIG = new HashMap<String,String>();
+ private static final Map<String, String> STEMM_FILTER_CONFIG = new HashMap<String,String>();
+ static {
+ Map<String,Object> props = new HashMap<String,Object>();
+ props.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING,
+ ServiceProperties.ORDERING_NLP_TOKENIZING);
+ props.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE,
+ NlpProcessingRole.Tokenizing);
+ SERVICE_PROPERTIES = Collections.unmodifiableMap(props);
+
+ TOKENIZER_FACTORY_CONFIG.put("luceneMatchVersion", LUCENE_VERSION.toString());
+ TOKENIZER_FACTORY_CONFIG.put("mode",TOKENIZER_MODE);
+ /* We want to have tokens for punctuation: computeEnhancements(..) ends a
+ * sentence when it sees a token with Pos.Point, so presumably those tokens
+ * must not be discarded by the tokenizer. */
+ TOKENIZER_FACTORY_CONFIG.put("discardPunctuation", "false");
+
+ BASE_FORM_FILTER_CONFIG.put("luceneMatchVersion", LUCENE_VERSION.toString());
+
+ POS_FILTER_CONFIG.put("luceneMatchVersion", LUCENE_VERSION.toString());
+ POS_FILTER_CONFIG.put("tags", "nostoptags.txt");
+ POS_FILTER_CONFIG.put("enablePositionIncrements","true");
+
+ STEMM_FILTER_CONFIG.put("luceneMatchVersion", LUCENE_VERSION.toString());
+ STEMM_FILTER_CONFIG.put("minimumLength","4");
+ }
+
+
+ private static Logger log = LoggerFactory.getLogger(KuromojiNlpEngine.class);
+
+ @Reference(cardinality=ReferenceCardinality.OPTIONAL_UNARY)
+ protected ResourceLoader parentResourceLoader;
+
+ protected ResourceLoader resourceLoader;
+
+ //private MappingCharFilterFactory charFilterFactory;
+ private TokenizerFactory tokenizerFactory;
+
+ private List<TokenFilterFactory> filterFactories = new ArrayList<TokenFilterFactory>();
+
+ @Reference
+ protected AnalysedTextFactory analysedTextFactory;
+
+ protected LiteralFactory lf = LiteralFactory.getInstance();
+ /**
+ * Holds {@link PosTag}s for part-of-speech strings that are not contained in the
+ * {@link org.apache.stanbol.enhancer.engines.kuromoji.Constants#POS_TAG_SET}.
+ * Entries are added lazily by {@link #computeEnhancements(ContentItem)} the
+ * first time an unmapped part-of-speech string is encountered.
+ * NOTE(review): this is a plain HashMap that is modified while processing
+ * content items - confirm that concurrent calls are not possible.
+ */
+ private Map<String,PosTag> adhocTags = new HashMap<String,PosTag>();
+
+ /**
+  * Checks whether this engine is able to process the parsed ContentItem.
+  * <p>
+  * The item can be processed if a plain text content part is available and
+  * the language determined for the item is Japanese (<code>ja</code> or any
+  * language code starting with <code>ja-</code>).
+  *
+  * @param ci the ContentItem to check
+  * @return <code>ENHANCE_ASYNC</code> if the item can be processed,
+  * otherwise <code>CANNOT_ENHANCE</code>
+  * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
+  *             if inspecting the content item fails
+  */
+ @Override
+ public int canEnhance(ContentItem ci) throws EngineException {
+     //without a plain text content part there is nothing to process
+     final Map.Entry<UriRef,Blob> plainText = NlpEngineHelper.getPlainText(this, ci, false);
+     final boolean hasContent = plainText != null && plainText.getValue() != null;
+     if(!hasContent) {
+         return CANNOT_ENHANCE;
+     }
+
+     //only Japanese texts are supported by this engine
+     final String lang = getLanguage(this,ci,false);
+     final boolean japanese = lang != null && (lang.equals("ja") || lang.startsWith("ja-"));
+     if(!japanese) {
+         return CANNOT_ENHANCE;
+     }
+     log.trace(" > can enhance ContentItem {} with language {}",ci,lang);
+     return ENHANCE_ASYNC;
+ }
+
+ /**
+  * Analyses the plain text of the parsed ContentItem with the Kuromoji
+  * analysis chain created during activation and adds the results to the
+  * {@link AnalysedText} content part.
+  * <p>
+  * For every token emitted by the chain this method <ul>
+  * <li> adds a {@link Token} with a POS annotation and - if a base form is
+  * available - a morpho annotation
+  * <li> ends the current {@link Sentence} if the token has {@link Pos#Point}
+  * <li> combines consecutive tokens with the same named entity type to a
+  * {@link Chunk} with a NER annotation
+  * </ul>
+  * After the text is processed a text annotation (selected text, type, start,
+  * end and selection context) is added to the metadata of the ContentItem for
+  * every detected named entity. The write lock on the ContentItem is only
+  * held while those triples are added.
+  *
+  * @param ci the ContentItem to process
+  * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
+  *             if reading tokens from the analysis chain fails
+  * @throws IllegalStateException if the language of the ContentItem is not
+  *             Japanese (this is also checked by {@link #canEnhance(ContentItem)})
+  */
+ @Override
+ public void computeEnhancements(ContentItem ci) throws EngineException {
+     final AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
+
+     String language = getLanguage(this,ci,false);
+     if(!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
+         throw new IllegalStateException("The detected language is NOT 'ja'! "
+             + "As this is also checked within the #canEnhance(..) method this "
+             + "indicates an Bug in the used EnhancementJobManager implementation. "
+             + "Please report this on the dev@stanbol.apache.org or create an "
+             + "JIRA issue about this.");
+     }
+     //start with the Tokenizer
+     TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
+     //build the analyzing chain by adding all TokenFilters
+     for(TokenFilterFactory filterFactory : filterFactories){
+         tokenStream = filterFactory.create(tokenStream);
+     }
+
+     //Try to extract sentences based on POS tags ...
+     int sentStartOffset = -1; // < 0 indicates that the next token starts a new sentence
+     int lastTokenEnd = -1; //end offset of the last token; < 0 if there was no token
+     //NER data
+     final List<NerData> nerList = new ArrayList<NerData>();
+     int nerSentIndex = 0; //the next index where the NerData.context need to be set
+     NerData ner = null; //the named entity currently extended by tokens
+     try {
+         //the attribute instances are shared by the whole chain and updated
+         //in place by incrementToken(), so it is sufficient to get them once
+         final OffsetAttribute offset = tokenStream.addAttribute(OffsetAttribute.class);
+         final PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
+         final BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
+         tokenStream.reset(); //required with Solr 4
+         while (tokenStream.incrementToken()){
+             final int start = offset.startOffset();
+             final int end = offset.endOffset();
+             lastTokenEnd = end;
+             final String pos = posAttr.getPartOfSpeech();
+             Token token = at.addToken(start, end);
+             PosTag posTag = getPosTag(pos);
+             //Sentence detection by POS tag
+             if(sentStartOffset < 0){ //the last token was a sentence ending
+                 sentStartOffset = start;
+             }
+             if(posTag.hasPos(Pos.Point)) {
+                 //NOTE: the sentence ends at the start of the Point token
+                 Sentence sent = at.addSentence(sentStartOffset, start);
+                 //add the sentence as context to the NerData instances
+                 while(nerSentIndex < nerList.size()){
+                     nerList.get(nerSentIndex).context = sent.getSpan();
+                     nerSentIndex++;
+                 }
+                 sentStartOffset = -1;
+             }
+             //POS
+             token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
+             //NER
+             NerTag nerTag = NER_TAG_SET.getTag(pos);
+             if(ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))){
+                 //the current named entity ended with the previous token
+                 //NOTE that the fise:TextAnnotation are written later based on the nerList
+                 addNerChunk(at, ner);
+                 ner = null;
+             }
+             if(nerTag != null){
+                 if(ner == null){
+                     ner = new NerData(nerTag, start);
+                     nerList.add(ner);
+                 }
+                 ner.end = end;
+             }
+             //Morpho: only written if a base form is available
+             final String baseForm = baseFormAttr.getBaseForm();
+             if(baseForm != null){
+                 MorphoFeatures morpho = new MorphoFeatures(baseForm);
+                 morpho.addPos(posTag); //and add the posTag
+                 token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
+             }
+         }
+         //a named entity that extends to the last token was not yet written
+         //by the loop as no following token did end it
+         if(ner != null){
+             addNerChunk(at, ner);
+             ner = null;
+         }
+         //we still need to write the last sentence
+         Sentence lastSent = null;
+         if(sentStartOffset >= 0 && lastTokenEnd > sentStartOffset){
+             lastSent = at.addSentence(sentStartOffset, lastTokenEnd);
+         }
+         //and set the context off remaining named entities
+         while(nerSentIndex < nerList.size()){
+             if(lastSent != null){
+                 nerList.get(nerSentIndex).context = lastSent.getSpan();
+             } else { //no sentence detected
+                 nerList.get(nerSentIndex).context = at.getSpan();
+             }
+             nerSentIndex++;
+         }
+     } catch (IOException e) {
+         throw new EngineException(this, ci, "Exception while reading from "
+             + "AnalyzedText contentpart",e);
+     } finally {
+         try {
+             tokenStream.close();
+         } catch (IOException e) {
+             //best effort: all results are already collected at this point
+             log.debug("Unable to close the TokenStream", e);
+         }
+     }
+     //finally write the NER annotations to the metadata of the ContentItem
+     final MGraph metadata = ci.getMetadata();
+     ci.getLock().writeLock().lock();
+     try {
+         Language lang = new Language("ja");
+         for(NerData nerData : nerList){
+             UriRef ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
+             metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(
+                 at.getSpan().substring(nerData.start, nerData.end),lang)));
+             metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
+             metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
+             metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
+             metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT,
+                 new PlainLiteralImpl(nerData.context, lang)));
+         }
+     } finally{
+         ci.getLock().writeLock().unlock();
+     }
+ }
+
+ /**
+  * Looks up the {@link PosTag} for the parsed part-of-speech string: first in
+  * the POS_TAG_SET, than in the ad-hoc tags. If both lookups fail a new
+  * unmapped tag is created, remembered as ad-hoc tag and a warning is logged.
+  */
+ private PosTag getPosTag(String pos) {
+     PosTag posTag = POS_TAG_SET.getTag(pos);
+     if(posTag == null){
+         posTag = adhocTags.get(pos);
+         if(posTag == null){
+             posTag = new PosTag(pos);
+             adhocTags.put(pos, posTag);
+             log.warn(" ... missing PosTag mapping for {}",pos);
+         }
+     }
+     return posTag;
+ }
+
+ /**
+  * Adds a {@link Chunk} with a NER annotation for the parsed named entity.
+  */
+ private void addNerChunk(AnalysedText at, NerData ner) {
+     Chunk chunk = at.addChunk(ner.start, ner.end);
+     chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
+ }
+
+ /**
+  * Getter for the service properties of this engine.
+  *
+  * @return the unmodifiable map created by the static initialiser (engine
+  * ordering and NLP processing role)
+  */
+ @Override
+ public Map<String,Object> getServiceProperties() {
+     return SERVICE_PROPERTIES;
+ }
+ /**
+  * Activates the engine and initialises the analysis chain used by
+  * {@link #computeEnhancements(ContentItem)}: the Kuromoji
+  * {@link TokenizerFactory} followed by the base form filter, the
+  * part-of-speech stop filter and the katakana stem filter.
+  * <p>
+  * The factories are collected in local variables and only assigned to the
+  * fields after all of them are initialised. Because of that a repeated
+  * activation does neither depend on the state left by a previous
+  * deactivation nor add the filters a second time.
+  *
+  * @param ce the {@link org.osgi.service.component.ComponentContext}
+  * @throws ConfigurationException as declared by the activate method of the
+  * super class
+  * @throws IOException as declared by the activate method of the super class
+  * and by the initialisation of the resource loader aware factories
+  */
+ @Activate
+ protected void activate(ComponentContext ce) throws ConfigurationException, IOException {
+     log.info("activating kuromoji NLP engine");
+     super.activate(ce);
+     //init the Solr ResourceLoader used for initialising the components
+     resourceLoader = new StanbolResourceLoader(parentResourceLoader);
+     TokenizerFactory tokenizer = new JapaneseTokenizerFactory();
+     tokenizer.init(TOKENIZER_FACTORY_CONFIG);
+     tokenizer.setLuceneMatchVersion(LUCENE_VERSION);
+     ((ResourceLoaderAware) tokenizer).inform(resourceLoader);
+
+     List<TokenFilterFactory> filters = new ArrayList<TokenFilterFactory>(3);
+     //base form filter
+     TokenFilterFactory baseFormFilterFactory = new JapaneseBaseFormFilterFactory();
+     baseFormFilterFactory.init(BASE_FORM_FILTER_CONFIG);
+     baseFormFilterFactory.setLuceneMatchVersion(LUCENE_VERSION);
+     filters.add(baseFormFilterFactory);
+     //POS filter
+     TokenFilterFactory posFilterFactory = new JapanesePartOfSpeechStopFilterFactory();
+     posFilterFactory.init(POS_FILTER_CONFIG);
+     posFilterFactory.setLuceneMatchVersion(LUCENE_VERSION);
+     ((ResourceLoaderAware) posFilterFactory).inform(resourceLoader);
+     filters.add(posFilterFactory);
+     //Stemming
+     TokenFilterFactory stemmFilterFactory = new JapaneseKatakanaStemFilterFactory();
+     stemmFilterFactory.init(STEMM_FILTER_CONFIG);
+     stemmFilterFactory.setLuceneMatchVersion(LUCENE_VERSION);
+     filters.add(stemmFilterFactory);
+
+     //publish the fully initialised chain
+     tokenizerFactory = tokenizer;
+     filterFactories = filters;
+ }
+
+ /**
+  * Deactivates the engine and releases the tokenizer and token filter
+  * factories.
+  * <p>
+  * The list of filter factories is replaced by a new empty list instead of
+  * being cleared and set to <code>null</code>: the list used so far is not
+  * modified (it might still be iterated) and a later activation of the same
+  * instance can add its factories without a NullPointerException.
+  *
+  * @param context the {@link org.osgi.service.component.ComponentContext}
+  */
+ @Deactivate
+ protected void deactivate(ComponentContext context) {
+     tokenizerFactory = null;
+     filterFactories = new ArrayList<TokenFilterFactory>();
+     super.deactivate(context);
+ }
+
+ /**
+  * Internal helper {@link Tokenizer} that emits one token for every
+  * {@link Sentence} already contained in the parsed {@link AnalysedText},
+  * so that sentences do not need to be detected a second time.
+  * <p>
+  * For every sentence the term is set to the text of the sentence, the
+  * offsets to the start/end of the sentence and the type to
+  * <code>"sentence"</code>.
+  * <p>
+  * NOTE(review): this class is not used within the enclosing class.
+  * @author Rupert Westenthaler
+  *
+  */
+ protected final class AnalyzedTextSentenceTokenizer extends Tokenizer {
+     private final AnalysedText at;
+     private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+     private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+     private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+     private Iterator<Sentence> sentences;
+     private Sentence sentence = null;
+
+     /**
+      * @param at the analysed text providing both the character data and
+      * the sentences
+      */
+     protected AnalyzedTextSentenceTokenizer(AnalysedText at) {
+         super(new StringReader(at.getText().toString()));
+         this.at = at;
+         sentences = at.getSentences();
+     }
+
+     /**
+      * Moves to the next sentence.
+      * @return <code>true</code> if the attributes describe a further
+      * sentence, <code>false</code> if all sentences are consumed
+      */
+     @Override
+     public boolean incrementToken() throws IOException {
+         if(!sentences.hasNext()){
+             return false;
+         }
+         //token producers need to reset all attributes (also those added
+         //by downstream filters) before setting the values of a new token
+         clearAttributes();
+         sentence = sentences.next();
+         termAtt.setEmpty().append(sentence.getSpan());
+         offsetAtt.setOffset(sentence.getStart(),sentence.getEnd());
+         typeAtt.setType("sentence");
+         return true;
+     }
+
+     @Override
+     public void end() throws IOException {
+         super.end();
+         // set final offset
+         offsetAtt.setOffset(at.getEnd(), at.getEnd());
+     }
+
+     /**
+      * Restarts the iteration over the sentences of the analysed text and
+      * resets the attributes to their initial values.
+      */
+     @Override
+     public void reset() throws IOException {
+         super.reset();
+         sentences = at.getSentences();
+         termAtt.setEmpty();
+         offsetAtt.setOffset(0, 0);
+         typeAtt.setType(null);
+     }
+ }
+}
Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/NerData.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/NerData.java?rev=1455131&view=auto
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/NerData.java (added)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/NerData.java Mon Mar 11 13:18:59 2013
@@ -0,0 +1,25 @@
+package org.apache.stanbol.enhancer.engines.kuromoji.impl;
+
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+
+/**
+ * Used as intermediate representation of NER annotations so that one does
+ * not need to obtain a write lock on the {@link ContentItem} for each
+ * detected entity.
+ * <p>
+ * The fields are: <ul>
+ * <li> <code>tag</code>: the {@link NerTag} parsed to the constructor
+ * <li> <code>start</code>: the start value parsed to the constructor
+ * <li> <code>end</code> and <code>context</code>: not set by the
+ * constructor; they need to be assigned by the code using this class
+ * </ul>
+ * @author Rupert Westenthaler
+ *
+ */
+class NerData {
+
+ protected final NerTag tag;
+ protected final int start;
+ protected int end;
+ protected String context;
+
+ protected NerData(NerTag ner, int start){
+ this.tag = ner;
+ this.start = start;
+ }
+
+}
\ No newline at end of file
Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1455131&view=auto
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/resources/OSGI-INF/metatype/metatype.properties (added)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/resources/OSGI-INF/metatype/metatype.properties Mon Mar 11 13:18:59 2013
@@ -0,0 +1,31 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Name and description of the KuromojiNlpEngine component. The key prefix is
+# the fully qualified name of the component class.
+org.apache.stanbol.enhancer.engines.kuromoji.impl.KuromojiNlpEngine.name=Apache \
+Stanbol Enhancer Engine: Kuromoji NLP
+org.apache.stanbol.enhancer.engines.kuromoji.impl.KuromojiNlpEngine.description=Enhancement \
+Engine that detects sentences, tokenizes and POS tags Japanese text by using the \
+Solr/Lucene Kuromoji analyzers.
+
+
+stanbol.enhancer.engine.name.name=Name
+stanbol.enhancer.engine.name.description=The name of the enhancement engine as \
+used in the RESTful interface '/engine/<name>'
+service.ranking.name=Ranking
+service.ranking.description=If two enhancement engines with the same name are active the \
+one with the higher ranking will be used to process parsed content items.
Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/resources/nostoptags.txt
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/resources/nostoptags.txt?rev=1455131&view=auto
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/resources/nostoptags.txt (added)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/main/resources/nostoptags.txt Mon Mar 11 13:18:59 2013
@@ -0,0 +1 @@
+# this file is loaded by the POS Filter Factory
\ No newline at end of file
Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/ClasspathDataFileProvider.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/ClasspathDataFileProvider.java?rev=1455131&view=auto
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/ClasspathDataFileProvider.java (added)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/ClasspathDataFileProvider.java Mon Mar 11 13:18:59 2013
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.kuromoji.impl;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Map;
+
+import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * {@link DataFileProvider} that serves data files from the class path.
+ * Files are looked up below {@link #RESOURCE_BASE_PATH} by using the class
+ * loader of this class. Requests for an other bundle symbolic name than the
+ * one parsed to the constructor are ignored.
+ */
+public class ClasspathDataFileProvider implements DataFileProvider {
+
+    private final Logger log = LoggerFactory.getLogger(getClass());
+    /*
+     * NOTE: This path needs to be the same as path configured for the
+     * 'org.apache.stanbol:org.apache.stanbol.commons.solr.extras.gosen'
+     * bundle
+     * NOTE(review): the reference to the gosen bundle might be outdated for
+     * this module - confirm.
+     */
+    public static final String RESOURCE_BASE_PATH = "datafiles/";
+
+    /** the bundle symbolic name this provider answers requests for */
+    private final String symbolicName;
+
+    ClasspathDataFileProvider(String bundleSymbolicName) {
+        symbolicName = bundleSymbolicName;
+    }
+
+    /**
+     * Opens the requested data file.
+     * @return the stream or <code>null</code> if this provider does not
+     * have the data file (an other provider might supply it)
+     */
+    @Override
+    public InputStream getInputStream(String bundleSymbolicName,
+            String filename, Map<String, String> comments)
+            throws IOException {
+        final URL resource = getDataFile(bundleSymbolicName, filename);
+        if(resource == null) {
+            return null;
+        }
+        return resource.openStream();
+    }
+
+    /**
+     * Checks if the requested data file can be provided by this instance.
+     */
+    @Override
+    public boolean isAvailable(String bundleSymbolicName, String filename, Map<String,String> comments) {
+        final URL resource = getDataFile(bundleSymbolicName, filename);
+        return resource != null;
+    }
+
+    /**
+     * Resolves the class path resource for the requested data file.
+     * @param bundleSymbolicName the symbolic name of the requesting bundle.
+     * <code>null</code> is accepted; any other value needs to be equals to
+     * the name parsed to the constructor
+     * @param filename the name of the file relative to {@link #RESOURCE_BASE_PATH}
+     * @return the URL of the resource or <code>null</code> if the symbolic
+     * name does not match or no such resource exists
+     */
+    private URL getDataFile(String bundleSymbolicName, String filename) {
+        final boolean otherBundle = bundleSymbolicName != null
+                && !symbolicName.equals(bundleSymbolicName);
+        if(otherBundle) {
+            log.debug("Requested bundleSymbolicName {} does not match mine ({}), request ignored",
+                bundleSymbolicName, symbolicName);
+            return null;
+        }
+        return getClass().getClassLoader().getResource(RESOURCE_BASE_PATH + filename);
+    }
+}
Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/MockComponentContext.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/MockComponentContext.java?rev=1455131&view=auto
==============================================================================
--- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/MockComponentContext.java (added)
+++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/MockComponentContext.java Mon Mar 11 13:18:59 2013
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.kuromoji.impl;
+
+import java.io.File;
+import java.io.InputStream;
+import java.util.Dictionary;
+import java.util.Hashtable;
+
+import org.osgi.framework.Bundle;
+import org.osgi.framework.BundleContext;
+import org.osgi.framework.BundleException;
+import org.osgi.framework.BundleListener;
+import org.osgi.framework.Filter;
+import org.osgi.framework.FrameworkListener;
+import org.osgi.framework.InvalidSyntaxException;
+import org.osgi.framework.ServiceListener;
+import org.osgi.framework.ServiceReference;
+import org.osgi.framework.ServiceRegistration;
+import org.osgi.service.component.ComponentContext;
+import org.osgi.service.component.ComponentInstance;
+
+/**
+ * Minimal {@link ComponentContext} for unit tests.
+ * <p>
+ * Only {@link #getProperties()} and {@link #getBundleContext()} return a
+ * value; every other method throws an {@link UnsupportedOperationException}.
+ * The {@link BundleContext} handed out by this mock supports nothing but
+ * {@link BundleContext#getProperty(String)}, which is answered from the Java
+ * system properties.
+ */
+public class MockComponentContext implements ComponentContext {
+
+    /** Message of the exception thrown by every unsupported operation. */
+    private static final String NOT_SUPPORTED = "Mock implementation";
+
+    /** The component properties returned by {@link #getProperties()}. */
+    protected final Dictionary<String, Object> properties;
+    /** The mocked bundle context returned by {@link #getBundleContext()}. */
+    protected final BundleContext bundleContext = new MockBundleContext();
+
+    /**
+     * Creates a context backed by a new, empty {@link Hashtable}.
+     */
+    public MockComponentContext() {
+        this(new Hashtable<String, Object>());
+    }
+
+    /**
+     * Creates a context that exposes the given properties.
+     *
+     * @param properties the component properties; the instance is stored
+     *        as-is (not copied) and returned by {@link #getProperties()}
+     */
+    public MockComponentContext(Dictionary<String, Object> properties) {
+        this.properties = properties;
+    }
+
+    /** Not supported by this mock. */
+    @Override
+    public void disableComponent(String name) {
+        throw new UnsupportedOperationException(NOT_SUPPORTED);
+    }
+
+    /** Not supported by this mock. */
+    @Override
+    public void enableComponent(String name) {
+        throw new UnsupportedOperationException(NOT_SUPPORTED);
+    }
+
+    /**
+     * @return the mocked bundle context created together with this instance
+     */
+    @Override
+    public BundleContext getBundleContext() {
+        return bundleContext;
+    }
+
+    /** Not supported by this mock. */
+    @Override
+    public ComponentInstance getComponentInstance() {
+        throw new UnsupportedOperationException(NOT_SUPPORTED);
+    }
+
+    /**
+     * @return the properties this context was created with
+     */
+    @Override
+    public Dictionary<String, Object> getProperties() {
+        return properties;
+    }
+
+    /** Not supported by this mock. */
+    @Override
+    public ServiceReference getServiceReference() {
+        throw new UnsupportedOperationException(NOT_SUPPORTED);
+    }
+
+    /** Not supported by this mock. */
+    @Override
+    public Bundle getUsingBundle() {
+        throw new UnsupportedOperationException(NOT_SUPPORTED);
+    }
+
+    /** Not supported by this mock. */
+    @Override
+    public Object locateService(String name) {
+        throw new UnsupportedOperationException(NOT_SUPPORTED);
+    }
+
+    /** Not supported by this mock. */
+    @Override
+    public Object locateService(String name, ServiceReference reference) {
+        throw new UnsupportedOperationException(NOT_SUPPORTED);
+    }
+
+    /** Not supported by this mock. */
+    @Override
+    public Object[] locateServices(String name) {
+        throw new UnsupportedOperationException(NOT_SUPPORTED);
+    }
+
+    /**
+     * {@link BundleContext} whose only working operation is
+     * {@link #getProperty(String)}; all other methods throw an
+     * {@link UnsupportedOperationException}.
+     */
+    private static final class MockBundleContext implements BundleContext {
+        /**
+         * Used by the Engine to read System properties.
+         *
+         * @param key the name of the requested property
+         * @return the value of {@link System#getProperty(String)} for the key
+         */
+        @Override
+        public String getProperty(String key) {
+            return System.getProperty(key);
+        }
+
+        // ---- everything below is unsupported by this mock ----
+
+        @Override
+        public Bundle getBundle() {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public Bundle installBundle(String location) throws BundleException {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public Bundle installBundle(String location, InputStream input) throws BundleException {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public Bundle getBundle(long id) {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public Bundle[] getBundles() {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public void addServiceListener(ServiceListener listener, String filter) throws InvalidSyntaxException {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public void addServiceListener(ServiceListener listener) {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public void removeServiceListener(ServiceListener listener) {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public void addBundleListener(BundleListener listener) {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public void removeBundleListener(BundleListener listener) {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public void addFrameworkListener(FrameworkListener listener) {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public void removeFrameworkListener(FrameworkListener listener) {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public ServiceRegistration registerService(String[] clazzes, Object service, Dictionary properties) {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public ServiceRegistration registerService(String clazz, Object service, Dictionary properties) {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public ServiceReference[] getServiceReferences(String clazz, String filter) throws InvalidSyntaxException {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public ServiceReference[] getAllServiceReferences(String clazz, String filter) throws InvalidSyntaxException {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public ServiceReference getServiceReference(String clazz) {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public Object getService(ServiceReference reference) {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public boolean ungetService(ServiceReference reference) {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public File getDataFile(String filename) {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+        @Override
+        public Filter createFilter(String filter) throws InvalidSyntaxException {
+            throw new UnsupportedOperationException(NOT_SUPPORTED);
+        }
+
+    }
+}