You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/12/24 14:00:47 UTC

svn commit: r1425643 - in /stanbol/trunk/launchers/bundlelists: ./ language-extras/ language-extras/smartcn/ language-extras/smartcn/src/ language-extras/smartcn/src/main/ language-extras/smartcn/src/main/bundles/

Author: rwesten
Date: Mon Dec 24 13:00:47 2012
New Revision: 1425643

URL: http://svn.apache.org/viewvc?rev=1425643&view=rev
Log:
Added BundleList for STANBOL-855. Including this List to the Stanbol Launcher will ensure that all components for basic Chinese language support are present

Added:
    stanbol/trunk/launchers/bundlelists/language-extras/
    stanbol/trunk/launchers/bundlelists/language-extras/smartcn/   (with props)
    stanbol/trunk/launchers/bundlelists/language-extras/smartcn/README.md
    stanbol/trunk/launchers/bundlelists/language-extras/smartcn/pom.xml   (with props)
    stanbol/trunk/launchers/bundlelists/language-extras/smartcn/src/
    stanbol/trunk/launchers/bundlelists/language-extras/smartcn/src/main/
    stanbol/trunk/launchers/bundlelists/language-extras/smartcn/src/main/bundles/
    stanbol/trunk/launchers/bundlelists/language-extras/smartcn/src/main/bundles/list.xml
Modified:
    stanbol/trunk/launchers/bundlelists/pom.xml

Propchange: stanbol/trunk/launchers/bundlelists/language-extras/smartcn/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Dec 24 13:00:47 2012
@@ -0,0 +1,5 @@
+target
+
+.settings
+
+.project

Added: stanbol/trunk/launchers/bundlelists/language-extras/smartcn/README.md
URL: http://svn.apache.org/viewvc/stanbol/trunk/launchers/bundlelists/language-extras/smartcn/README.md?rev=1425643&view=auto
==============================================================================
--- stanbol/trunk/launchers/bundlelists/language-extras/smartcn/README.md (added)
+++ stanbol/trunk/launchers/bundlelists/language-extras/smartcn/README.md Mon Dec 24 13:00:47 2012
@@ -0,0 +1,81 @@
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+Basic Chinese language support based on Lucene Smartcn Analyzer
+==============
+
+This BundleList includes three modules that bring basic language support for Chinese to Apache Stanbol.
+
+See the comments in [list.xml](src/main/bundles/list.xml) for more details.
+
+Solr Field Configuration
+---
+
+When you plan to use the Smartcn Analyzer to process Chinese texts it is important to also properly configure the Solr schema.xml used by the Entityhub SolrYard.
+
+For that you will need to add two things:
+
+1. A fieldType specification for Chinese
+
+    :::xml
+    <fieldType name="text_zh" class="solr.TextField" positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.SmartChineseSentenceTokenizerFactory"/>
+        <filter class="solr.SmartChineseWordTokenFilterFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.SmartChineseSentenceTokenizerFactory"/>
+        <filter class="solr.SmartChineseWordTokenFilterFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+        <filter class="solr.PositionFilterFactory" />
+      </analyzer>
+    </fieldType> 
+
+2. A dynamic field using this field type that matches against Chinese language literals
+
+    :::xml
+    <!--
+     Dynamic field for Chinese languages.
+     -->
+    <dynamicField name="@zh*" type="text_zh" indexed="true" stored="true" multiValued="true" omitNorms="false"/>
+
+The [smartcn.solrindex.zip](https://svn.apache.org/repos/asf/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/smartcn.solrindex.zip) is identical with the default configuration but uses the above fieldType and dynamicField specification.
+
+### Usage with the EntityhubIndexing Tool
+
+1. Extract the [smartcn.solrindex.zip](https://svn.apache.org/repos/asf/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/smartcn.solrindex.zip) to the "indexing/config" directory 
+2. Rename the "indexing/config/smartcn" directory to the {site-name} (the value of the "name" property of the "indexing/config/indexing.properties" file).
+
+As an alternative to (2) you can explicitly set the name of the Solr configuration by using the "solrConf" parameter (here "solrConf:smartcn") of the SolrYardIndexingDestination.
+
+    :::text
+    indexingDestination=org.apache.stanbol.entityhub.indexing.destination.solryard.SolrYardIndexingDestination,solrConf:smartcn,boosts:fieldboosts
+
+### Usage with the Entityhub SolrYard
+
+If you want to create an empty SolrYard instance using the [smartcn.solrindex.zip](https://svn.apache.org/repos/asf/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/smartcn.solrindex.zip) configuration you will need to
+
+1. copy the smartcn.solrindex.zip to the datafiles directory of your Stanbol instance ({working-dir}/stanbol/datafiles)
+2. rename it to the {name} of the SolrYard you want to create. The file name needs to be {name}.solrindex.zip
+3. create the SolrYard instance and configure the "Solr Index/Core" (org.apache.stanbol.entityhub.yard.solr.solrUri) to {name}. Make sure the "Use default SolrCore configuration" (org.apache.stanbol.entityhub.yard.solr.useDefaultConfig) is disabled.
+
+If you want to use the smartcn.solrindex.zip as default you can rename the file in the datafiles folder to "default.solrindex.zip" and then enable the "Use default SolrCore configuration" (org.apache.stanbol.entityhub.yard.solr.useDefaultConfig) when you configure a SolrYard instance.
+
+See also the documentation on how to [configure a managed site](http://stanbol.apache.org/docs/trunk/components/entityhub/managedsite#configuration-of-managedsites).

Added: stanbol/trunk/launchers/bundlelists/language-extras/smartcn/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/launchers/bundlelists/language-extras/smartcn/pom.xml?rev=1425643&view=auto
==============================================================================
--- stanbol/trunk/launchers/bundlelists/language-extras/smartcn/pom.xml (added)
+++ stanbol/trunk/launchers/bundlelists/language-extras/smartcn/pom.xml Mon Dec 24 13:00:47 2012
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  You under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.stanbol</groupId>
+    <artifactId>stanbol-parent</artifactId>
+    <version>2-SNAPSHOT</version>
+    <relativePath>../../../../parent</relativePath>
+  </parent>
+
+  <groupId>org.apache.stanbol</groupId>
+  <artifactId>org.apache.stanbol.launchers.bundlelists.languageextras.smartcn</artifactId>
+  <version>0.10.0-SNAPSHOT</version>
+  <packaging>partialbundlelist</packaging>
+
+  <name>Apache Stanbol Bundlelist for Language Support: Smart Chinese</name>
+    <description>
+      Provides modules that allow basic language support for Chinese using
+      the Solr/Lucene smartcn analyzer. This includes a (1) Bundle providing the
+      Solr Analyzer; (2) an NLP processing Engine that detects Sentences and
+      Tokenizes Chinese Text and (3) a LabelTokenizer needed to match tokens of
+      the analyzed text with the labels of Entities in the matched vocabularies.
+    </description>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.sling</groupId>
+        <artifactId>maven-launchpad-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+</project>

Propchange: stanbol/trunk/launchers/bundlelists/language-extras/smartcn/pom.xml
------------------------------------------------------------------------------
    svn:executable = *

Added: stanbol/trunk/launchers/bundlelists/language-extras/smartcn/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/launchers/bundlelists/language-extras/smartcn/src/main/bundles/list.xml?rev=1425643&view=auto
==============================================================================
--- stanbol/trunk/launchers/bundlelists/language-extras/smartcn/src/main/bundles/list.xml (added)
+++ stanbol/trunk/launchers/bundlelists/language-extras/smartcn/src/main/bundles/list.xml Mon Dec 24 13:00:47 2012
@@ -0,0 +1,59 @@
+<?xml version="1.0" ?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<bundles>
+  <!-- 
+   The smartcn analyzer bundle (extension to o.a.s.commons.solr.core module)
+   -->
+  <startLevel level="28"> <!-- commons.solr.core uses startlevel 27 -->
+    <bundle>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.commons.solr.extras.smartcn</artifactId>
+      <version>0.10.0-SNAPSHOT</version>
+    </bundle>
+  </startLevel>
+  
+  <!--
+   The Chinese sentence detection and tokenizer EnhancementEngine based on
+   smartcn
+   -->
+  <startLevel level="35"> <!-- same startlevel as other Enhancement Engines -->
+    <bundle>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.engines.smartcn.token</artifactId>
+      <version>0.10.0-SNAPSHOT</version>
+    </bundle>
+  </startLevel>
+
+  <!--
+   The Chinese LabelTokenizer required by the EntityLinkingEngine to compare
+   Tokens in the AnalyzedText with Labels of the Entities found in the
+   Controlled vocabulary.
+   -->
+  <!-- 
+   The startlevel needs to be greater than that of the EntityLinkingEngine
+   (o.a.s.enhancer.engines.entitylinking.engine) module
+   -->
+  <startLevel level="36">
+    <bundle>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.smartcn</artifactId>
+      <version>0.10.0-SNAPSHOT</version>
+    </bundle>
+  </startLevel>
+  
+</bundles>
\ No newline at end of file

Modified: stanbol/trunk/launchers/bundlelists/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/launchers/bundlelists/pom.xml?rev=1425643&r1=1425642&r2=1425643&view=diff
==============================================================================
--- stanbol/trunk/launchers/bundlelists/pom.xml (original)
+++ stanbol/trunk/launchers/bundlelists/pom.xml Mon Dec 24 13:00:47 2012
@@ -79,6 +79,9 @@
     <!-- Specific features -->
     <module>security</module>
     <module>zzshell</module>
+    
+    <!-- language specific extensions -->
+    <module>language-extras/smartcn</module>
   </modules>
 
   <profiles>