You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/07/12 10:21:14 UTC

svn commit: r1360575 - in /incubator/stanbol/trunk: commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/ data/bundlelists/opennlp/src/main/bundles/ data/opennlp/lang/es/ data/opennlp/lang/es/src/ data/opennlp/lang/es/src/main/ data/opennlp...

Author: rwesten
Date: Thu Jul 12 08:21:14 2012
New Revision: 1360575

URL: http://svn.apache.org/viewvc?rev=1360575&view=rev
Log:
fixes STANBOL-688 by adding a module for Spanish POS and updating the POSTagsCollectionEnum

Added:
    incubator/stanbol/trunk/data/opennlp/lang/es/   (with props)
    incubator/stanbol/trunk/data/opennlp/lang/es/README.md
    incubator/stanbol/trunk/data/opennlp/lang/es/download_models.xml   (with props)
    incubator/stanbol/trunk/data/opennlp/lang/es/pom.xml   (with props)
    incubator/stanbol/trunk/data/opennlp/lang/es/src/
    incubator/stanbol/trunk/data/opennlp/lang/es/src/main/
    incubator/stanbol/trunk/data/opennlp/lang/es/src/main/resources/
    incubator/stanbol/trunk/data/opennlp/lang/es/src/main/resources/org/   (with props)
Modified:
    incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
    incubator/stanbol/trunk/data/bundlelists/opennlp/src/main/bundles/list.xml

Modified: incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java?rev=1360575&r1=1360574&r2=1360575&view=diff
==============================================================================
--- incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java (original)
+++ incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java Thu Jul 12 08:21:14 2012
@@ -180,7 +180,28 @@ public enum PosTagsCollectionEnum {
      * NOTES: this includes  prepositions, Part of idiom, Infinitive marker
      *  as well as all kinds of punctuations
      */
-    SV_FOLLOW("sv",PosTypeCollectionType.FOLLOW,"PR","ID","IM","I?","IC","IG","IK","IP","IQ","IR","IS","IT","IU");
+    SV_FOLLOW("sv",PosTypeCollectionType.FOLLOW,"PR","ID","IM","I?","IC","IG","IK","IP","IQ","IR","IS","IT","IU"),
+    /**
+     * Nouns related POS types for Spanish language.
+     * I was not able to find the list, so POS tag results were used to
+     * create this configuration.
+     */
+    ES_NOUN("es",PosTypeCollectionType.NOUN,"NC","NP","Z"),
+    /**
+     * Verb related POS types for Spanish language.
+     * I was not able to find the list, so POS tag results were used to
+     * create this configuration.
+     */
+    ES_VERB("es",PosTypeCollectionType.VERB,"VMI", "VMS", "VMM", "VMC", "VMN",
+        "VMG", "VMP", "VAI", "VAS","VAM", "VAC", "VAN", "VAG", "VAP"),
+    /**
+     * POS types one needs typically to follow to build {@link Chunk}s over 
+     * Nouns (e.g. "University_NN of_IN Otago_NNP" or "Geneva_NNP ,_, Ohio_NNP").
+     * I was not able to find the list, so POS tag results were used to
+     * create this configuration.<p>
+     * For now "SP" and all "F*" tokens referring to '.', ';', ...
+     */
+    ES_FOLLOW("es",PosTypeCollectionType.FOLLOW,"AQ","SP","Fc","Ft","Fp","Fe","Fd","Fx","Fat","Fit","Fpa","Fpt","Fg","Faa","Ft");
     Set<String> tags;
     private String language;
     private PosTypeCollectionType type;

Modified: incubator/stanbol/trunk/data/bundlelists/opennlp/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/data/bundlelists/opennlp/src/main/bundles/list.xml?rev=1360575&r1=1360574&r2=1360575&view=diff
==============================================================================
--- incubator/stanbol/trunk/data/bundlelists/opennlp/src/main/bundles/list.xml (original)
+++ incubator/stanbol/trunk/data/bundlelists/opennlp/src/main/bundles/list.xml Thu Jul 12 08:21:14 2012
@@ -46,6 +46,11 @@
       <artifactId>org.apache.stanbol.data.opennlp.lang.en</artifactId>
       <version>1.0.2-incubating-SNAPSHOT</version>
     </bundle>
+    <bundle> <!-- Spanish POS tagging support (STANBOL-688) -->
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.data.opennlp.lang.es</artifactId>
+      <version>1.0.0-incubating-SNAPSHOT</version>
+    </bundle>
     <bundle>
       <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.data.opennlp.lang.da</artifactId>

Propchange: incubator/stanbol/trunk/data/opennlp/lang/es/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Thu Jul 12 08:21:14 2012
@@ -0,0 +1,7 @@
+.project
+
+.settings
+
+target
+
+.classpath

Added: incubator/stanbol/trunk/data/opennlp/lang/es/README.md
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/data/opennlp/lang/es/README.md?rev=1360575&view=auto
==============================================================================
--- incubator/stanbol/trunk/data/opennlp/lang/es/README.md (added)
+++ incubator/stanbol/trunk/data/opennlp/lang/es/README.md Thu Jul 12 08:21:14 2012
@@ -0,0 +1,37 @@
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+# Data files Bundles for OpenNLP
+
+This source repository only holds the pom.xml file and folder structure of this bundle.
+
+To avoid loading the Subversion repository with large binary files, this artifact has to be built and deployed manually; the build retrieves the precomputed models from other sites.
+
+
+## Downloading the OpenNLP statistical model 
+
+The OpenNLP models are downloaded from 
+
+    http://opennlp.sourceforge.net/models-1.5
+    https://github.com/utcompling/OpenNLP-Models
+
+The download URLs are defined as properties in the 'pom.xml'.
+The list of downloaded files is defined in 'download_models.xml'.
+
+## NOTES
+
+* Using this bundle is only an alternative to manually copying the required OpenNLP models to '{stanbol-installation}/stanbol/datafiles'.
+* This uses the sentence detector for Portuguese, as there is none available for Spanish.
+* The POS model for Spanish is downloaded from GitHub.

Added: incubator/stanbol/trunk/data/opennlp/lang/es/download_models.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/data/opennlp/lang/es/download_models.xml?rev=1360575&view=auto
==============================================================================
--- incubator/stanbol/trunk/data/opennlp/lang/es/download_models.xml (added)
+++ incubator/stanbol/trunk/data/opennlp/lang/es/download_models.xml Thu Jul 12 08:21:14 2012
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<project name="OpenNLP Model Download Helper" default="download" basedir=".">
+  <description>
+    Contains only a single target that is used by the Maven Ant
+    Plugin to download OpenNLP models from the Web
+  </description>
+   
+  <target name="download">
+    <copy toDir="${target.directory}/">
+        <!-- there is no sentence detector for es
+             so we use the pt for now. -->
+        <resources>
+            <url url="${sf.model.url}/pt-sent.bin"/>
+        </resources>
+        <mergemapper to="es-sent.bin"/>
+    </copy>
+    <copy toDir="${target.directory}/">
+        <!-- the Spanish POS (maxent) model is fetched from the
+             git model URL and stored as es-pos-maxent.bin. -->
+        <resources>
+            <url url="${git.model.url}/opennlp-es-maxent-pos-es.bin"/>
+        </resources>
+        <mergemapper to="es-pos-maxent.bin"/>
+    </copy>
+  </target>
+</project>
\ No newline at end of file

Propchange: incubator/stanbol/trunk/data/opennlp/lang/es/download_models.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/stanbol/trunk/data/opennlp/lang/es/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/data/opennlp/lang/es/pom.xml?rev=1360575&view=auto
==============================================================================
--- incubator/stanbol/trunk/data/opennlp/lang/es/pom.xml (added)
+++ incubator/stanbol/trunk/data/opennlp/lang/es/pom.xml Thu Jul 12 08:21:14 2012
@@ -0,0 +1,134 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.stanbol</groupId>
+    <artifactId>org.apache.stanbol.data.parent</artifactId>
+    <version>0.10.1-incubating-SNAPSHOT</version>
+    <relativePath>../../../parent</relativePath>
+  </parent>
+
+  <groupId>org.apache.stanbol</groupId>
+  <artifactId>org.apache.stanbol.data.opennlp.lang.es</artifactId>
+  <version>1.0.0-incubating-SNAPSHOT</version>
+  <packaging>bundle</packaging>
+
+  <name>Apache Stanbol Data: OpenNLP Models for Spanish</name>
+  <description>
+    Bundle containing all necessary/available models for parsing Spanish language texts.
+    This does not include models for named entity recognition (NER).
+  </description>
+  <inceptionYear>2012</inceptionYear>
+
+  <scm>
+    <connection>
+      scm:svn:http://svn.apache.org/repos/asf/incubator/stanbol/trunk/data/opennlp/lang/es
+    </connection>
+    <developerConnection>
+      scm:svn:https://svn.apache.org/repos/asf/incubator/stanbol/trunk/data/opennlp/lang/es
+    </developerConnection>
+    <url>http://incubator.apache.org/stanbol/</url>
+  </scm>
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <!-- define the path to/home of the OpenNLP models -->
+    <opennlp.model.path>org/apache/stanbol/data/opennlp</opennlp.model.path>
+    <opennlp.model.home>http://dev.iks-project.eu/downloads/opennlp/models-1.5</opennlp.model.home>
+    <opennlp.git.model.home>https://github.com/utcompling/OpenNLP-Models/raw/58ef0c60031403e66e47ae35edaf58d3478b67af/models/es</opennlp.git.model.home>
+  </properties>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <configuration>
+          <instructions>
+            <_versionpolicy>$${version;===;${@}}</_versionpolicy>
+
+            <!-- 
+              Extension used to provide files in that directory to the
+              DataFileProvider
+              -->
+            <Data-Files>${opennlp.model.path}</Data-Files>
+            <!-- 
+              Use a priority lower than 0 to allow providers without a
+              defined ranking to override this default data.
+             -->
+            <Data-Files-Priority>
+              -100
+            </Data-Files-Priority>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <!-- 
+          Ant is used to download the models from the URLs configured in
+          the opennlp.model.home and opennlp.git.model.home properties.
+        -->
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-antrun-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>compile</id>
+            <phase>compile</phase>
+            <configuration>
+              <!--
+                TODO: I would like to add an "unless" constraint to the
+                target that prevents execution if Maven operates in offline
+                mode. However I was not able to find out how to obtain this
+                information. ${settings.offline} (as noted by several
+                resources) does not work.
+                Until fixed, builds will fail if no Internet connection is
+                available!
+              -->
+              <target>
+                <property name="target.directory" value="${project.basedir}/src/main/resources/${opennlp.model.path}"/>
+                <property name="sf.model.url" value="${opennlp.model.home}"/>
+                <property name="git.model.url" value="${opennlp.git.model.home}"/>
+                <!-- the three properties above are read by download_models.xml -->
+                <echo message="copy OpenNLP models"/>
+                <echo message="  FROM ${sf.model.url} and ${git.model.url}"/>
+                <echo message="  TO ${target.directory}"/>
+
+                <ant antfile="${basedir}/download_models.xml">
+                  <target name="download"/>
+                </ant>
+              </target>
+            </configuration>
+            <goals>
+              <goal>run</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.rat</groupId>
+        <artifactId>apache-rat-plugin</artifactId>
+        <configuration>
+          <excludes><!-- exclude OpenNLP model files -->
+              <exclude>**/*.bin</exclude>
+          </excludes>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+</project>

Propchange: incubator/stanbol/trunk/data/opennlp/lang/es/pom.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Propchange: incubator/stanbol/trunk/data/opennlp/lang/es/src/main/resources/org/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Thu Jul 12 08:21:14 2012
@@ -0,0 +1 @@
+apache