You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/07/12 10:21:14 UTC
svn commit: r1360575 - in /incubator/stanbol/trunk:
commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/
data/bundlelists/opennlp/src/main/bundles/ data/opennlp/lang/es/
data/opennlp/lang/es/src/ data/opennlp/lang/es/src/main/ data/opennlp...
Author: rwesten
Date: Thu Jul 12 08:21:14 2012
New Revision: 1360575
URL: http://svn.apache.org/viewvc?rev=1360575&view=rev
Log:
fixes STANBOL-688 by adding a module for Spanish POS and updating the POSTagsCollectionEnum
Added:
incubator/stanbol/trunk/data/opennlp/lang/es/ (with props)
incubator/stanbol/trunk/data/opennlp/lang/es/README.md
incubator/stanbol/trunk/data/opennlp/lang/es/download_models.xml (with props)
incubator/stanbol/trunk/data/opennlp/lang/es/pom.xml (with props)
incubator/stanbol/trunk/data/opennlp/lang/es/src/
incubator/stanbol/trunk/data/opennlp/lang/es/src/main/
incubator/stanbol/trunk/data/opennlp/lang/es/src/main/resources/
incubator/stanbol/trunk/data/opennlp/lang/es/src/main/resources/org/ (with props)
Modified:
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
incubator/stanbol/trunk/data/bundlelists/opennlp/src/main/bundles/list.xml
Modified: incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java?rev=1360575&r1=1360574&r2=1360575&view=diff
==============================================================================
--- incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java (original)
+++ incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java Thu Jul 12 08:21:14 2012
@@ -180,7 +180,28 @@ public enum PosTagsCollectionEnum {
* NOTES: this includes prepositions, Part of idiom, Infinitive marker
* as well as all kinds of punctuations
*/
- SV_FOLLOW("sv",PosTypeCollectionType.FOLLOW,"PR","ID","IM","I?","IC","IG","IK","IP","IQ","IR","IS","IT","IU");
+ SV_FOLLOW("sv",PosTypeCollectionType.FOLLOW,"PR","ID","IM","I?","IC","IG","IK","IP","IQ","IR","IS","IT","IU"),
+ /**
+ * Nouns related POS types for Spanish language.
+ * I was not able to find the list, so POS tag results were used to
+ * create this configuration.
+ */
+ ES_NOUN("es",PosTypeCollectionType.NOUN,"NC","NP","Z"),
+ /**
+ * Verb related POS types for Spanish language.
+ * I was not able to find the list, so POS tag results were used to
+ * create this configuration
+ */
+ ES_VERB("es",PosTypeCollectionType.VERB,"VMI", "VMS", "VMM", "VMC", "VMN",
+ "VMG", "VMP", "VAI", "VAS","VAM", "VAC", "VAN", "VAG", "VAP"),
+ /**
+ * POS types one needs typically to follow to build {@link Chunk}s over
+ * Nouns (e.g. "University_NN of_IN Otago_NNP" or "Geneva_NNP ,_, Ohio_NNP").
+ * I was not able to find the list, so POS tag results were used to
+ * create this configuration.<p>
+ * For now "SP" and all "F*" tokens referring to '.', ';', ...
+ */
+ ES_FOLLOW("es",PosTypeCollectionType.FOLLOW,"AQ","SP","Fc","Ft","Fp","Fe","Fd","Fx","Fat","Fit","Fpa","Fpt","Fg","Faa","Ft");
Set<String> tags;
private String language;
private PosTypeCollectionType type;
Modified: incubator/stanbol/trunk/data/bundlelists/opennlp/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/data/bundlelists/opennlp/src/main/bundles/list.xml?rev=1360575&r1=1360574&r2=1360575&view=diff
==============================================================================
--- incubator/stanbol/trunk/data/bundlelists/opennlp/src/main/bundles/list.xml (original)
+++ incubator/stanbol/trunk/data/bundlelists/opennlp/src/main/bundles/list.xml Thu Jul 12 08:21:14 2012
@@ -46,6 +46,11 @@
<artifactId>org.apache.stanbol.data.opennlp.lang.en</artifactId>
<version>1.0.2-incubating-SNAPSHOT</version>
</bundle>
+ <bundle> <!-- Spanish POS tagging support (STANBOL-688) -->
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.data.opennlp.lang.es</artifactId>
+ <version>1.0.0-incubating-SNAPSHOT</version>
+ </bundle>
<bundle>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.data.opennlp.lang.da</artifactId>
Propchange: incubator/stanbol/trunk/data/opennlp/lang/es/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Thu Jul 12 08:21:14 2012
@@ -0,0 +1,7 @@
+.project
+
+.settings
+
+target
+
+.classpath
Added: incubator/stanbol/trunk/data/opennlp/lang/es/README.md
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/data/opennlp/lang/es/README.md?rev=1360575&view=auto
==============================================================================
--- incubator/stanbol/trunk/data/opennlp/lang/es/README.md (added)
+++ incubator/stanbol/trunk/data/opennlp/lang/es/README.md Thu Jul 12 08:21:14 2012
@@ -0,0 +1,37 @@
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements. See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+# Data files Bundles for OpenNLP
+
+This source repository only holds the pom.xml file and folder structure of this bundle.
+
+To avoid loading the Subversion repository with large binary files, this artifact has to be built and deployed manually so that the precomputed models are retrieved from other sites.
+
+
+## Downloading the OpenNLP statistical model
+
+The OpenNLP models are downloaded from
+
+ http://opennlp.sourceforge.net/models-1.5
+ https://github.com/utcompling/OpenNLP-Models
+
+These URLs are defined as properties in the 'pom.xml'.
+The list of downloaded files is defined within the 'download_models.xml'.
+
+## NOTES
+
+* Using this bundle is only an alternative to manually copying the required OpenNLP models to the '{stanbol-installation}/stanbol/datafiles'.
+* This uses the Sentence detector for Portuguese as there is none available for Spanish
+* The POS model for Spanish is downloaded from GitHub
Added: incubator/stanbol/trunk/data/opennlp/lang/es/download_models.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/data/opennlp/lang/es/download_models.xml?rev=1360575&view=auto
==============================================================================
--- incubator/stanbol/trunk/data/opennlp/lang/es/download_models.xml (added)
+++ incubator/stanbol/trunk/data/opennlp/lang/es/download_models.xml Thu Jul 12 08:21:14 2012
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="OpenNLP Model Download Helper" default="download" basedir=".">
+ <description>
+ Contains only a single target that is used by the Maven Ant
+ Plugin to download OpenNLP Models from the Web
+ </description>
+
+ <target name="download">
+ <copy toDir="${target.directory}/">
+ <!-- there is no sentence detector for es
+ so we use the pt for now. -->
+ <resources>
+ <url url="${sf.model.url}/pt-sent.bin"/>
+ </resources>
+ <mergemapper to="es-sent.bin"/>
+ </copy>
+ <copy toDir="${target.directory}/">
+ <!-- the POS model for es is fetched from the
+ git.model.url location and renamed. -->
+ <resources>
+ <url url="${git.model.url}/opennlp-es-maxent-pos-es.bin"/>
+ </resources>
+ <mergemapper to="es-pos-maxent.bin"/>
+ </copy>
+ </target>
+</project>
\ No newline at end of file
Propchange: incubator/stanbol/trunk/data/opennlp/lang/es/download_models.xml
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/data/opennlp/lang/es/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/data/opennlp/lang/es/pom.xml?rev=1360575&view=auto
==============================================================================
--- incubator/stanbol/trunk/data/opennlp/lang/es/pom.xml (added)
+++ incubator/stanbol/trunk/data/opennlp/lang/es/pom.xml Thu Jul 12 08:21:14 2012
@@ -0,0 +1,134 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.data.parent</artifactId>
+ <version>0.10.1-incubating-SNAPSHOT</version>
+ <relativePath>../../../parent</relativePath>
+ </parent>
+
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.data.opennlp.lang.es</artifactId>
+ <version>1.0.0-incubating-SNAPSHOT</version>
+ <packaging>bundle</packaging>
+
+ <name>Apache Stanbol Data: OpenNLP Models for Spanish</name>
+ <description>
+ Bundle containing all necessary/available models for parsing Spanish language texts.
+ This does not include Models for named entity recognition (NER).
+ </description>
+ <inceptionYear>2012</inceptionYear>
+
+ <scm>
+ <connection>
+ scm:svn:http://svn.apache.org/repos/asf/incubator/stanbol/trunk/data/opennlp/lang/es
+ </connection>
+ <developerConnection>
+ scm:svn:https://svn.apache.org/repos/asf/incubator/stanbol/trunk/data/opennlp/lang/es
+ </developerConnection>
+ <url>http://incubator.apache.org/stanbol/</url>
+ </scm>
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ <!-- define the path to/home of the OpenNLP models-->
+ <opennlp.model.path>org/apache/stanbol/data/opennlp</opennlp.model.path>
+ <opennlp.model.home>http://dev.iks-project.eu/downloads/opennlp/models-1.5</opennlp.model.home>
+ <opennlp.git.model.home>https://github.com/utcompling/OpenNLP-Models/raw/58ef0c60031403e66e47ae35edaf58d3478b67af/models/es</opennlp.git.model.home>
+ </properties>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <configuration>
+ <instructions>
+ <_versionpolicy>$${version;===;${@}}</_versionpolicy>
+
+ <!--
+ Extension used to provide files in that directory to the
+ DataFileProvider
+ -->
+ <Data-Files>${opennlp.model.path}</Data-Files>
+ <!--
+ Use a priority lower than 0 to allow providers without a
+ defined ranking to override this default data.
+ -->
+ <Data-Files-Priority>
+ -100
+ </Data-Files-Priority>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <!--
+ Ant is used to download the models from the
+ http://opennlp.sourceforge.net site.
+ -->
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-antrun-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>compile</id>
+ <phase>compile</phase>
+ <configuration>
+ <!--
+ TODO: I would like to add an "unless" constraint to the
+ target that prevents execution if Maven operates in offline
+ mode. However I was not able to find out how to obtain this
+ information. ${settings.offline} (as noted by several
+ resources) does not work.
+ Until this is fixed, builds will fail if no internet connection is
+ available!
+ -->
+ <target>
+ <property name="target.directory" value="${project.basedir}/src/main/resources/${opennlp.model.path}"/>
+ <property name="sf.model.url" value="${opennlp.model.home}"/>
+ <property name="git.model.url" value="${opennlp.git.model.home}"/>
+
+ <echo message="copy OpenNLP models"/>
+ <echo message=" FROM ${model.url} "/>
+ <echo message=" TO ${target.directory}"/>
+
+ <ant antfile="${basedir}/download_models.xml">
+ <target name="download"/>
+ </ant>
+ </target>
+ </configuration>
+ <goals>
+ <goal>run</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.rat</groupId>
+ <artifactId>apache-rat-plugin</artifactId>
+ <configuration>
+ <excludes><!-- exclude OpenNLP model files -->
+ <exclude>**/*.bin</exclude>
+ </excludes>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+</project>
Propchange: incubator/stanbol/trunk/data/opennlp/lang/es/pom.xml
------------------------------------------------------------------------------
svn:mime-type = text/plain
Propchange: incubator/stanbol/trunk/data/opennlp/lang/es/src/main/resources/org/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Thu Jul 12 08:21:14 2012
@@ -0,0 +1 @@
+apache