You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2016/12/28 03:45:05 UTC

[1/6] opennlp git commit: Morfologik add-on depends on the Tools version as stated in parent pom.

Repository: opennlp
Updated Branches:
  refs/heads/trunk 75ab49511 -> 34730d200


Morfologik add-on depends on the Tools version as stated in parent pom.

Fixed pom.xml by removing hardcoded version

See issue OPENNLP-902


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/e5937c9e
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/e5937c9e
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/e5937c9e

Branch: refs/heads/trunk
Commit: e5937c9ecfe85d8e2f657f48f20d51062fb54851
Parents: a36d013
Author: William Colen <co...@apache.org>
Authored: Wed Dec 28 00:58:08 2016 -0200
Committer: William Colen <co...@apache.org>
Committed: Wed Dec 28 00:58:08 2016 -0200

----------------------------------------------------------------------
 opennlp-morfologik-addon/pom.xml | 1 -
 1 file changed, 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/e5937c9e/opennlp-morfologik-addon/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/pom.xml b/opennlp-morfologik-addon/pom.xml
index ddfedf2..e708267 100644
--- a/opennlp-morfologik-addon/pom.xml
+++ b/opennlp-morfologik-addon/pom.xml
@@ -91,7 +91,6 @@
 		<dependency>
 			<groupId>org.apache.opennlp</groupId>
 			<artifactId>opennlp-tools</artifactId>
-			<version>1.6.0</version>
 		</dependency>
 
 		<dependency>


[5/6] opennlp git commit: Adds a small documentation section for Morfologik add-on

Posted by co...@apache.org.
Adds a small documentation section for Morfologik add-on

See issue OPENNLP-902


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/4f2441bc
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/4f2441bc
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/4f2441bc

Branch: refs/heads/trunk
Commit: 4f2441bc1b50502b95a86bff94e8a9544322baf5
Parents: 001b970
Author: William Colen <co...@apache.org>
Authored: Wed Dec 28 01:43:55 2016 -0200
Committer: William Colen <co...@apache.org>
Committed: Wed Dec 28 01:43:55 2016 -0200

----------------------------------------------------------------------
 .../src/docbkx/morfologik-addon.out.xml         |   0
 opennlp-docs/src/docbkx/morfologik-addon.xml    | 153 +++++++++++++++++++
 opennlp-docs/src/docbkx/opennlp.xml             |   1 +
 3 files changed, 154 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/4f2441bc/opennlp-docs/src/docbkx/morfologik-addon.out.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/src/docbkx/morfologik-addon.out.xml b/opennlp-docs/src/docbkx/morfologik-addon.out.xml
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/opennlp/blob/4f2441bc/opennlp-docs/src/docbkx/morfologik-addon.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/src/docbkx/morfologik-addon.xml b/opennlp-docs/src/docbkx/morfologik-addon.xml
new file mode 100644
index 0000000..6f18844
--- /dev/null
+++ b/opennlp-docs/src/docbkx/morfologik-addon.xml
@@ -0,0 +1,153 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE chapter PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
+"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"[
+]>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+	license agreements. See the NOTICE file distributed with this work for additional 
+	information regarding copyright ownership. The ASF licenses this file to 
+	you under the Apache License, Version 2.0 (the "License"); you may not use 
+	this file except in compliance with the License. You may obtain a copy of 
+	the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+	by applicable law or agreed to in writing, software distributed under the 
+	License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+	OF ANY KIND, either express or implied. See the License for the specific 
+	language governing permissions and limitations under the License. -->
+
+
+<chapter id="tools.morfologik-addon">
+	<title>Morfologik Addon</title>
+		<para>
+			<ulink url="https://github.com/morfologik/morfologik-stemming"><citetitle>Morfologik</citetitle></ulink>
+			provides tools for finite state automata (FSA) construction and dictionary-based morphological dictionaries.
+		</para>
+		<para>
+			The Morfologik Addon implements OpenNLP interfaces and extensions to allow the use of FSA Morfologik dictionary tools.
+		</para>
+		<section id="tools.morfologik-addon.api">
+			<title>Morfologik Integration</title>
+			<para>
+			To allow for an easy integration with OpenNLP, the following implementations are provided:
+			<itemizedlist mark='opencircle'>
+				<listitem>
+					<para>
+					The <code>MorfologikPOSTaggerFactory</code> extends <code>POSTaggerFactory</code>, which helps create a POSTagger model with an embedded FSA TagDictionary.
+					</para>
+				</listitem>
+				<listitem>
+					<para>
+					The <code>MorfologikTagDictionary</code> implements an FSA-based <code>TagDictionary</code>, allowing for much smaller files than the default XML-based one, with improved memory consumption.
+					</para>
+				</listitem>
+				<listitem>
+					<para>
+					The <code>MorfologikLemmatizer</code> implements a <code>Lemmatizer</code> backed by FSA-based dictionaries.
+					</para>
+				</listitem>
+			</itemizedlist>
+		</para>
+		<para>
+		The first two implementations can be used directly from the command line, as in the example below. Given an FSA Morfologik dictionary (see the next section for how to build one), you can train a POS Tagger
+		model with an embedded FSA dictionary. 
+		</para>
+		<para>
+		The example trains a POSTagger with a CONLL corpus named <code>portuguese_bosque_train.conll</code> and a FSA dictionary named 
+		<code>pt-morfologik.dict</code>. It will output a model named <code>pos-pt_fsadic.model</code>.
+		
+		<screen>
+		<![CDATA[
+$ bin/opennlp POSTaggerTrainer -type perceptron -lang pt -model pos-pt_fsadic.model -data portuguese_bosque_train.conll \
+	 -encoding UTF-8 -factory opennlp.morfologik.tagdict.MorfologikPOSTaggerFactory -dict pt-morfologik.dict]]>
+		</screen>
+		
+		</para>
+		<para>
+		Another example follows. It shows how to use the <code>MorfologikLemmatizer</code>. You will need a lemma dictionary and an info file; in this example, we will use a very small Portuguese dictionary.
+		Its syntax is <code>lemma,lexeme,postag</code>.
+		</para>
+		<para>
+		File <code>lemmaDictionary.txt:</code>
+		<screen>
+		<![CDATA[
+casa,casa,NOUN
+casar,casa,V
+casar,casar,V-INF
+Casa,Casa,PROP
+casa,casinha,NOUN
+casa,casona,NOUN
+menino,menina,NOUN
+menino,menino,NOUN
+menino,meninão,NOUN
+menino,menininho,NOUN
+carro,carro,NOUN]]>
+		</screen>
+		</para>
+		<para>
+		Mandatory metadata file, which must have the same name but .info extension <code>lemmaDictionary.info:</code>
+		<screen>
+		<![CDATA[
+#
+# REQUIRED PROPERTIES
+#
+
+# Column (lemma, inflected, tag) separator. This must be a single byte in the target encoding.
+fsa.dict.separator=,
+
+# The charset in which the input is encoded. UTF-8 is strongly recommended.
+fsa.dict.encoding=UTF-8
+
+# The type of lemma-inflected form encoding compression that precedes automaton
+# construction. Allowed values: [suffix, infix, prefix, none].
+# Details are in Daciuk's paper and in the code. 
+# Leave at 'prefix' if not sure.
+fsa.dict.encoder=prefix
+		]]>
+		</screen>
+		</para>
+		<para>
+		The following code creates a binary FSA Morfologik dictionary, loads it in MorfologikLemmatizer and uses it to 
+		find the lemmas of the word "casa" as a noun and as a verb.
+		
+				<programlisting language="java">
+		<![CDATA[
+// Part 1: compile a FSA lemma dictionary 
+   
+// we need the tabular dictionary. It is mandatory to have info 
+//  file with same name, but .info extension
+Path textLemmaDictionary = Paths.get("dictionaryWithLemma.txt");
+
+// this will build a binary dictionary located in compiledLemmaDictionary
+Path compiledLemmaDictionary = new MorfologikDictionayBuilder()
+    .build(textLemmaDictionary);
+
+// Part 2: load a MorfologikLemmatizer and use it
+MorfologikLemmatizer lemmatizer = new MorfologikLemmatizer(compiledLemmaDictionary);
+
+String[] toks = {"casa", "casa"};
+String[] tags = {"NOUN", "V"};
+
+String[] lemmas = lemmatizer.lemmatize(toks, tags);
+System.out.println(Arrays.toString(lemmas)); // outputs [casa, casar]
+    ]]>
+			</programlisting>
+		
+		</para>
+		</section>
+		<section id="tools.morfologik-addon.cmdline">
+			<title>Morfologik CLI Tools</title>
+			<para>
+				The Morfologik addon provides command line tools. <code>XMLDictionaryToTable</code> makes it easy to convert from an OpenNLP XML based dictionary
+				to a tabular format. <code>MorfologikDictionaryBuilder</code> can take a tabular dictionary and output a binary Morfologik FSA dictionary.
+			</para>
+			<screen>
+		<![CDATA[
+$ sh bin/morfologik-addon
+OpenNLP Morfologik Addon. Usage: opennlp-morfologik-addon TOOL
+where TOOL is one of:
+  MorfologikDictionaryBuilder    builds a binary POS Dictionary using Morfologik
+  XMLDictionaryToTable           reads an OpenNLP XML tag dictionary and outputs it in a tabular file
+All tools print help when invoked with help parameter
+Example: opennlp-morfologik-addon POSDictionaryBuilder help
+		]]>
+		</screen>
+		</section>
+</chapter>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/4f2441bc/opennlp-docs/src/docbkx/opennlp.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/src/docbkx/opennlp.xml b/opennlp-docs/src/docbkx/opennlp.xml
index 257bbb4..172d06c 100644
--- a/opennlp-docs/src/docbkx/opennlp.xml
+++ b/opennlp-docs/src/docbkx/opennlp.xml
@@ -89,5 +89,6 @@ under the License.
 	<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="./corpora.xml" />
 	<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="./machine-learning.xml" />
 	<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="./uima-integration.xml" />
+	<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="./morfologik-addon.xml" />
 	<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="./cli.xml" />
 </book>


[2/6] opennlp git commit: Removes distribution related files

Posted by co...@apache.org.
Removes distribution related files

Now that the Morfologik add-on is part of the main distribution, we don’t need license and readme files, or assembly instructions

See issue OPENNLP-902


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/060c99a6
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/060c99a6
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/060c99a6

Branch: refs/heads/trunk
Commit: 060c99a6b109c452c0f6a5c63b9f30edd8d52c5b
Parents: e5937c9
Author: William Colen <co...@apache.org>
Authored: Wed Dec 28 01:01:43 2016 -0200
Committer: William Colen <co...@apache.org>
Committed: Wed Dec 28 01:01:43 2016 -0200

----------------------------------------------------------------------
 opennlp-morfologik-addon/pom.xml                |  27 ---
 .../src/main/assembly/bin.xml                   |  91 --------
 .../src/main/assembly/src.xml                   |  39 ----
 .../src/main/readme/LICENSE                     | 230 -------------------
 .../src/main/readme/MORFOLOGIK-LICENSE          |  28 ---
 opennlp-morfologik-addon/src/main/readme/NOTICE |  11 -
 6 files changed, 426 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/060c99a6/opennlp-morfologik-addon/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/pom.xml b/opennlp-morfologik-addon/pom.xml
index e708267..d7bd311 100644
--- a/opennlp-morfologik-addon/pom.xml
+++ b/opennlp-morfologik-addon/pom.xml
@@ -41,33 +41,6 @@
 					</execution>
 				</executions>
 			</plugin>
-			<plugin> 
-	        <artifactId>maven-antrun-plugin</artifactId> 
-	        <version>1.6</version> 
-	        <executions> 
-	          <execution> 
-	            <id>generate checksums for binary artifacts</id> 
-	            <goals><goal>run</goal></goals> 
-	            <phase>verify</phase> 
-	            <configuration> 
-	              <target> 
-	                <checksum algorithm="sha1" format="MD5SUM"> 
-	                  <fileset dir="${project.build.directory}"> 
-	                    <include name="*.zip" /> 
-	                    <include name="*.gz" /> 
-	                  </fileset> 
-	                </checksum> 
-	                <checksum algorithm="md5" format="MD5SUM"> 
-	                  <fileset dir="${project.build.directory}"> 
-	                    <include name="*.zip" /> 
-	                    <include name="*.gz" /> 
-	                  </fileset> 
-	                </checksum> 
-	              </target> 
-	            </configuration> 
-	          </execution> 
-	        </executions> 
-	      </plugin>
 		</plugins>
 	</build>
 	<properties>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/060c99a6/opennlp-morfologik-addon/src/main/assembly/bin.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/assembly/bin.xml b/opennlp-morfologik-addon/src/main/assembly/bin.xml
deleted file mode 100644
index ab4f6da..0000000
--- a/opennlp-morfologik-addon/src/main/assembly/bin.xml
+++ /dev/null
@@ -1,91 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one
-   or more contributor license agreements.  See the NOTICE file
-   distributed with this work for additional information
-   regarding copyright ownership.  The ASF licenses this file
-   to you under the Apache License, Version 2.0 (the
-   "License"); you may not use this file except in compliance
-   with the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing,
-   software distributed under the License is distributed on an
-   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-   KIND, either express or implied.  See the License for the
-   specific language governing permissions and limitations
-   under the License.    
--->
-
-<assembly>
-  <id>bin</id>
-  <formats>
-    <format>tar.gz</format>
-    <format>zip</format>
-    <format>dir</format>
-  </formats>
-  
-    <includeBaseDirectory>true</includeBaseDirectory>
-	<baseDirectory>/apache-opennlp-morfologik-addon-${project.version}</baseDirectory>
-  
-	<dependencySets>
-		<dependencySet>
-			<scope>runtime</scope>
-			<unpack>false</unpack>
-			<useProjectArtifact>false</useProjectArtifact>
-			<fileMode>644</fileMode>
-			<directoryMode>755</directoryMode>
-			<outputDirectory>lib</outputDirectory>
-			<useTransitiveDependencies>true</useTransitiveDependencies>
-		</dependencySet>
-	</dependencySets>
-	
-	<fileSets>
-	    <fileSet>
-	    	<directory>src/main/readme</directory>
-	    	<outputDirectory></outputDirectory>
-	    	<fileMode>644</fileMode>
-	    	<directoryMode>755</directoryMode>      
-	    </fileSet>
-		
-	    <fileSet>
-	      <directory>.</directory>
-	      <outputDirectory></outputDirectory>
-	      <filtered>true</filtered>
-	      <fileMode>644</fileMode>
-	      <directoryMode>755</directoryMode> 
-	      <includes>
-	        <include>README</include>
-	        <include>RELEASE_NOTES.html</include>
-	      </includes>       
-	    </fileSet>
-	    
-	    <fileSet>
-	      <directory>target</directory>
-	      <outputDirectory></outputDirectory>
-	      <fileMode>644</fileMode>
-	      <directoryMode>755</directoryMode> 
-	      <includes>
-	        <include>issuesFixed/**</include>      
-	      </includes>       
-	    </fileSet>
-	    
-		<fileSet>
-			<directory>src/main/bin</directory>
-			<fileMode>755</fileMode>
-			<directoryMode>755</directoryMode>
-			<outputDirectory>bin</outputDirectory>
-		</fileSet>
-		
-		  <fileSet>
-		    <directory>target</directory>
-		    <outputDirectory>lib</outputDirectory>
-		    <includes>
-		      <include>morfologik-addon-*.jar</include>
-		    </includes>
-		  </fileSet>
-		
-	</fileSets>
-</assembly>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/060c99a6/opennlp-morfologik-addon/src/main/assembly/src.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/assembly/src.xml b/opennlp-morfologik-addon/src/main/assembly/src.xml
deleted file mode 100644
index cdcc9d3..0000000
--- a/opennlp-morfologik-addon/src/main/assembly/src.xml
+++ /dev/null
@@ -1,39 +0,0 @@
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<assembly>
-  <id>src</id>
-  <formats>
-    <format>tar.gz</format>
-    <format>zip</format>
-  </formats>
-  
-  <baseDirectory>/apache-opennlp-${project.version}-src</baseDirectory>
-  
-  <fileSets>
-    <fileSet>
-      <directory>../</directory>
-      <outputDirectory></outputDirectory>
-      <excludes>
-        <exclude>**/target/**</exclude>
-        <exclude>**/.*/**</exclude>
-        <exclude>**/pom.xml.releaseBackup</exclude>
-        <exclude>**/release.properties</exclude>
-      </excludes>
-    </fileSet>
-  </fileSets>
-</assembly>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/060c99a6/opennlp-morfologik-addon/src/main/readme/LICENSE
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/readme/LICENSE b/opennlp-morfologik-addon/src/main/readme/LICENSE
deleted file mode 100644
index 576b4cf..0000000
--- a/opennlp-morfologik-addon/src/main/readme/LICENSE
+++ /dev/null
@@ -1,230 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-
-The following license applies to the Snowball stemmers:
-        
-        Copyright (c) 2001, Dr Martin Porter
-        Copyright (c) 2002, Richard Boulton
-        All rights reserved.
-        
-        Redistribution and use in source and binary forms, with or without
-        modification, are permitted provided that the following conditions are met:
-        
-            * Redistributions of source code must retain the above copyright notice,
-            * this list of conditions and the following disclaimer.
-            * Redistributions in binary form must reproduce the above copyright
-            * notice, this list of conditions and the following disclaimer in the
-            * documentation and/or other materials provided with the distribution.
-            * Neither the name of the copyright holders nor the names of its contributors
-            * may be used to endorse or promote products derived from this software
-            * without specific prior written permission.
-        
-        THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-        AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-        IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-        DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-        FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-        DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-        SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-        CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-        OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/060c99a6/opennlp-morfologik-addon/src/main/readme/MORFOLOGIK-LICENSE
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/readme/MORFOLOGIK-LICENSE b/opennlp-morfologik-addon/src/main/readme/MORFOLOGIK-LICENSE
deleted file mode 100644
index 0554010..0000000
--- a/opennlp-morfologik-addon/src/main/readme/MORFOLOGIK-LICENSE
+++ /dev/null
@@ -1,28 +0,0 @@
-Copyright (c) 2006 Dawid Weiss
-Copyright (c) 2007-2015 Dawid Weiss, Marcin Miłkowski
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification, 
-are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice, 
-    this list of conditions and the following disclaimer.
-    
-    * Redistributions in binary form must reproduce the above copyright notice, 
-    this list of conditions and the following disclaimer in the documentation 
-    and/or other materials provided with the distribution.
-    
-    * Neither the name of Morfologik nor the names of its contributors 
-    may be used to endorse or promote products derived from this software 
-    without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/060c99a6/opennlp-morfologik-addon/src/main/readme/NOTICE
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/readme/NOTICE b/opennlp-morfologik-addon/src/main/readme/NOTICE
deleted file mode 100644
index 73fb1d7..0000000
--- a/opennlp-morfologik-addon/src/main/readme/NOTICE
+++ /dev/null
@@ -1,11 +0,0 @@
-Apache OpenNLP
-Copyright 2010, 2013 The Apache Software Foundation
-
-This product includes software developed at
-The Apache Software Foundation (http://www.apache.org/).
-
-The snowball stemmers in
-opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball
-were developed by Martin Porter and Richard Boulton.
-The full snowball package is available from
-http://snowball.tartarus.org/


[4/6] opennlp git commit: Updates Morfologik add-on with 1.7.0 interfaces

Posted by co...@apache.org.
Updates Morfologik add-on with 1.7.0 interfaces

The Morfologik add-on was not compatible with the latest OpenNLP code. This also simplifies the implementation of the wrapper. Previous code was a little language specific.

See issue OPENNLP-902


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/001b9706
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/001b9706
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/001b9706

Branch: refs/heads/trunk
Commit: 001b970685ef0cb3904d2d8b0b2dfc2462eed870
Parents: 486b880
Author: William Colen <co...@apache.org>
Authored: Wed Dec 28 01:17:13 2016 -0200
Committer: William Colen <co...@apache.org>
Committed: Wed Dec 28 01:17:13 2016 -0200

----------------------------------------------------------------------
 .../builder/XMLDictionaryToTableTool.java       |   2 +-
 .../lemmatizer/MorfologikLemmatizer.java        |  86 +++++++++----------
 .../builder/POSDictionayBuilderTest.java        |  30 ++++++-
 .../lemmatizer/MorfologikLemmatizerTest.java    |  42 +++++++--
 .../tagdict/POSTaggerFactoryTest.java           |  28 ++++--
 .../src/test/resources/dictionaryWithLemma.dict | Bin 0 -> 223 bytes
 .../src/test/resources/dictionaryWithLemma.txt  |  10 ++-
 7 files changed, 129 insertions(+), 69 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
index ef6668e..f3108a4 100644
--- a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
@@ -41,7 +41,7 @@ public class XMLDictionaryToTableTool extends BasicCmdLineTool {
   private String SEPARATOR;
 
   public String getShortDescription() {
-    return "reads an OpenNLP XML tag dictionary and outputs it in a tab separated file";
+    return "reads an OpenNLP XML tag dictionary and outputs it in a tabular file";
   }
 
   public String getHelp() {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
index 2798e42..489b6fc 100644
--- a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
@@ -20,11 +20,9 @@ package opennlp.morfologik.lemmatizer;
 import java.io.IOException;
 import java.nio.file.Path;
 import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
-import java.util.Map;
 import java.util.Objects;
 import java.util.Set;
 
@@ -32,66 +30,62 @@ import morfologik.stemming.Dictionary;
 import morfologik.stemming.DictionaryLookup;
 import morfologik.stemming.IStemmer;
 import morfologik.stemming.WordData;
-import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+import opennlp.tools.lemmatizer.Lemmatizer;
 
-public class MorfologikLemmatizer implements DictionaryLemmatizer {
+public class MorfologikLemmatizer implements Lemmatizer {
 
   private IStemmer dictLookup;
-  public final Set<String> constantTags = new HashSet<>(Arrays.asList("NNP", "NP00000"));
 
   public MorfologikLemmatizer(Path dictionaryPath) throws IllegalArgumentException,
       IOException {
     dictLookup = new DictionaryLookup(Dictionary.read(dictionaryPath));
   }
 
-  private Map<List<String>, String> getLemmaTagsDict(String word) {
-    List<WordData> wdList = dictLookup.lookup(word);
-    Map<List<String>, String> dictMap = new HashMap<>();
-    for (WordData wd : wdList) {
-      List<String> wordLemmaTags = new ArrayList<>();
-      wordLemmaTags.add(word);
-      wordLemmaTags.add(wd.getTag().toString());
-      dictMap.put(wordLemmaTags, wd.getStem().toString());
+  private List<String> lemmatize(String word, String postag) {
+    List<WordData> dictMap = dictLookup.lookup(word.toLowerCase());
+    Set<String> lemmas = new HashSet<>();
+    for (WordData wordData : dictMap) {
+      if(Objects.equals(postag, asString(wordData.getTag()))) {
+        lemmas.add(asString(wordData.getStem()));
+      }
     }
-    return dictMap;
+    return Collections.unmodifiableList(new ArrayList<>(lemmas));
   }
 
-  private List<String> getDictKeys(String word, String postag) {
-    List<String> keys = new ArrayList<>();
-    if (constantTags.contains(postag)) {
-      keys.addAll(Arrays.asList(word, postag));
-    } else {
-      keys.addAll(Arrays.asList(word.toLowerCase(), postag));
-    }
-    return keys;
+  private String asString(CharSequence tag) {
+    if(tag == null)
+      return null;
+    return tag.toString();
   }
 
-  private Map<List<String>, String> getDictMap(String word, String postag) {
-    Map<List<String>, String> dictMap;
-
-    if (constantTags.contains(postag)) {
-      dictMap = this.getLemmaTagsDict(word);
-    } else {
-      dictMap = this.getLemmaTagsDict(word.toLowerCase());
+  @Override
+  public String[] lemmatize(String[] toks, String[] tags) {
+    String[] lemmas = new String[toks.length];
+    for (int i = 0; i < toks.length; i++) {
+       List<String> l = lemmatize(toks[i],tags[i]);
+      if(l.size() > 0) {
+        lemmas[i] = l.get(0);
+      } else {
+        lemmas[i] = null;
+      }
     }
-    return dictMap;
+    return lemmas;
   }
+  
 
-  public String lemmatize(String word, String postag) {
-    String lemma;
-    List<String> keys = this.getDictKeys(word, postag);
-    Map<List<String>, String> dictMap = this.getDictMap(word, postag);
-    // lookup lemma as value of the map
-    String keyValue = dictMap.get(keys);
-    if (keyValue != null) {
-      lemma = keyValue;
-    } else if (constantTags.contains(postag)) {
-      lemma = word;
-    } else if (Objects.equals(word.toUpperCase(), word)) {
-      lemma = word;
-    } else {
-      lemma = word.toLowerCase();
+  /**
+   * Generates a lemma tags for the word and postag returning the result in list of possible lemmas.
+   *
+   * @param toks an array of the tokens
+   * @param tags an array of the pos tags
+   *
+   * @return an list of possible lemmas for each token in the sequence.
+   */
+  public List<List<String>> lemmatize(List<String> toks, List<String> tags) {
+    List<List<String>> lemmas = new ArrayList<>();
+    for (int i = 0; i < toks.size(); i++) {
+      lemmas.add(lemmatize(toks.get(i),tags.get(i)));
     }
-    return lemma;
+    return lemmas;
   }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
index 0a7ba48..4d450ba 100644
--- a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
+++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
@@ -20,14 +20,16 @@ package opennlp.morfologik.builder;
 import java.io.File;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.nio.file.StandardCopyOption;
+import java.util.Arrays;
+
+import org.junit.Test;
 
 import junit.framework.TestCase;
 import morfologik.stemming.DictionaryMetadata;
 import opennlp.morfologik.lemmatizer.MorfologikLemmatizer;
 
-import org.junit.Test;
-
 public class POSDictionayBuilderTest extends TestCase {
 
   @Test
@@ -54,5 +56,29 @@ public class POSDictionayBuilderTest extends TestCase {
     
     return builder.build(tabFilePath);
   }
+  
+  
+  public static void main(String[] args) throws Exception {
+
+    // Part 1: compile a FSA lemma dictionary 
+    
+    // we need the tabular dictionary. It is mandatory to have info 
+    //  file with same name, but .info extension
+    Path textLemmaDictionary = Paths.get("/Users/wcolen/git/opennlp/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt");
+    
+    // this will build a binary dictionary located in compiledLemmaDictionary
+    Path compiledLemmaDictionary = new MorfologikDictionayBuilder()
+        .build(textLemmaDictionary);
+    
+    // Part 2: load a MorfologikLemmatizer and use it
+    MorfologikLemmatizer lemmatizer = new MorfologikLemmatizer(compiledLemmaDictionary);
+    
+    String[] toks = {"casa", "casa"};
+    String[] tags = {"NOUN", "V"};
+    
+    String[] lemmas = lemmatizer.lemmatize(toks, tags);
+    System.out.println(Arrays.toString(lemmas)); // outputs [casa, casar]
+    
+  }
 
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
index 6b7525e..35757be 100644
--- a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
+++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
@@ -1,24 +1,50 @@
 package opennlp.morfologik.lemmatizer;
 
-import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.*;
 
 import java.nio.file.Path;
-
-import opennlp.morfologik.builder.POSDictionayBuilderTest;
-import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+import java.util.Arrays;
+import java.util.List;
 
 import org.junit.Test;
 
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.tools.lemmatizer.Lemmatizer;
+
 public class MorfologikLemmatizerTest {
 
   @Test
   public void testLemmatizeInsensitive() throws Exception {
-    DictionaryLemmatizer dict = createDictionary(false);
+    Lemmatizer dict = createDictionary(false);
+    
+    
+    String[] toks = {"casa", "casa", "Casa"};
+    String[] tags = {"V", "NOUN", "PROP"};
+    
+    String[] lemmas = dict.lemmatize(toks, tags);
 
-    assertEquals("casar", dict.lemmatize("casa", "V"));
-    assertEquals("casa", dict.lemmatize("casa", "NOUN"));
+    assertEquals("casar", lemmas[0]);
+    assertEquals("casa", lemmas[1]);
 
-    assertEquals("casa", dict.lemmatize("Casa", "PROP"));
+    // lookup is case insensitive. There is no entry casa - prop
+    assertNull(lemmas[2]);
+
+  }
+  
+  @Test
+  public void testLemmatizeMultiLemma() throws Exception {
+    MorfologikLemmatizer dict = createDictionary(false);
+    
+    
+    String[] toks = {"foi"};
+    String[] tags = {"V"};
+    
+    List<List<String>> lemmas = dict.lemmatize(Arrays.asList(toks), Arrays.asList(tags));
+
+    
+    assertTrue(lemmas.get(0).contains("ir"));
+    assertTrue(lemmas.get(0).contains("ser"));
+    
 
   }
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
index 7341a02..354b34c 100644
--- a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
+++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
@@ -17,28 +17,31 @@
 
 package opennlp.morfologik.tagdict;
 
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
+import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.nio.file.Path;
 
+import org.junit.Test;
+
 import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.postag.POSModel;
 import opennlp.tools.postag.POSSample;
 import opennlp.tools.postag.POSTaggerFactory;
 import opennlp.tools.postag.POSTaggerME;
 import opennlp.tools.postag.TagDictionary;
 import opennlp.tools.postag.WordTagSampleStream;
+import opennlp.tools.util.MarkableFileInputStreamFactory;
 import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.ModelType;
 
-import org.junit.Test;
-
 /**
  * Tests for the {@link POSTaggerFactory} class.
  */
@@ -46,10 +49,19 @@ public class POSTaggerFactoryTest {
 
   private static ObjectStream<POSSample> createSampleStream()
       throws IOException {
-    InputStream in = POSTaggerFactoryTest.class.getClassLoader()
-        .getResourceAsStream("AnnotatedSentences.txt");
+    MarkableFileInputStreamFactory sampleDataIn = new MarkableFileInputStreamFactory(
+        new File(POSTaggerFactory.class.getResource("/AnnotatedSentences.txt")
+            .getFile()));
+    
+
+    ObjectStream<String> lineStream = null;
+    try {
+      lineStream = new PlainTextByLineStream(sampleDataIn, "UTF-8");
+    } catch (IOException ex) {
+      CmdLineUtil.handleCreateObjectStreamError(ex);
+    }
 
-    return new WordTagSampleStream((new InputStreamReader(in)));
+    return new WordTagSampleStream(lineStream);
   }
 
   static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory)

http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.dict
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.dict b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.dict
new file mode 100644
index 0000000..66288b0
Binary files /dev/null and b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.dict differ

http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt
index 09d39e3..3e27a3c 100644
--- a/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt
+++ b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt
@@ -1,11 +1,13 @@
+carro,carro,NOUN
 casa,casa,NOUN
-casar,casa,V
-casar,casar,V-INF
 Casa,Casa,PROP
 casa,casinha,NOUN
 casa,casona,NOUN
+casar,casa,V
+casar,casar,V-INF
+ir,foi,V
 menino,menina,NOUN
+menino,menininho,NOUN
 menino,menino,NOUN
 menino,meninão,NOUN
-menino,menininho,NOUN
-carro,carro,NOUN
\ No newline at end of file
+ser,foi,V


[6/6] opennlp git commit: Merge branch '902' into trunk

Posted by co...@apache.org.
Merge branch '902' into trunk


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/34730d20
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/34730d20
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/34730d20

Branch: refs/heads/trunk
Commit: 34730d2001e1c2f57037cb9caa7ccdf42401eb08
Parents: 75ab495 4f2441b
Author: William Colen <co...@apache.org>
Authored: Wed Dec 28 01:44:39 2016 -0200
Committer: William Colen <co...@apache.org>
Committed: Wed Dec 28 01:44:39 2016 -0200

----------------------------------------------------------------------
 .../src/docbkx/morfologik-addon.out.xml         |   0
 opennlp-docs/src/docbkx/morfologik-addon.xml    | 153 ++++++++++++
 opennlp-docs/src/docbkx/opennlp.xml             |   1 +
 opennlp-morfologik-addon/pom.xml                |  58 +----
 .../src/main/assembly/bin.xml                   |  91 --------
 .../src/main/assembly/src.xml                   |  39 ----
 .../builder/XMLDictionaryToTableTool.java       |   2 +-
 .../lemmatizer/MorfologikLemmatizer.java        |  86 ++++---
 .../src/main/readme/LICENSE                     | 230 -------------------
 .../src/main/readme/MORFOLOGIK-LICENSE          |  28 ---
 opennlp-morfologik-addon/src/main/readme/NOTICE |  11 -
 .../builder/POSDictionayBuilderTest.java        |  30 ++-
 .../lemmatizer/MorfologikLemmatizerTest.java    |  42 +++-
 .../tagdict/POSTaggerFactoryTest.java           |  28 ++-
 .../src/test/resources/dictionaryWithLemma.dict | Bin 0 -> 223 bytes
 .../src/test/resources/dictionaryWithLemma.txt  |  10 +-
 16 files changed, 284 insertions(+), 525 deletions(-)
----------------------------------------------------------------------



[3/6] opennlp git commit: Removes assembly plug-in from pom.xml

Posted by co...@apache.org.
Removes assembly plug-in from pom.xml

Now that Morfologik add-on is part of the main distribution, we don’t need assembly instructions in pom.xml

See issue OPENNLP-902


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/486b8807
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/486b8807
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/486b8807

Branch: refs/heads/trunk
Commit: 486b880796cdc2ef4876ccaadcbf54a2de6ce968
Parents: 060c99a
Author: William Colen <co...@apache.org>
Authored: Wed Dec 28 01:14:49 2016 -0200
Committer: William Colen <co...@apache.org>
Committed: Wed Dec 28 01:14:49 2016 -0200

----------------------------------------------------------------------
 opennlp-morfologik-addon/pom.xml | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/486b8807/opennlp-morfologik-addon/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/pom.xml b/opennlp-morfologik-addon/pom.xml
index d7bd311..d0e6a97 100644
--- a/opennlp-morfologik-addon/pom.xml
+++ b/opennlp-morfologik-addon/pom.xml
@@ -14,35 +14,7 @@
 	<name>Apache OpenNLP Morfologik Addon</name>
 
 	<url>http://maven.apache.org</url>
-	<build>
-		<plugins>
-			<plugin>
-				<artifactId>maven-assembly-plugin</artifactId>
-				<executions>
-					<execution>
-						<id>bundle-project-sources</id>
-						<phase>package</phase>
-						<goals>
-							<goal>single</goal>
-						</goals>
-						<configuration>
-							<descriptors>
-								<descriptor>src/main/assembly/bin.xml</descriptor>
-								<descriptor>src/main/assembly/src.xml</descriptor>
-							</descriptors>
-							<!-- Tar package is only compatible with gnu tar,
-							     many file have more than 100 chars.
-							     Right now only javadoc files are too long.
-							 -->
-							 <tarLongFileMode>gnu</tarLongFileMode>
-							 
-							 <finalName>apache-opennlp-morfologik-addon-${project.version}</finalName>
-						</configuration>
-					</execution>
-				</executions>
-			</plugin>
-		</plugins>
-	</build>
+
 	<properties>
 		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 	</properties>