You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2016/11/09 21:10:55 UTC
[01/16] opennlp git commit: OPENNLP-582 Added morfologik addon.
Thanks to Rodrigo Agerri for providing a patch.
Repository: opennlp
Updated Branches:
refs/heads/trunk 92e541c93 -> 49f8e25a1
OPENNLP-582 Added morfologik addon. Thanks to Rodrigo Agerri for providing a patch.
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/f3e90579
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/f3e90579
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/f3e90579
Branch: refs/heads/trunk
Commit: f3e90579c5feba71dc4f04adaa4acc5ecc7f72e9
Parents:
Author: Jörn Kottmann <jo...@apache.org>
Authored: Thu Nov 14 21:24:13 2013 +0000
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Thu Nov 14 21:24:13 2013 +0000
----------------------------------------------------------------------
pom.xml | 50 ++++++++++
.../lemmatizer/MorfologikLemmatizer.java | 96 ++++++++++++++++++++
2 files changed, 146 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f3e90579/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..67e1eaa
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,50 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>morfologik-addon</artifactId>
+ <version>1.0-SNAPSHOT</version>
+ <packaging>jar</packaging>
+ <name>Morfologik Addon</name>
+
+ <url>http://maven.apache.org</url>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.3.2</version>
+ <configuration>
+ <source>1.7</source>
+ <target>1.7</target>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.carrot2</groupId>
+ <artifactId>morfologik-stemming</artifactId>
+ <version>1.6.0</version>
+ <scope>compile</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools</artifactId>
+ <version>1.6.0-SNAPSHOT</version>
+ </dependency>
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>3.8.1</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+</project>
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f3e90579/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
new file mode 100644
index 0000000..99694a5
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.lemmatizer;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.IStemmer;
+import morfologik.stemming.WordData;
+
+public class MorfologikLemmatizer implements DictionaryLemmatizer {
+
+ private IStemmer dictLookup;
+ public final Set<String> constantTags = new HashSet<String>(Arrays.asList(
+ "NNP", "NP00000"));
+
+ public MorfologikLemmatizer(URL dictURL) throws IllegalArgumentException,
+ IOException {
+ dictLookup = new DictionaryLookup(Dictionary.read(dictURL));
+ }
+
+ private HashMap<List<String>, String> getLemmaTagsDict(String word) {
+ List<WordData> wdList = dictLookup.lookup(word);
+ HashMap<List<String>, String> dictMap = new HashMap<List<String>, String>();
+ for (WordData wd : wdList) {
+ List<String> wordLemmaTags = new ArrayList<String>();
+ wordLemmaTags.add(word);
+ wordLemmaTags.add(wd.getTag().toString());
+ dictMap.put(wordLemmaTags, wd.getStem().toString());
+ }
+ return dictMap;
+ }
+
+ private List<String> getDictKeys(String word, String postag) {
+ List<String> keys = new ArrayList<String>();
+ if (constantTags.contains(postag)) {
+ keys.addAll(Arrays.asList(word, postag));
+ } else {
+ keys.addAll(Arrays.asList(word.toLowerCase(), postag));
+ }
+ return keys;
+ }
+
+ private HashMap<List<String>, String> getDictMap(String word, String postag) {
+ HashMap<List<String>, String> dictMap = new HashMap<List<String>, String>();
+
+ if (constantTags.contains(postag)) {
+ dictMap = this.getLemmaTagsDict(word);
+ } else {
+ dictMap = this.getLemmaTagsDict(word.toLowerCase());
+ }
+ return dictMap;
+ }
+
+ public String lemmatize(String word, String postag) {
+ String lemma = null;
+ List<String> keys = this.getDictKeys(word, postag);
+ HashMap<List<String>, String> dictMap = this.getDictMap(word, postag);
+ // lookup lemma as value of the map
+ String keyValue = dictMap.get(keys);
+ if (keyValue != null) {
+ lemma = keyValue;
+ } else if (keyValue == null && constantTags.contains(postag)) {
+ lemma = word;
+ } else if (keyValue == null && word.toUpperCase() == word) {
+ lemma = word;
+ } else {
+ lemma = word.toLowerCase();
+ }
+ return lemma;
+ }
+}
[15/16] opennlp git commit: OPENNLP-622 Merge branch 'master' of
../opennlp-addons into trunk
Posted by co...@apache.org.
OPENNLP-622 Merge branch 'master' of ../opennlp-addons into trunk
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/9b448044
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/9b448044
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/9b448044
Branch: refs/heads/trunk
Commit: 9b4480446b72f9120283e0da10697657072850f3
Parents: 92e541c 772f31f
Author: William Colen <wi...@gmail.com>
Authored: Wed Nov 9 18:28:46 2016 -0200
Committer: William Colen <wi...@gmail.com>
Committed: Wed Nov 9 18:28:46 2016 -0200
----------------------------------------------------------------------
opennlp-morfologik-addon/bin/morfologik-addon | 20 ++
.../bin/morfologik-addon.bat | 21 ++
opennlp-morfologik-addon/pom.xml | 109 +++++++++
.../src/main/assembly/bin.xml | 91 ++++++++
.../src/main/assembly/src.xml | 39 ++++
.../src/main/bin/morfologik-addon | 35 +++
.../src/main/bin/morfologik-addon.bat | 47 ++++
.../src/main/bin/opennlp-cp | 35 +++
.../builder/MorfologikDictionayBuilder.java | 103 +++++++++
.../java/opennlp/morfologik/cmdline/CLI.java | 164 +++++++++++++
.../MorfologikDictionaryBuilderParams.java | 57 +++++
.../MorfologikDictionaryBuilderTool.java | 62 +++++
.../builder/XMLDictionaryToTableParams.java | 45 ++++
.../builder/XMLDictionaryToTableTool.java | 127 ++++++++++
.../lemmatizer/MorfologikLemmatizer.java | 96 ++++++++
.../tagdict/MorfologikPOSTaggerFactory.java | 170 ++++++++++++++
.../tagdict/MorfologikTagDictionary.java | 90 ++++++++
.../opennlp/morfologik/util/MorfologikUtil.java | 36 +++
.../src/main/readme/LICENSE | 230 +++++++++++++++++++
.../src/main/readme/MORFOLOGIK-LICENSE | 28 +++
opennlp-morfologik-addon/src/main/readme/NOTICE | 11 +
.../builder/POSDictionayBuilderTest.java | 58 +++++
.../lemmatizer/MorfologikLemmatizerTest.java | 35 +++
.../tagdict/MorfologikTagDictionaryTest.java | 78 +++++++
.../tagdict/POSTaggerFactoryTest.java | 88 +++++++
.../src/test/resources/AnnotatedSentences.txt | 136 +++++++++++
.../src/test/resources/dictionaryWithLemma.info | 15 ++
.../src/test/resources/dictionaryWithLemma.txt | 11 +
28 files changed, 2037 insertions(+)
----------------------------------------------------------------------
[04/16] opennlp git commit: OPENNLP-622 Refactored to remove usage of
main methods of Morfologik.
Posted by co...@apache.org.
OPENNLP-622 Refactored to remove usage of main methods of Morfologik.
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/1314887f
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/1314887f
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/1314887f
Branch: refs/heads/trunk
Commit: 1314887fe657f21e1213788fd6084a485781f2f1
Parents: 15c3fb7
Author: William Colen <co...@apache.org>
Authored: Thu Jul 7 05:19:18 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Thu Jul 7 05:19:18 2016 +0000
----------------------------------------------------------------------
.../builder/MorfologikDictionayBuilder.java | 144 ++++++-------------
.../MorfologikDictionaryBuilderParams.java | 37 +++--
.../MorfologikDictionaryBuilderTool.java | 17 +--
.../lemmatizer/MorfologikLemmatizer.java | 8 +-
.../tagdict/MorfologikPOSTaggerFactory.java | 14 +-
.../builder/POSDictionayBuilderTest.java | 67 +++------
.../lemmatizer/MorfologikLemmatizerTest.java | 17 +--
.../tagdict/MorfologikTagDictionaryTest.java | 18 +--
.../tagdict/POSTaggerFactoryTest.java | 108 ++++++++++++++
src/test/resources/AnnotatedSentences.txt | 136 ++++++++++++++++++
src/test/resources/dictionaryWithLemma.info | 15 ++
src/test/resources/dictionaryWithLemma.txt | 21 +--
12 files changed, 386 insertions(+), 216 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
index 0131318..dbbca4d 100644
--- a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
+++ b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
@@ -17,21 +17,15 @@
package opennlp.morfologik.builder;
-import java.io.File;
import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
import java.io.IOException;
-import java.io.OutputStream;
import java.nio.charset.Charset;
import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.List;
import java.util.Properties;
import morfologik.stemming.DictionaryMetadata;
import morfologik.stemming.EncoderType;
-import morfologik.tools.FSACompile;
-import morfologik.tools.Launcher;
+import morfologik.tools.DictCompile;
/**
* Utility class to build Morfologik dictionaries from a tab separated values
@@ -41,117 +35,69 @@ import morfologik.tools.Launcher;
public class MorfologikDictionayBuilder {
/**
- * Build a Morfologik binary dictionary
- *
- * @param dictInFile
- * the 3 column TSV dictionary file
- * @param dictOutFile
- * where to store the binary Morfologik dictionary
- * @param encoding
- * the encoding to be used while reading and writing
- * @param separator
- * a field separator, the default is '+'. If your tags contains '+'
- * change to something else
- * @param encoderType
- * the Morfologik enconder type
- * @param isUseInfixes
- * if to compact using infixes
+ * Helper to compile a morphological dictionary automaton.
+ *
+ * @param input
+ * The input file (base,inflected,tag). An associated metadata
+ * (*.info) file must exist.
+ * @param overwrite
+ * Overwrite the output file if it exists.
+ * @param validate
+ * Validate input to make sure it makes sense.
+ * @param acceptBom
+ * Accept leading BOM bytes (UTF-8).
+ * @param acceptCr
+ * Accept CR bytes in input sequences (\r).
+ * @param ignoreEmpty
+ * Ignore empty lines in the input.
+ * @return the dictionary path
+ *
* @throws Exception
*/
- public void build(File dictInFile, File dictOutFile, Charset encoding,
- String separator, EncoderType encoderType)
+ public Path build(Path input, boolean overwrite, boolean validate,
+ boolean acceptBom, boolean acceptCr, boolean ignoreEmpty)
throws Exception {
- Path propertiesPath = DictionaryMetadata
- .getExpectedMetadataLocation(dictOutFile.toPath());
+
+ DictCompile compiler = new DictCompile(input, overwrite, validate,
+ acceptBom, acceptCr, ignoreEmpty);
+ compiler.call();
+
+
+ Path metadataPath = DictionaryMetadata
+ .getExpectedMetadataLocation(input);
- this.build(dictInFile, dictOutFile, propertiesPath.toFile(), encoding, separator,
- encoderType);
+ return metadataPath.resolveSibling(
+ metadataPath.getFileName().toString().replaceAll(
+ "\\." + DictionaryMetadata.METADATA_FILE_EXTENSION + "$", ".dict"));
}
/**
- * Build a Morfologik binary dictionary
- *
- * @param dictInFile
- * the 3 column TSV dictionary file
- * @param dictOutFile
- * where to store the binary Morfologik dictionary
- * @param propertiesOutFile
- * where to store the properties of the Morfologik dictionary
- * @param encoding
- * the encoding to be used while reading and writing
- * @param separator
- * a field separator, the default is '+'. If your tags contains '+'
- * change to something else
- * @param isUsePrefixes
- * if to compact using prefixes
- * @param isUseInfixes
- * if to compact using infixes
+ * Helper to compile a morphological dictionary automaton using default
+ * parameters.
+ *
+ * @param input
+ * The input file (base,inflected,tag). An associated metadata
+ * (*.info) file must exist.
+ *
+ * @return the dictionary path
+ *
* @throws Exception
*/
- public void build(File dictInFile, File dictOutFile, File propertiesOutFile,
- Charset encoding, String separator, EncoderType encoderType) throws Exception {
-
- // we need to execute tab2morph followed by fsa_build
-
- File morph = tab2morph(dictInFile, separator, encoderType);
+ public Path build(Path input) throws Exception {
- fsaBuild(morph, dictOutFile);
+ return build(input, true, true, false, false, false);
- morph.delete();
-
- // now we create the properties files using the passed parameters
- createProperties(encoding, separator, encoderType,
- propertiesOutFile);
}
- void createProperties(Charset encoding, String separator,
- EncoderType encoderType, File propertiesFile)
- throws FileNotFoundException, IOException {
+ Properties createProperties(Charset encoding, String separator,
+ EncoderType encoderType) throws FileNotFoundException, IOException {
Properties properties = new Properties();
properties.setProperty("fsa.dict.separator", separator);
properties.setProperty("fsa.dict.encoding", encoding.name());
properties.setProperty("fsa.dict.encoder", encoderType.name());
- OutputStream os = new FileOutputStream(propertiesFile);
- properties.store(os, "Morfologik POS Dictionary properties");
- os.close();
-
- }
+ return properties;
- private void fsaBuild(File morph, File dictOutFile) throws Exception {
- String[] params = { "-f", "cfsa2", "-i", morph.getAbsolutePath(), "-o",
- dictOutFile.getAbsolutePath() };
- FSACompile.main(params);
- // FSABuildTool.main(params);
}
-
- private File tab2morph(File dictInFile, String separator,
- EncoderType encoderType) throws Exception {
-
- // create tab2morph parameters
- List<String> tag2morphParams = new ArrayList<String>();
- tag2morphParams.add("tab2morph");
-
- tag2morphParams.add("--annotation");
- tag2morphParams.add(separator);
-
- tag2morphParams.add("--e");
- tag2morphParams.add(encoderType.name());
-
- tag2morphParams.add("-i");
- tag2morphParams.add(dictInFile.getAbsolutePath());
-
- // we need a temporary file to store the intermediate output
- File tmp = File.createTempFile("tab2morph", ".txt");
- tmp.deleteOnExit();
-
- tag2morphParams.add("-o");
- tag2morphParams.add(tmp.getAbsolutePath());
-
- Launcher.main(tag2morphParams.toArray(new String[tag2morphParams.size()]));
-
- return tmp;
- }
-
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
index 193599b..5ea2e4f 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
@@ -19,7 +19,6 @@ package opennlp.morfologik.cmdline.builder;
import java.io.File;
-import morfologik.stemming.EncoderType;
import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.params.EncodingParameter;
@@ -29,18 +28,30 @@ import opennlp.tools.cmdline.params.EncodingParameter;
*/
interface MorfologikDictionaryBuilderParams extends EncodingParameter {
- @ParameterDescription(valueName = "in", description = "Plain file with one entry per line")
+ @ParameterDescription(valueName = "in", description = "The input file (base,inflected,tag). An associated metadata (*.info) file must exist.")
File getInputFile();
-
- @ParameterDescription(valueName = "out", description = "The generated dictionary file.")
- File getOutputFile();
-
- @ParameterDescription(valueName = "sep", description = "The FSA dictionary separator. Default is '+'.")
- @OptionalParameter(defaultValue = "+")
- String getFSADictSeparator();
- @ParameterDescription(valueName = "sep", description = "The type of lemma-inflected form encoding compression that precedes automaton construction. Allowed values: [suffix, infix, prefix, none]. Details are in Daciuk's paper and in the code. ")
- @OptionalParameter(defaultValue = "prefix")
- EncoderType getEncoderType();
-
+ @ParameterDescription(valueName = "true|false", description = "Accept leading BOM bytes (UTF-8).")
+ @OptionalParameter(defaultValue="false")
+ Boolean getAcceptBOM();
+
+ @ParameterDescription(valueName = "true|false", description = "Accept CR bytes in input sequences (\r).")
+ @OptionalParameter(defaultValue="false")
+ Boolean getAcceptCR();
+
+ @ParameterDescription(valueName = "FSA5|CFSA2", description = "Automaton serialization format.")
+ @OptionalParameter(defaultValue="FSA5")
+ String getFormat();
+
+ @ParameterDescription(valueName = "true|false", description = "Ignore empty lines in the input.")
+ @OptionalParameter(defaultValue="false")
+ Boolean getIgnoreEmpty();
+
+ @ParameterDescription(valueName = "true|false", description = "Overwrite the output file if it exists.")
+ @OptionalParameter(defaultValue="false")
+ Boolean getOverwrite();
+
+ @ParameterDescription(valueName = "true|false", description = "Validate input to make sure it makes sense.")
+ @OptionalParameter(defaultValue="false")
+ Boolean getValidate();
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
index 741515e..eb9b51c 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
@@ -17,11 +17,10 @@
package opennlp.morfologik.cmdline.builder;
-import static opennlp.morfologik.util.MorfologikUtil.getExpectedPropertiesFile;
-
import java.io.File;
-import java.nio.charset.Charset;
+import java.nio.file.Path;
+import morfologik.stemming.DictionaryMetadata;
import opennlp.morfologik.builder.MorfologikDictionayBuilder;
import opennlp.tools.cmdline.BasicCmdLineTool;
import opennlp.tools.cmdline.CmdLineUtil;
@@ -44,18 +43,16 @@ public class MorfologikDictionaryBuilderTool extends BasicCmdLineTool {
Params params = validateAndParseParams(args, Params.class);
File dictInFile = params.getInputFile();
- File dictOutFile = params.getOutputFile();
- File propertiesFile = getExpectedPropertiesFile(dictOutFile);
- Charset encoding = params.getEncoding();
CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
- CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
- CmdLineUtil.checkOutputFile("properties output file", propertiesFile);
+ Path metadataPath = DictionaryMetadata.getExpectedMetadataLocation(dictInFile.toPath());
+ CmdLineUtil.checkInputFile("dictionary metadata (.info) input file", metadataPath.toFile());
MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
try {
- builder.build(dictInFile, dictOutFile, propertiesFile, encoding,
- params.getFSADictSeparator(), params.getEncoderType());
+ builder.build(dictInFile.toPath(), params.getOverwrite(),
+ params.getValidate(), params.getAcceptBOM(), params.getAcceptCR(),
+ params.getIgnoreEmpty());
} catch (Exception e) {
throw new TerminateToolException(-1,
"Error while creating Morfologik POS Dictionay: " + e.getMessage(), e);
http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
index 99694a5..2090ce5 100644
--- a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
+++ b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
@@ -18,7 +18,7 @@
package opennlp.morfologik.lemmatizer;
import java.io.IOException;
-import java.net.URL;
+import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
@@ -26,11 +26,11 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
-import opennlp.tools.lemmatizer.DictionaryLemmatizer;
import morfologik.stemming.Dictionary;
import morfologik.stemming.DictionaryLookup;
import morfologik.stemming.IStemmer;
import morfologik.stemming.WordData;
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
public class MorfologikLemmatizer implements DictionaryLemmatizer {
@@ -38,9 +38,9 @@ public class MorfologikLemmatizer implements DictionaryLemmatizer {
public final Set<String> constantTags = new HashSet<String>(Arrays.asList(
"NNP", "NP00000"));
- public MorfologikLemmatizer(URL dictURL) throws IllegalArgumentException,
+ public MorfologikLemmatizer(Path dictionaryPath) throws IllegalArgumentException,
IOException {
- dictLookup = new DictionaryLookup(Dictionary.read(dictURL));
+ dictLookup = new DictionaryLookup(Dictionary.read(dictionaryPath));
}
private HashMap<List<String>, String> getLemmaTagsDict(String word) {
http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
index f022a86..723b1ce 100644
--- a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
+++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
@@ -54,9 +54,21 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
public MorfologikPOSTaggerFactory() {
}
+ /**
+ * Creates a new {@link POSTaggerFactory} that uses the a Morfologik based {@link TagDictionary}.
+ *
+ * @param ngramDictionary a ngramDictionary
+ * @param morfologikDictionary a Morfologik dictionary
+ * @param morfologikDictionaryMetadata the dictionary metadata
+ * @throws IOException invalid Morfologik dictionary
+ */
public MorfologikPOSTaggerFactory(Dictionary ngramDictionary,
- TagDictionary posDictionary) {
+ byte[] morfologikDictionary, byte[] morfologikDictionaryMetadata) throws IOException {
super(ngramDictionary, null);
+ this.dictData = morfologikDictionary;
+ this.dictInfo = morfologikDictionaryMetadata;
+
+ this.dict = createMorfologikDictionary(dictData, dictInfo);
}
@Override
http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
index 730025c..0a7ba48 100644
--- a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
+++ b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
@@ -18,14 +18,12 @@
package opennlp.morfologik.builder;
import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-import java.util.Properties;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
import junit.framework.TestCase;
-import morfologik.stemming.EncoderType;
+import morfologik.stemming.DictionaryMetadata;
import opennlp.morfologik.lemmatizer.MorfologikLemmatizer;
import org.junit.Test;
@@ -34,56 +32,27 @@ public class POSDictionayBuilderTest extends TestCase {
@Test
public void testBuildDictionary() throws Exception {
- MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
- File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
- "/dictionaryWithLemma.txt").getFile());
-
- File dictOutFile = File.createTempFile(
- POSDictionayBuilderTest.class.getName(), ".dict");
-
- builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX);
+
+ Path output = createMorfologikDictionary();
- MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI()
- .toURL());
+ MorfologikLemmatizer ml = new MorfologikLemmatizer(output);
assertNotNull(ml);
}
-
- @Test
- public void testPropertiesCreation() throws Exception {
-
- Charset c = Charset.forName("iso-8859-1");
- String sep = "_";
+
+ public static Path createMorfologikDictionary() throws Exception {
+ Path tabFilePath = File.createTempFile(
+ POSDictionayBuilderTest.class.getName(), ".txt").toPath();
+ Path infoFilePath = DictionaryMetadata.getExpectedMetadataLocation(tabFilePath);
- EncoderType encoderType = EncoderType.PREFIX;
- Properties p = createPropertiesHelper(c, sep, encoderType);
-
- assertEquals(c.name(), p.getProperty("fsa.dict.encoding"));
- assertEquals(sep, p.getProperty("fsa.dict.separator"));
- assertEquals(encoderType,
- EncoderType.valueOf(p.getProperty("fsa.dict.encoder")));
+ Files.copy(POSDictionayBuilderTest.class.getResourceAsStream(
+ "/dictionaryWithLemma.txt"), tabFilePath, StandardCopyOption.REPLACE_EXISTING);
+ Files.copy(POSDictionayBuilderTest.class.getResourceAsStream(
+ "/dictionaryWithLemma.info"), infoFilePath, StandardCopyOption.REPLACE_EXISTING);
- encoderType = EncoderType.SUFFIX;
- p = createPropertiesHelper(c, sep, encoderType);
- assertEquals(encoderType,
- EncoderType.valueOf(p.getProperty("fsa.dict.encoder")));
-
- }
-
- private Properties createPropertiesHelper(Charset c, String sep,
- EncoderType encoderType) throws IOException {
MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
- File f = File.createTempFile(POSDictionayBuilderTest.class.getName(),
- ".info");
- builder.createProperties(c, sep, encoderType, f);
-
- InputStream is = new FileInputStream(f);
-
- Properties prop = new Properties();
- prop.load(is);
- is.close();
- f.delete();
- return prop;
+
+ return builder.build(tabFilePath);
}
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
index 87fc2cc..6b7525e 100644
--- a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
+++ b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
@@ -2,11 +2,8 @@ package opennlp.morfologik.lemmatizer;
import static org.junit.Assert.assertEquals;
-import java.io.File;
-import java.nio.charset.Charset;
+import java.nio.file.Path;
-import morfologik.stemming.EncoderType;
-import opennlp.morfologik.builder.MorfologikDictionayBuilder;
import opennlp.morfologik.builder.POSDictionayBuilderTest;
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
@@ -28,17 +25,9 @@ public class MorfologikLemmatizerTest {
private MorfologikLemmatizer createDictionary(boolean caseSensitive)
throws Exception {
- MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
- File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
- "/dictionaryWithLemma.txt").getFile());
+ Path output = POSDictionayBuilderTest.createMorfologikDictionary();
- File dictOutFile = File.createTempFile(
- POSDictionayBuilderTest.class.getName(), ".dict");
-
- builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX);
-
- MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI()
- .toURL());
+ MorfologikLemmatizer ml = new MorfologikLemmatizer(output);
return ml;
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
index d605e15..c6c9e04 100644
--- a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
+++ b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
@@ -3,16 +3,11 @@ package opennlp.morfologik.tagdict;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
-import java.io.File;
-import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;
import morfologik.stemming.Dictionary;
-import morfologik.stemming.EncoderType;
-import opennlp.morfologik.builder.MorfologikDictionayBuilder;
import opennlp.morfologik.builder.POSDictionayBuilderTest;
-import opennlp.morfologik.tagdict.MorfologikTagDictionary;
import opennlp.tools.postag.TagDictionary;
import org.junit.Test;
@@ -74,17 +69,8 @@ public class MorfologikTagDictionaryTest {
private MorfologikTagDictionary createDictionary(boolean caseSensitive,
List<String> constant) throws Exception {
- MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
- File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
- "/dictionaryWithLemma.txt").getFile());
-
- File dictOutFile = File.createTempFile(
- POSDictionayBuilderTest.class.getName(), ".dict");
-
- builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX);
-
- MorfologikTagDictionary ml = new MorfologikTagDictionary(
- Dictionary.read(dictOutFile.toURI().toURL()), caseSensitive);
+ Dictionary dic = Dictionary.read(POSDictionayBuilderTest.createMorfologikDictionary());
+ MorfologikTagDictionary ml = new MorfologikTagDictionary(dic, caseSensitive);
return ml;
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
new file mode 100644
index 0000000..6c6814b
--- /dev/null
+++ b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
@@ -0,0 +1,108 @@
+///*
+// * Licensed to the Apache Software Foundation (ASF) under one or more
+// * contributor license agreements. See the NOTICE file distributed with
+// * this work for additional information regarding copyright ownership.
+// * The ASF licenses this file to You under the Apache License, Version 2.0
+// * (the "License"); you may not use this file except in compliance with
+// * the License. You may obtain a copy of the License at
+// *
+// * http://www.apache.org/licenses/LICENSE-2.0
+// *
+// * Unless required by applicable law or agreed to in writing, software
+// * distributed under the License is distributed on an "AS IS" BASIS,
+// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// * See the License for the specific language governing permissions and
+// * limitations under the License.
+// */
+//
+//package opennlp.morfologik.tagdict;
+//
+//import static org.junit.Assert.assertTrue;
+//
+//import java.io.ByteArrayInputStream;
+//import java.io.ByteArrayOutputStream;
+//import java.io.File;
+//import java.io.IOException;
+//import java.io.InputStream;
+//import java.io.InputStreamReader;
+//import java.nio.charset.Charset;
+//import java.nio.file.Files;
+//import java.nio.file.Path;
+//import java.nio.file.Paths;
+//
+//import morfologik.stemming.DictionaryMetadata;
+//import morfologik.stemming.EncoderType;
+//import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+//import opennlp.morfologik.builder.POSDictionayBuilderTest;
+//import opennlp.tools.dictionary.Dictionary;
+//import opennlp.tools.postag.DefaultPOSSequenceValidator;
+//import opennlp.tools.postag.POSContextGenerator;
+//import opennlp.tools.postag.POSDictionary;
+//import opennlp.tools.postag.POSModel;
+//import opennlp.tools.postag.POSSample;
+//import opennlp.tools.postag.POSTaggerFactory;
+//import opennlp.tools.postag.POSTaggerME;
+//import opennlp.tools.postag.WordTagSampleStream;
+//import opennlp.tools.util.BaseToolFactory;
+//import opennlp.tools.util.InvalidFormatException;
+//import opennlp.tools.util.ObjectStream;
+//import opennlp.tools.util.TrainingParameters;
+//import opennlp.tools.util.model.ModelType;
+//
+//import org.junit.Test;
+//
+///**
+// * Tests for the {@link POSTaggerFactory} class.
+// */
+//public class POSTaggerFactoryTest {
+//
+// private static ObjectStream<POSSample> createSampleStream()
+// throws IOException {
+// InputStream in = POSTaggerFactoryTest.class.getClassLoader()
+// .getResourceAsStream("AnnotatedSentences.txt");
+//
+// return new WordTagSampleStream((new InputStreamReader(in)));
+// }
+//
+// static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory)
+// throws IOException {
+// return POSTaggerME.train("en", createSampleStream(),
+// TrainingParameters.defaultParams(), factory);
+// }
+//
+// @Test
+// public void testPOSTaggerWithCustomFactory() throws Exception {
+//
+// MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+// File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
+// "/dictionaryWithLemma.txt").getFile());
+//
+// File dictOutFile = File.createTempFile(
+// POSDictionayBuilderTest.class.getName(), ".dict");
+//
+// builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+",
+// EncoderType.PREFIX);
+//
+// Path dictPath = dictOutFile.toPath();
+// Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictPath);
+//
+// byte[] dic = Files.readAllBytes(dictPath);
+// byte[] meta = Files.readAllBytes(metaPath);
+//
+// POSModel posModel = trainPOSModel(ModelType.MAXENT,
+// new MorfologikPOSTaggerFactory(null, dic, meta));
+//
+// POSTaggerFactory factory = posModel.getFactory();
+// assertTrue(factory.getTagDictionary() instanceof MorfologikPOSTaggerFactory);
+//
+// ByteArrayOutputStream out = new ByteArrayOutputStream();
+// posModel.serialize(out);
+// ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
+//
+// POSModel fromSerialized = new POSModel(in);
+//
+// factory = fromSerialized.getFactory();
+// assertTrue(factory.getTagDictionary() instanceof MorfologikPOSTaggerFactory);
+// }
+//
+//}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/resources/AnnotatedSentences.txt
----------------------------------------------------------------------
diff --git a/src/test/resources/AnnotatedSentences.txt b/src/test/resources/AnnotatedSentences.txt
new file mode 100644
index 0000000..b40be87
--- /dev/null
+++ b/src/test/resources/AnnotatedSentences.txt
@@ -0,0 +1,136 @@
+Last_JJ September_NNP ,_, I_PRP tried_VBD to_TO find_VB out_RP the_DT address_NN of_IN an_DT old_JJ school_NN friend_NN whom_WP I_PRP had_VBD not_RB seen_VBN for_IN 15_CD years_NNS ._.
+I_PRP just_RB knew_VBD his_PRP$ name_NN ,_, Alan_NNP McKennedy_NNP ,_, and_CC I_PRP 'd_MD heard_VBD the_DT rumour_NN that_IN he_PRP 'd_MD moved_VBD to_TO Scotland_NNP ,_, the_DT country_NN of_IN his_PRP$ ancestors_NNS ._.
+So_IN I_PRP called_VBD Julie_NNP ,_, a_DT friend_NN who's_WDT still_RB in_IN contact_NN with_IN him_PRP ._.
+She_PRP told_VBD me_PRP that_IN he_PRP lived_VBD in_IN 23213_CD Edinburgh_NNP ,_, Worcesterstreet_NNP 12_CD ._.
+I_PRP wrote_VBD him_PRP a_DT letter_NN right_RB away_RB and_CC he_PRP answered_VBD soon_RB ,_, sounding_VBG very_RB happy_JJ and_CC delighted_JJ ._.
+
+Last_JJ year_NN ,_, I_PRP wanted_VBD to_TO write_VB a_DT letter_NN to_TO my_PRP$ grandaunt_NN ._.
+Her_PRP$ 86_CD th_NN birthday_NN was_VBD on_IN October_NNP 6_CD ,_, and_CC I_PRP no_RB longer_RB wanted_VBD to_TO be_VB hesitant_JJ to_TO get_VB in_IN touch_NN with_IN her_PRP ._.
+I_PRP did_VBD not_RB know_VB her_PRP face-to-face_RB ,_, and_CC so_RB it_PRP was_VBD not_RB easy_JJ for_IN me_PRP to_TO find_VB out_RP her_PRP$ address_NN ._.
+As_IN she_PRP had_VBD two_CD apartments_NNS in_IN different_JJ countries_NNS ,_, I_PRP decided_VBD to_TO write_VB to_TO both_DT ._.
+The_DT first_JJ was_VBD in_IN 12424_CD Paris_NNP in_IN Rue-de-Grandes-Illusions_NNP 5_CD ._.
+But_CC Marie_NNP Clara_NNP ,_, as_IN my_PRP$ aunt_NN is_VBZ called_VBN ,_, prefered_VBN her_PRP$ apartment_NN in_IN Berlin_NNP ._.
+It_PRP 's_VBZ postcode_JJ is_VBZ 30202_CD ._.
+She_PRP lived_VBD there_RB ,_, in_IN beautiful_JJ Kaiserstra\ufffde_NNP 13_CD ,_, particulary_NN in_IN summer_NN ._.
+
+Hi_UH my_PRP$ name_NN is_VBZ Stefanie_NNP Schmidt_NNP ,_, how_WRB much_RB is_VBZ a_DT taxi_NN from_IN Ostbahnhof_NNP to_TO Hauptbahnhof_NNP ?_.
+About_IN 10_CD Euro_NNP ,_, I_PRP reckon_VBP ._.
+That_DT sounds_VBZ good_JJ ._.
+So_RB please_VB call_VB a_DT driver_NN to_TO Leonardstra\ufffde_NNP 112_CD ,_, near_IN the_DT Ostbahnhof_NNP in_IN 56473_CD Hamburg_NNP ._.
+I_PRP 'd_MD like_VB to_TO be_VB at_IN Silberhornstra\ufffde_NNP 12_CD as_RB soon_RB as_IN possible_JJ ._.
+Thank_VB you_PRP very_RB much_RB !_.
+
+Hi_NNP Mike_NNP ,_, it_PRP 's_VBZ Stefanie_NNP Schmidt_NNP ._.
+I_PRP 'm_VBP in_IN N\ufffdrnberg_NNP at_IN the_DT moment_NN and_CC I_PRP 've_VBP got_VBD the_DT problem_NN that_IN my_PRP$ bike_NN has_VBZ broken_VBN ._.
+Could_MD you_PRP please_VB pick_VB me_PRP up_RP from_IN Seidlstra\ufffde_NNP 56_CD ,_, I_PRP 'm_VBP in_IN the_DT Caf\ufffd_NNP "Mondnacht"_NNP at_IN the_DT moment_NN ._.
+Please_VB hurry_VB up_RB ,_, I_PRP need_VBP to_TO be_VB back_RB in_IN Ulm_NNP at_IN 8_CD p.m._NN !_.
+
+My_PRP$ husband_NN George_NNP and_CC me_PRP recently_RB celebrated_VBD our_PRP$ 10_CD th_JJ wedding_NN anniversary_NN ._.
+We_PRP got_VBD married_VBN on_IN March_NNP 11_CD ,_, 1995_CD ._.
+Therefore_RB ,_, we_PRP found_VBD a_DT photo_NN album_NN with_IN pictures_NNS of_IN our_PRP$ first_JJ own_JJ apartment_NN ,_, which_WDT was_VBD in_IN 81234_CD Munich_NNP ._.
+As_IN a_DT young_JJ married_JJ couple_NN ,_, we_PRP did_VBD not_RB have_VB enough_JJ money_NN to_TO afford_VB a_DT bigger_JJR lodge_NN than_IN this_DT one_CD in_IN Blumenweg_NNP 1_CD ._.
+But_CC only_RB five_CD years_NNS later_RB ,_, my_PRP$ husband_NN was_VBD offered_VBN a_DT well-payed_JJ job_NN in_IN 17818_CD Hamburg_NNP ,_, so_IN we_PRP moved_VBD there_RB ._.
+Since_IN then_RB ,_, our_PRP$ guests_NNS have_VBP to_TO ring_VB at_IN Veilchenstra\ufffde_NNP 11_CD if_IN they_PRP want_VBP to_TO visit_VB us_PRP ,_, Luise_NNP and_CC George_NNP Bauer_NNP ._.
+
+I_PRP read_VBD your_PRP$ help-wanted_JJ ad_NN with_IN great_JJ attention_NN ._.
+I_PRP 'm_VBP a_DT student_NN of_IN informatics_NNS ,_, 6th_JJ semester,_NN and_CC I_PRP 'm_VBP very_RB interested_VBN in_IN your_PRP$ part-time_JJ job_NN offer_NN ._.
+I_PRP have_VBP a_DT competent_JJ knowledge_NN of_IN programming_NN and_CC foreign_JJ languages_NNS ,_, like_IN French_JJ and_CC Italian_JJ ._.
+I_PRP 'm_VBP looking_VBG forward_RB to_TO your_PRP$ reply_NN ._.
+
+Alisa_NNP Fernandes_NNP ,_, a_DT tourist_NN from_IN Spain_NNP ,_, went_VBD to_TO the_DT reception_NN desk_NN of_IN the_DT famous_JJ Highfly-Hotel_NNP in_IN 30303_CD Berlin_NNP ._.
+As_IN she_PRP felt_VBD quite_RB homesick_JJ ,_, she_PRP asked_VBD the_DT staff_NN if_IN they_PRP knew_VBD a_DT good_JJ Spanish_JJ restaurant_NN in_IN Berlin_NNP ._.
+The_DT concierge_NN told_VBD her_PRP to_TO go_VB to_TO the_DT "Tapasbar"_NN in_IN Chesterstr._NNP 2_CD ._.
+Alisa_NNP appreciated_VBD the_DT hint_NN and_CC enjoyed_VBD a_DT delicious_JJ traditional_JJ meal_NN ._.
+
+An_DT old_JJ friend_NN from_IN France_NNP is_VBZ currently_RB travelling_VBG around_IN Europe_NNP ._.
+Yesterday_NN ,_, she_PRP arrived_VBD in_IN Berlin_NNP and_CC we_PRP met_VBD up_RP spontaneously_RB ._.
+She_PRP wanted_VBD me_PRP to_TO show_VB her_PRP some_DT famous_JJ sights_NNS ,_, like_IN the_DT Brandenburger_NNP Tor_NNP and_CC the_DT Reichstag_NNP ._.
+But_CC it_PRP was_VBD not_RB easy_JJ to_TO meet_VB up_RP in_IN the_DT city_NN because_IN she_PRP hardly_RB knows_VBZ any_DT streetname_NN or_CC building_NN ._.
+So_IN I_PRP proposed_VBD to_TO meet_VB at_IN a_DT quite_RB local_JJ point:_NN the_DT caf\ufffd_NN "Daily's"_NN in_IN Unter-den-Linden_NNP 18,_CD 30291_CD Berlin_NNP ._.
+It_PRP is_VBZ five_CD minutes_NNS away_RB from_IN the_DT underground_JJ station_NN "Westbad"_NN ._.
+She_PRP found_VBD it_PRP instantly_RB and_CC we_PRP spent_VBD a_DT great_JJ day_NN in_IN the_DT capital_NN ._.
+
+Where_WRB did_VBD you_PRP get_VB those_DT great_JJ shoes_NNS ?_.
+They_PRP look_VBP amazing_JJ ,_, I_PRP love_VBP the_DT colour_NN ._.
+Are_VBP they_PRP made_VBN of_IN leather_NN ?_.
+No,_NNP that_DT 's_VBZ faked_VBN ._.
+But_CC anyway_RB ,_, I_PRP like_VBP them_PRP too_RB ._.
+I_PRP got_VBD them_PRP from_IN Hamburg._NNP
+Do_VBP not_RB you_PRP know_VB the_DT famous_JJ shop_NN in_IN Veilchenstra\ufffde_NNP ?_.
+It_PRP 's_VBZ called_VBN "Twentytwo"_NNP ._.
+I_PRP 've_VBP never_RB heard_VBN of_IN that_DT before_RB ._.
+Could_MD you_PRP give_VB me_PRP the_DT complete_JJ address_NN ?_.
+Sure_JJ ,_, it_PRP 's_VBZ in_IN Veilchenstra\ufffde_NNP 12_CD ,_, in_IN 78181_CD Hamburg_NNP ._.
+I_PRP deem_VBP it_PRP best_RB to_TO write_VB a_DT letter_NN to_TO the_DT owner_NN if_IN the_DT shoes_NNS are_VBP still_RB available_JJ ._.
+His_PRP$ name_NN is_VBZ Gerhard_NNP Fritsch_NNP ._.
+
+Hi_UH ,_, am_VBP I_PRP talking_VBG to_TO the_DT inquiries_NNS ?_.
+My_PRP$ name_NN is_VBZ Mike_NNP Sander_NNP and_CC I_PRP 'd_MD like_VB to_TO know_VB if_IN it_PRP is_VBZ possible_JJ to_TO get_VB information_NN about_IN an_DT address_NN if_IN I_PRP merely_RB know_VBP the_DT name_NN and_CC the_DT phone_NN number_NN of_IN a_DT person_NN !_.
+How_WRB is_VBZ he_PRP or_CC she_PRP called_VBD ?_.
+His_PRP$ name_NN is_VBZ Stefan_NNP Miller_NNP and_CC his_PRP$ number_NN is_VBZ the_DT 030/827234_CD ._.
+I'll_NNP have_VBP a_DT look_NN in_IN the_DT computer..._NN
+I_PRP found_VBD a_DT Stefan_NNP Miller_NNP who_WP lives_VBZ in_IN Leipzig._NNP
+Is_VBZ that_DT right_NN ?_.
+Yes_UH ,_, it_PRP definitely_RB is_VBZ ._.
+So_RB Stefan_NNP Miller_NNP lives_VBZ in_IN Heinrich-Heine-Stra\ufffde_NNP 112_CD ,_, in_IN 20193_CD Leipzig_NNP ._.
+Thank_VB you_PRP very_RB much_RB for_IN the_DT information_NN ._.
+Bye_NNP !_.
+
+On_IN July_NNP 14_CD ,_, the_DT father_NN of_IN a_DT family_NN got_VBD painfully_RB injured_VBN after_IN he_PRP had_VBD tried_VBN to_TO start_VB a_DT barbecue_NN ._.
+The_DT flaring_VBG flames_NNS burnt_VBP instantly_RB through_IN his_PRP$ jacket_NN ,_, which_WDT he_PRP managed_VBD to_TO pull_VB off_RP last-minute_JJ ._.
+Although_IN the_DT wounds_NNS were_VBD n't_RB life-threatening_JJ ,_, it_PRP was_VBD urgent_JJ to_TO bring_VB him_PRP directly_RB into_IN ambulance_NN ._.
+But_CC the_DT only_JJ hospital_NN that_WDT had_VBD opened_VBN that_IN Sunday_NNP was_VBD the_DT Paracelsus_NNP Hospital_NNP in_IN 83939_CD Weilheim_NNP ,_, which_WDT was_VBD 2_CD hours_NNS away_RB ._.
+Convulsed_JJ with_IN pain_NN ,_, the_DT man_NN finally_RB arrived_VBD in_IN Stifterstra\ufffde_NNP 15_CD ,_, where_WRB the_DT personal_NN immediately_RB took_VBD care_NN of_IN him_PRP ._.
+
+Last_JJ year_NN ,_, I_PRP worked_VBD as_IN a_DT delivery_NN boy_NN for_IN a_DT small_JJ local_JJ magazine_NN ._.
+I_PRP worked_VBD in_IN the_DT area_NN of_IN 83454_CD Ottobrunn_NNP ._.
+I_PRP had_VBD a_DT list_NN with_IN the_DT home_NN addresses_NNS of_IN our_PRP$ costumers_NNS whom_WP I_PRP brought_VBD their_PRP$ papers_NNS once_RB a_DT week_NN ._.
+An_DT elderly_JJ lady_NN ,_, who_WP was_VBD called_VBN Elenor_NNP Meier_NNP ,_, lived_VBD in_IN G\ufffdrtnerweg_NNP 6_CD ,_, and_CC I_PRP always_RB drove_VBD there_RB first_RB ,_, because_IN I_PRP liked_VBD her_PRP the_DT most_JJS ._.
+Afterwards_RB ,_, I_PRP went_VBD to_TO a_DT student_NN ,_, Gina_NNP Schneider_NNP ,_, who_WP lived_VBD still_RB in_IN her_PRP$ parent's_NNS house_NN in_IN G\ufffdrtnerweg_NNP 25_CD ._.
+The_DT last_JJ in_IN line_NN was_VBD the_DT retired_JJ teacher_NN Bruno_NNP Schulz_NNP in_IN Dramenstra\ufffde_NNP 15_CD ._.
+He_PRP was_VBD friendly_JJ enough_RB to_TO tip_VB sometimes_RB ._.
+
+Our_PRP$ business_NN company_NN was_VBD founded_VBN in_IN 1912_CD by_IN the_DT singer_NN and_CC entertainer_NN Michel_NNP Seile_NNP ._.
+He_PRP opened_VBD the_DT first_JJ agency_NN in_IN Erding_NNP ,_, a_DT small_JJ town_NN near_IN Munich_NNP ._.
+Now_RB ,_, more_JJR than_IN 90_CD years_NNS of_IN turbulent_JJ ups_NNS and_CC downs_NNS later_RB ,_, we_PRP finally_RB decided_VBD to_TO situate_VB our_PRP$ company_NN in_IN a_DT more_JJR central_JJ and_CC frequented_JJ area_NN ._.
+Last_JJ year_NN ,_, we_PRP moved_VBD into_IN an_DT empty_JJ factory_NN building_NN in_IN 30303_CD Berlin_NNP ._.
+It_PRP is_VBZ located_VBN in_IN Barmerstr._NNP 34_CD ._.
+
+When_WRB George_NNP Miller_NNP ,_, a_DT tourist_NN from_IN England_NNP ,_, came_VBD to_TO Munich_NNP ,_, he_PRP had_VBD no_DT idea_NN how_WRB to_TO read_VB the_DT city_NN maps_NNS ._.
+He_PRP depended_VBD completely_RB on_IN the_DT help_NN and_CC information_NN of_IN German_JJ pedestrians_NNS ._.
+One_CD day_NN ,_, he_PRP simply_RB could_MD not_RB find_VB the_DT famous_JJ Lenbachhaus_NNP ._.
+So_RB he_PRP asked_VBD a_DT young_JJ woman_NN for_IN help_NN ._.
+She_PRP pointed_VBD at_IN a_DT street_NN sign_NN and_CC explained_VBD to_TO him_PRP that_IN he_PRP 'd_MD find_VB the_DT Lenbachhaus_NNP in_IN Luisenstra\ufffde_NNP 33_CD ,_, which_WDT is_VBZ in_IN 80333_CD Munich_NNP ._.
+Miller_NNP was_VBD very_RB grateful_JJ and_CC could_MD finally_RB enjoy_VB the_DT exhibition_NN ._.
+
+On_IN March_NNP 15_CD ,_, there_EX was_VBD an_DT accident_NN near_IN Munich_NNP ._.
+The_DT driver_NN got_VBD badly_RB injured_VBN ._.
+Driving_VBG alone_RB not_RB far_RB from_IN her_PRP$ home_NN ,_, the_DT middle-aged_JJ woman_NN crashed_VBD at_IN high_JJ speed_NN into_IN a_DT tree_NN ._.
+A_DT resident_NN ,_, who_WP lives_VBZ near_IN the_DT street_NN where_WRB the_DT accident_NN took_VBD place_NN ,_, called_VBN instantly_RB the_DT police_NN ._.
+He_PRP reported_VBD what_WP had_VBD happened_VBN and_CC gave_VBD his_PRP$ name_NN and_CC address_NN to_TO the_DT officer_NN ._.
+He_PRP 's_VBZ called_VBN Peter_NNP Schubert_NNP and_CC he_PRP lives_VBZ at_IN Max-L\ufffdw-Stra\ufffde_NNP 13_CD in_IN 84630_CD Gauting_NNP ._.
+The_DT police_NN arrived_VBD ten_CD minutes_NNS later_RB and_CC brought_VBD the_DT woman_NN into_IN hospital_NN ._.
+Although_IN she_PRP had_VBD multiple_JJ trauma_NN ,_, she_PRP 's_VBZ out_IN of_IN mortal_JJ danger_NN ._.
+
+Hi_NNP ,_, how_WRB are_VBP you_PRP ?_.
+Are_VBP nt't_RB you_PRP a_DT friend_NN of_IN Natalie_NNP ?_.
+Yeah_UH for_IN sure_JJ ._.
+How_WRB did_VBD you_PRP know_VB that_DT ?_.
+I_PRP saw_VBD you_PRP sitting_VBG next_JJ to_TO her_PRP at_IN uni_JJ ._.
+Yeah_NNP she_PRP 's_VBZ my_PRP$ best_JJS friend_NN ._.
+Are_VBP you_PRP going_VBG to_TO her_PRP party_NN next_JJ friday_NN ?_.
+Oh_UH yes_UH ,_, I_PRP 'd_MD really_RB like_VB to_TO ._.
+But_CC in_IN fact_NN I_PRP do_VBP n't_RB know_VB yet_RB where_WRB it_PRP takes_VBZ place_NN ._.
+I_PRP can_MD tell_VB you_PRP :_: ring_NN at_IN Baumann,_NNP Meisenstra\ufffde_NNP 5_CD ,_, in_IN 81737_CD Munich_NNP ._.
+The_DT party_NN starts_VBZ at_IN 9_CD p.m._NN ._.
+I_PRP hope_VBP you_PRP 'll_MD find_VB it_PRP ._.
+Thank_VB you_PRP very_RB much_RB ,_, see_VBP you_PRP next_JJ friday_NN !_.
+
+My_PRP$ name_NN is_VBZ Michael_NNP Hinterhofer_NNP ._.
+When_WRB I_PRP was_VBD 21_CD ,_, I_PRP moved_VBD out_RP from_IN my_PRP$ parents_NNS home_NN into_IN my_PRP$ first_JJ own_JJ appartment_NN in_IN order_NN to_TO study_VB in_IN a_DT bigger_JJR city_NN ._.
+My_PRP$ new_JJ home_NN was_VBD in_IN Lilienstra\ufffde_NNP 1_CD in_IN 25334_CD Hamburg_NNP ._.
+But_CC I_PRP realized_VBD quickly_RB that_IN life_NN in_IN a_DT metropolis_NN was_VBD n't_RB relaxed_VBN enough_RB for_IN me_PRP ._.
+So_IN I_PRP decided_VBD to_TO move_VB into_IN a_DT smaller_JJR town_NN ._.
+Now_RB I_PRP 'm_VBP a_DT tenant_NN with_IN an_DT elderly_JJ widow_NN ._.
+We_PRP live_VBP in_IN B\ufffdrgerstra\ufffde_NNP 2_CD in_IN 63737_CD Heidelberg_NNP ._.
+I_PRP really_RB like_IN the_DT smalltown_JJ flair_NN and_CC my_PRP$ studies_NNS at_IN Heidelberg_NNP 's_POS notable_JJ university_NN ._.
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/resources/dictionaryWithLemma.info
----------------------------------------------------------------------
diff --git a/src/test/resources/dictionaryWithLemma.info b/src/test/resources/dictionaryWithLemma.info
new file mode 100644
index 0000000..ad5fe8d
--- /dev/null
+++ b/src/test/resources/dictionaryWithLemma.info
@@ -0,0 +1,15 @@
+#
+# REQUIRED PROPERTIES
+#
+
+# Column (lemma, inflected, tag) separator. This must be a single byte in the target encoding.
+fsa.dict.separator=,
+
+# The charset in which the input is encoded. UTF-8 is strongly recommended.
+fsa.dict.encoding=UTF-8
+
+# The type of lemma-inflected form encoding compression that precedes automaton
+# construction. Allowed values: [suffix, infix, prefix, none].
+# Details are in Daciuk's paper and in the code.
+# Leave at 'prefix' if not sure.
+fsa.dict.encoder=prefix
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/resources/dictionaryWithLemma.txt
----------------------------------------------------------------------
diff --git a/src/test/resources/dictionaryWithLemma.txt b/src/test/resources/dictionaryWithLemma.txt
index 5ac7111..09d39e3 100644
--- a/src/test/resources/dictionaryWithLemma.txt
+++ b/src/test/resources/dictionaryWithLemma.txt
@@ -1,10 +1,11 @@
-casa casa NOUN
-casa casar V
-Casa Casa PROP
-casinha casa NOUN
-casona casa NOUN
-menina menino NOUN
-menino menino NOUN
-meninão menino NOUN
-menininho menino NOUN
-carro NOUN
+casa,casa,NOUN
+casar,casa,V
+casar,casar,V-INF
+Casa,Casa,PROP
+casa,casinha,NOUN
+casa,casona,NOUN
+menino,menina,NOUN
+menino,menino,NOUN
+menino,meninão,NOUN
+menino,menininho,NOUN
+carro,carro,NOUN
\ No newline at end of file
[02/16] opennlp git commit: OPENNLP-622 Added code to create
Morfologik data from TSV or OpenNLP XML tag dictionaries. Created a
TagDictionary implementation using Morfologik. Added a POSTaggerFactory to
bundle the Morfologik dictionaries in POS Tagger models.
Posted by co...@apache.org.
OPENNLP-622 Added code to create Morfologik data from TSV or OpenNLP XML tag dictionaries. Created a TagDictionary implementation using Morfologik. Added a POSTaggerFactory to bundle the Morfologik dictionaries in POS Tagger models.
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/78dd579b
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/78dd579b
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/78dd579b
Branch: refs/heads/trunk
Commit: 78dd579b0e013b3132caae35afe71113764742e9
Parents: f3e9057
Author: William Colen <co...@apache.org>
Authored: Mon Dec 2 13:23:04 2013 +0000
Committer: William Colen <co...@apache.org>
Committed: Mon Dec 2 13:23:04 2013 +0000
----------------------------------------------------------------------
pom.xml | 19 +-
.../builder/MorfologikDictionayBuilder.java | 163 ++++++++++++++++
.../java/opennlp/morfologik/cmdline/CLI.java | 164 +++++++++++++++++
.../MorfologikDictionaryBuilderParams.java | 49 +++++
.../MorfologikDictionaryBuilderTool.java | 71 +++++++
.../builder/XMLDictionaryToTableParams.java | 36 ++++
.../builder/XMLDictionaryToTableTool.java | 82 +++++++++
.../tagdict/MorfologikPOSTaggerFactory.java | 184 +++++++++++++++++++
.../tagdict/MorfologikTagDictionary.java | 90 +++++++++
.../builder/POSDictionayBuilderTest.java | 101 ++++++++++
.../lemmatizer/MorfologikLemmatizerTest.java | 46 +++++
.../tagdict/MorfologikTagDictionaryTest.java | 92 ++++++++++
src/test/resources/dictionaryWithLemma.txt | 10 +
13 files changed, 1101 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 67e1eaa..51854f6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -33,6 +33,12 @@
<version>1.6.0</version>
<scope>compile</scope>
</dependency>
+ <dependency>
+ <groupId>org.carrot2</groupId>
+ <artifactId>morfologik-tools</artifactId>
+ <version>1.6.0</version>
+ <scope>compile</scope>
+ </dependency>
<dependency>
<groupId>org.apache.opennlp</groupId>
@@ -40,11 +46,12 @@
<version>1.6.0-SNAPSHOT</version>
</dependency>
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>3.8.1</version>
- <scope>test</scope>
- </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.8.1</version>
+ <scope>test</scope>
+ </dependency>
+
</dependencies>
</project>
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
new file mode 100644
index 0000000..b8bcfbf
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.builder;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+
+import morfologik.stemming.Dictionary;
+import morfologik.tools.FSABuildTool;
+import morfologik.tools.Launcher;
+
+/**
+ * Utility class to build Morfologik dictionaries from a tab separated values
+ * file. The first column is the word, the second its lemma and the third a POS
+ * tag. If there is no lemma information, leave the second column empty.
+ */
+public class MorfologikDictionayBuilder {
+
+ /**
+ * Builds a Morfologik binary dictionary; the properties file name is derived from dictOutFile via Dictionary.getExpectedFeaturesName.
+ *
+ * @param dictInFile
+ * the 3 column TSV dictionary file
+ * @param dictOutFile
+ * where to store the binary Morfologik dictionary
+ * @param encoding
+ * the charset whose name is recorded as fsa.dict.encoding in the properties
+ * @param separator
+ * a field separator, e.g. '+'. If your tags contain '+',
+ * change it to something else
+ * @param isUsePrefixes
+ * whether to compact using prefixes
+ * @param isUseInfixes
+ * whether to compact using infixes
+ * @throws Exception if a build step or storing the properties file fails
+ */
+ public void build(File dictInFile, File dictOutFile, Charset encoding,
+ String separator, boolean isUsePrefixes, boolean isUseInfixes)
+ throws Exception {
+
+ File propertiesFile = new File(
+ Dictionary.getExpectedFeaturesName(dictOutFile.getAbsolutePath()));
+ this.build(dictInFile, dictOutFile, propertiesFile, encoding, separator,
+ isUsePrefixes, isUseInfixes);
+ }
+
+ /**
+ * Builds a Morfologik binary dictionary and writes its properties to an explicitly given file.
+ *
+ * @param dictInFile
+ * the 3 column TSV dictionary file
+ * @param dictOutFile
+ * where to store the binary Morfologik dictionary
+ * @param propertiesOutFile
+ * where to store the properties of the Morfologik dictionary
+ * @param encoding
+ * the charset whose name is recorded as fsa.dict.encoding in the properties
+ * @param separator
+ * a field separator, e.g. '+'. If your tags contain '+',
+ * change it to something else
+ * @param isUsePrefixes
+ * whether to compact using prefixes
+ * @param isUseInfixes
+ * whether to compact using infixes
+ * @throws Exception if a build step or storing the properties file fails
+ */
+ public void build(File dictInFile, File dictOutFile, File propertiesOutFile,
+ Charset encoding, String separator, boolean isUsePrefixes,
+ boolean isUseInfixes) throws Exception {
+
+ // we need to execute tab2morph followed by fsa_build
+
+ File morph = tab2morph(dictInFile, separator, isUsePrefixes, isUseInfixes);
+
+ fsaBuild(morph, dictOutFile);
+ // the intermediate tab2morph output is no longer needed; the result of delete() is ignored
+ morph.delete();
+
+ // now we create the properties file using the passed parameters
+ createProperties(encoding, separator, isUsePrefixes, isUseInfixes,
+ propertiesOutFile);
+ }
+ /** Stores the separator, encoding name and prefix/infix flags as fsa.dict.* properties in propertiesFile. */
+ void createProperties(Charset encoding, String separator,
+ boolean isUsePrefixes, boolean isUseInfixes, File propertiesFile)
+ throws FileNotFoundException, IOException {
+
+ Properties properties = new Properties();
+ properties.setProperty("fsa.dict.separator", separator);
+ properties.setProperty("fsa.dict.encoding", encoding.name());
+ properties.setProperty("fsa.dict.uses-prefixes",
+ Boolean.toString(isUsePrefixes));
+ properties.setProperty("fsa.dict.uses-infixes",
+ Boolean.toString(isUseInfixes));
+
+ OutputStream os = new FileOutputStream(propertiesFile);
+ properties.store(os, "Morfologik POS Dictionary properties");
+ os.close();
+
+ }
+ /** Invokes FSABuildTool.main with "-f cfsa2", passing morph as "-i" and dictOutFile as "-o". */
+ private void fsaBuild(File morph, File dictOutFile) throws Exception {
+ String[] params = { "-f", "cfsa2", "-i", morph.getAbsolutePath(), "-o",
+ dictOutFile.getAbsolutePath() };
+ FSABuildTool.main(params);
+ }
+ /** Invokes Launcher.main with the "tab2morph" command on dictInFile and returns the temporary file passed as "-o". */
+ private File tab2morph(File dictInFile, String separator,
+ boolean isUsePrefixes, boolean isUseInfixes) throws Exception {
+
+ // create tab2morph parameters
+ List<String> tag2morphParams = new ArrayList<String>();
+ tag2morphParams.add("tab2morph");
+
+ tag2morphParams.add("--annotation");
+ tag2morphParams.add(separator);
+
+ if (isUsePrefixes) {
+ tag2morphParams.add("-pre");
+ }
+
+ if (isUseInfixes) {
+ tag2morphParams.add("-inf");
+ }
+
+ tag2morphParams.add("-i");
+ tag2morphParams.add(dictInFile.getAbsolutePath());
+
+ // we need a temporary file to store the intermediate output
+ File tmp = File.createTempFile("tab2morph", ".txt");
+ tmp.deleteOnExit();
+
+ tag2morphParams.add("-o");
+ tag2morphParams.add(tmp.getAbsolutePath());
+
+ Launcher.main(tag2morphParams.toArray(new String[tag2morphParams.size()]));
+
+ return tmp;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/cmdline/CLI.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/CLI.java b/src/main/java/opennlp/morfologik/cmdline/CLI.java
new file mode 100644
index 0000000..66a5151
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/cmdline/CLI.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline;
+
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.morfologik.cmdline.builder.MorfologikDictionaryBuilderTool;
+import opennlp.morfologik.cmdline.builder.XMLDictionaryToTableTool;
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineTool;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.TypedCmdLineTool;
+import opennlp.tools.util.Version;
+
+ /**
+  * Command line entry point of the OpenNLP Morfologik addon.
+  * <p>
+  * The first argument selects a registered tool by name, optionally followed
+  * by {@code .format}; all remaining arguments are handed to that tool.
+  */
+ public final class CLI {
+
+   public static final String CMD = "opennlp-morfologik-addon";
+
+   /** Registered tools keyed by name, in registration order; unmodifiable. */
+   private static final Map<String, CmdLineTool> toolLookupMap;
+
+   static {
+     List<CmdLineTool> tools = new LinkedList<CmdLineTool>();
+
+     tools.add(new MorfologikDictionaryBuilderTool());
+     tools.add(new XMLDictionaryToTableTool());
+
+     Map<String, CmdLineTool> lookup = new LinkedHashMap<String, CmdLineTool>();
+     for (CmdLineTool tool : tools) {
+       lookup.put(tool.getName(), tool);
+     }
+
+     toolLookupMap = Collections.unmodifiableMap(lookup);
+   }
+
+   /**
+    * @return a set which contains all tool names
+    */
+   public static Set<String> getToolNames() {
+     return toolLookupMap.keySet();
+   }
+
+   /**
+    * Prints the addon version, the list of registered tools with their short
+    * descriptions, and an example invocation to standard output.
+    */
+   private static void usage() {
+     System.out.print("OpenNLP Morfologik Addon "
+         + Version.currentVersion().toString() + ". ");
+     System.out.println("Usage: " + CMD + " TOOL");
+     System.out.println("where TOOL is one of:");
+
+     // distance of the description from the start of the tool name:
+     // the longest tool name plus a gap of four
+     int numberOfSpaces = -1;
+     for (String toolName : toolLookupMap.keySet()) {
+       numberOfSpaces = Math.max(numberOfSpaces, toolName.length());
+     }
+     numberOfSpaces = numberOfSpaces + 4;
+
+     for (CmdLineTool tool : toolLookupMap.values()) {
+       String name = tool.getName();
+
+       System.out.print(" " + name);
+
+       // never negative: every name is at most as long as the longest one
+       int padding = numberOfSpaces - name.length();
+       for (int i = 0; i < padding; i++) {
+         System.out.print(" ");
+       }
+
+       System.out.println(tool.getShortDescription());
+     }
+
+     System.out
+         .println("All tools print help when invoked with help parameter");
+
+     // build the example from a tool that is actually registered, so the
+     // printed command line is always valid
+     if (!toolLookupMap.isEmpty()) {
+       System.out.println("Example: " + CMD + " "
+           + toolLookupMap.keySet().iterator().next() + " help");
+     }
+   }
+
+   /**
+    * Dispatches to the tool named by the first argument.
+    * <p>
+    * Without arguments the usage text is printed. If the tool is invoked
+    * without arguments although it has parameters, or its first argument is
+    * {@code help}, the tool's help text is printed. In both cases the JVM
+    * exits with code 0. A {@link TerminateToolException} ends the JVM with
+    * the code carried by the exception.
+    *
+    * @param args
+    *          tool name (optionally {@code name.format}) followed by the tool
+    *          arguments
+    */
+   public static void main(String[] args) {
+
+     if (args.length == 0) {
+       usage();
+       System.exit(0);
+     }
+
+     String[] toolArguments = new String[args.length - 1];
+     System.arraycopy(args, 1, toolArguments, 0, toolArguments.length);
+
+     String toolName = args[0];
+
+     // check for format: "tool.format" selects a non-default format
+     String formatName = StreamFactoryRegistry.DEFAULT_FORMAT;
+     int idx = toolName.indexOf(".");
+     if (-1 < idx) {
+       formatName = toolName.substring(idx + 1);
+       toolName = toolName.substring(0, idx);
+     }
+     CmdLineTool tool = toolLookupMap.get(toolName);
+
+     try {
+       if (null == tool) {
+         throw new TerminateToolException(1, "Tool " + toolName
+             + " is not found.");
+       }
+
+       boolean missingArguments = 0 == toolArguments.length
+           && tool.hasParams();
+       boolean helpRequested = 0 < toolArguments.length
+           && "help".equals(toolArguments[0]);
+
+       if (missingArguments || helpRequested) {
+         if (tool instanceof TypedCmdLineTool) {
+           System.out.println(((TypedCmdLineTool) tool)
+               .getHelp(formatName));
+         } else if (tool instanceof BasicCmdLineTool) {
+           System.out.println(tool.getHelp());
+         }
+
+         System.exit(0);
+       }
+
+       if (tool instanceof TypedCmdLineTool) {
+         ((TypedCmdLineTool) tool).run(formatName, toolArguments);
+       } else if (tool instanceof BasicCmdLineTool) {
+         if (-1 == idx) {
+           ((BasicCmdLineTool) tool).run(toolArguments);
+         } else {
+           throw new TerminateToolException(1, "Tool " + toolName
+               + " does not support formats.");
+         }
+       } else {
+         throw new TerminateToolException(1, "Tool " + toolName
+             + " is not supported.");
+       }
+     } catch (TerminateToolException e) {
+
+       if (e.getMessage() != null) {
+         System.err.println(e.getMessage());
+       }
+
+       if (e.getCause() != null) {
+         System.err.println(e.getCause().getMessage());
+         e.getCause().printStackTrace(System.err);
+       }
+
+       System.exit(e.getCode());
+     }
+   }
+ }
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
new file mode 100644
index 0000000..0b1e896
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.params.EncodingParameter;
+
+/**
+ * Params for Dictionary tools.
+ */
+interface MorfologikDictionaryBuilderParams extends EncodingParameter {
+
+ @ParameterDescription(valueName = "in", description = "Plain file with one entry per line")
+ File getInputFile();
+
+ @ParameterDescription(valueName = "out", description = "The generated dictionary file.")
+ File getOutputFile();
+
+ @ParameterDescription(valueName = "sep", description = "The FSA dictionary separator. Default is '+'.")
+ @OptionalParameter(defaultValue = "+")
+ String getFSADictSeparator();
+
+ @ParameterDescription(valueName = "true|false", description = "Compact using prefixes.")
+ @OptionalParameter(defaultValue = "true")
+ Boolean getUsesPrefixes();
+
+ @ParameterDescription(valueName = "true|false", description = "Compact using infixes.")
+ @OptionalParameter(defaultValue = "true")
+ Boolean getUsesInfixes();
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
new file mode 100644
index 0000000..9da7e7d
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+import java.nio.charset.Charset;
+
+import morfologik.stemming.Dictionary;
+import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+
+ /**
+  * Command line tool that builds a binary Morfologik dictionary, together
+  * with its properties file, from a plain text dictionary.
+  */
+ public class MorfologikDictionaryBuilderTool extends BasicCmdLineTool {
+
+   interface Params extends MorfologikDictionaryBuilderParams {
+   }
+
+   @Override
+   public String getShortDescription() {
+     return "builds a binary POS Dictionary using Morfologik";
+   }
+
+   @Override
+   public String getHelp() {
+     return getBasicHelp(Params.class);
+   }
+
+   /**
+    * Parses the arguments, checks the input and output files and runs the
+    * dictionary builder.
+    *
+    * @param args
+    *          the tool arguments as described by {@link Params}
+    * @throws TerminateToolException
+    *           with code -1 if the builder fails for any reason
+    */
+   @Override
+   public void run(String[] args) {
+     Params params = validateAndParseParams(args, Params.class);
+
+     File dictInFile = params.getInputFile();
+     File dictOutFile = params.getOutputFile();
+     // the properties file name is derived from the dictionary file name
+     File propertiesFile = getExpectedPropertiesFile(dictOutFile);
+     Charset encoding = params.getEncoding();
+
+     CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
+     CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
+     CmdLineUtil.checkOutputFile("properties output file", propertiesFile);
+
+     MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+     try {
+       builder.build(dictInFile, dictOutFile, propertiesFile, encoding,
+           params.getFSADictSeparator(), params.getUsesPrefixes(),
+           params.getUsesInfixes());
+     } catch (Exception e) {
+       throw new TerminateToolException(-1,
+           "Error while creating Morfologik POS Dictionary: " + e.getMessage(),
+           e);
+     }
+
+   }
+
+   /**
+    * @param dictFile
+    *          the dictionary file
+    * @return the properties file Morfologik expects next to the dictionary
+    */
+   private File getExpectedPropertiesFile(File dictFile) {
+     return new File(Dictionary.getExpectedFeaturesName(dictFile
+         .getAbsolutePath()));
+   }
+
+ }
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
new file mode 100644
index 0000000..b88cc5d
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.params.EncodingParameter;
+
+/**
+ * Params for Dictionary tools.
+ */
+interface XMLDictionaryToTableParams extends EncodingParameter {
+
+ @ParameterDescription(valueName = "in", description = "OpenNLP XML Tag Dictionary.")
+ File getInputFile();
+
+ @ParameterDescription(valueName = "out", description = "Tab separated format.")
+ File getOutputFile();
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
new file mode 100644
index 0000000..c87f016
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.util.Iterator;
+
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.postag.POSDictionary;
+
+ /**
+  * Command line tool that reads an OpenNLP XML tag dictionary and writes it
+  * as a tab separated file with one {@code word<TAB><TAB>tag} line per
+  * word/tag pair; the lemma column in the middle is left empty.
+  */
+ public class XMLDictionaryToTableTool extends BasicCmdLineTool {
+
+   interface Params extends XMLDictionaryToTableParams {
+   }
+
+   @Override
+   public String getShortDescription() {
+     return "reads an OpenNLP XML tag dictionary and outputs it in a tab separated file";
+   }
+
+   @Override
+   public String getHelp() {
+     return getBasicHelp(Params.class);
+   }
+
+   /**
+    * Parses the arguments, loads the XML dictionary and writes the table.
+    *
+    * @param args
+    *          the tool arguments as described by {@link Params}
+    * @throws TerminateToolException
+    *           with code -1 if the dictionary cannot be loaded or the output
+    *           cannot be written
+    */
+   @Override
+   public void run(String[] args) {
+     Params params = validateAndParseParams(args, Params.class);
+
+     File dictInFile = params.getInputFile();
+     File dictOutFile = params.getOutputFile();
+     Charset encoding = params.getEncoding();
+
+     CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
+     CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
+
+     POSDictionary tagDictionary;
+     // close the input stream once the dictionary is loaded or loading failed
+     try (FileInputStream dictIn = new FileInputStream(dictInFile)) {
+       tagDictionary = POSDictionary.create(dictIn);
+     } catch (IOException e) {
+       throw new TerminateToolException(-1,
+           "Error while loading XML POS Dictionary: " + e.getMessage(), e);
+     }
+     Iterator<String> iterator = tagDictionary.iterator();
+
+     // try-with-resources closes the writer, no explicit close() is needed
+     try (BufferedWriter writer = Files.newBufferedWriter(dictOutFile.toPath(),
+         encoding)) {
+       while (iterator.hasNext()) {
+         String word = iterator.next();
+         String wordAndLemma = word + "\t\t"; // lemma is empty
+         for (String tag : tagDictionary.getTags(word)) {
+           writer.write(wordAndLemma + tag);
+           writer.newLine();
+         }
+       }
+     } catch (IOException e) {
+       throw new TerminateToolException(-1, "Error while writing output: "
+           + e.getMessage(), e);
+     }
+   }
+
+ }
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
new file mode 100644
index 0000000..9b74ae5
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.tagdict;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Map;
+
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.postag.POSTaggerFactory;
+import opennlp.tools.postag.TagDictionary;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.ArtifactSerializer;
+import opennlp.tools.util.model.ModelUtil;
+
+ /**
+  * A {@link POSTaggerFactory} whose tag dictionary is backed by a Morfologik
+  * binary dictionary.
+  * <p>
+  * On initialization the dictionary and its info file are read from the path
+  * given by the {@code morfologik.dict} system property and kept as raw bytes
+  * so they can be stored as model artifacts. When no dictionary was
+  * initialized, {@link #getTagDictionary()} rebuilds it from the artifacts of
+  * the artifact provider.
+  */
+ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
+
+   private static final String MORFOLOGIK_POSDICT_SUF = "morfologik_dict";
+   private static final String MORFOLOGIK_DICT_INFO_SUF = "morfologik_info";
+
+   private static final String MORFOLOGIK_POSDICT = "tagdict."
+       + MORFOLOGIK_POSDICT_SUF;
+   private static final String MORFOLOGIK_DICT_INFO = "tagdict."
+       + MORFOLOGIK_DICT_INFO_SUF;
+
+   private TagDictionary dict;
+
+   // raw content of the info file and of the binary dictionary
+   private byte[] dictInfo;
+   private byte[] dictData;
+
+   public MorfologikPOSTaggerFactory() {
+   }
+
+   /**
+    * @param ngramDictionary
+    *          the ngram dictionary, passed on to the super class
+    * @param posDictionary
+    *          not passed on; the super class receives {@code null} instead
+    */
+   public MorfologikPOSTaggerFactory(Dictionary ngramDictionary,
+       TagDictionary posDictionary) {
+     super(ngramDictionary, null);
+   }
+
+   /**
+    * Initializes the factory and loads the Morfologik dictionary named by the
+    * {@code morfologik.dict} system property.
+    *
+    * @throws IllegalArgumentException
+    *           if the property is not set, the files cannot be read or they
+    *           are not a Morfologik dictionary
+    */
+   @Override
+   protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) {
+     super.init(ngramDictionary, null);
+     this.dict = posDictionary;
+
+     // get the dictionary path
+     String path = System.getProperty("morfologik.dict");
+     if (path == null) {
+       throw new IllegalArgumentException(
+           "The property morfologik.dict is missing! -Dmorfologik.dict=path");
+     }
+
+     // now we try to load it...
+     try {
+       this.dictData = Files.readAllBytes(Paths.get(path));
+       this.dictInfo = Files.readAllBytes(Paths
+           .get(morfologik.stemming.Dictionary.getExpectedFeaturesName(path)));
+
+       this.dict = createMorfologikDictionary(dictData, dictInfo);
+
+     } catch (IllegalArgumentException e) {
+       throw new IllegalArgumentException(
+           "The file is not a Morfologik dictionary!", e);
+     } catch (IOException e) {
+       throw new IllegalArgumentException(
+           "Could not open the Morfologik dictionary or the .info file", e);
+     }
+   }
+
+   /**
+    * Returns the tag dictionary, creating it from the artifacts of the
+    * artifact provider on first use when none was initialized.
+    *
+    * @return the tag dictionary, or {@code null} if none is available
+    * @throws RuntimeException
+    *           if the dictionary artifacts are incomplete or cannot be loaded
+    */
+   @Override
+   public TagDictionary getTagDictionary() {
+     if (this.dict != null || artifactProvider == null) {
+       return this.dict;
+     }
+
+     Object data = artifactProvider.getArtifact(MORFOLOGIK_POSDICT);
+     if (data == null) {
+       // no Morfologik dictionary among the artifacts
+       return this.dict;
+     }
+
+     Object info = artifactProvider.getArtifact(MORFOLOGIK_DICT_INFO);
+     if (info == null) {
+       // fail with a clear message instead of a bare NullPointerException
+       throw new RuntimeException("The artifact " + MORFOLOGIK_DICT_INFO
+           + " is missing, cannot load the Morfologik dictionary.");
+     }
+
+     try {
+       this.dict = createMorfologikDictionary((byte[]) data, (byte[]) info);
+     } catch (IllegalArgumentException e) {
+       throw new RuntimeException(
+           "Could not load the dictionary files to Morfologik.", e);
+     } catch (IOException e) {
+       throw new RuntimeException(
+           "IO error while reading the Morfologik dictionary files.", e);
+     }
+
+     return this.dict;
+   }
+
+   @Override
+   public void setTagDictionary(TagDictionary dictionary) {
+     throw new UnsupportedOperationException(
+         "Morfologik POS Tagger factory does not support this operation");
+   }
+
+   @Override
+   public TagDictionary createEmptyTagDictionary() {
+     throw new UnsupportedOperationException(
+         "Morfologik POS Tagger factory does not support this operation");
+   }
+
+   @Override
+   public TagDictionary createTagDictionary(File dictionary)
+       throws InvalidFormatException, FileNotFoundException, IOException {
+     throw new UnsupportedOperationException(
+         "Morfologik POS Tagger factory does not support this operation");
+   }
+
+   @Override
+   public TagDictionary createTagDictionary(InputStream in)
+       throws InvalidFormatException, IOException {
+     throw new UnsupportedOperationException(
+         "Morfologik POS Tagger factory does not support this operation");
+   }
+
+   /**
+    * Adds byte array serializers for the two Morfologik artifact suffixes to
+    * the serializers of the super class.
+    */
+   @Override
+   @SuppressWarnings("rawtypes")
+   public Map<String, ArtifactSerializer> createArtifactSerializersMap() {
+     Map<String, ArtifactSerializer> serializers = super
+         .createArtifactSerializersMap();
+
+     serializers.put(MORFOLOGIK_POSDICT_SUF, new ByteArraySerializer());
+     serializers.put(MORFOLOGIK_DICT_INFO_SUF, new ByteArraySerializer());
+
+     return serializers;
+   }
+
+   /**
+    * Adds the raw dictionary and info bytes to the artifacts of the super
+    * class.
+    */
+   @Override
+   public Map<String, Object> createArtifactMap() {
+     Map<String, Object> artifactMap = super.createArtifactMap();
+     artifactMap.put(MORFOLOGIK_POSDICT, this.dictData);
+     artifactMap.put(MORFOLOGIK_DICT_INFO, this.dictInfo);
+     return artifactMap;
+   }
+
+   /**
+    * Builds a tag dictionary from the raw dictionary and info bytes.
+    */
+   private TagDictionary createMorfologikDictionary(byte[] data, byte[] info)
+       throws IOException {
+     morfologik.stemming.Dictionary morfologikDict = morfologik.stemming.Dictionary
+         .readAndClose(new ByteArrayInputStream(data),
+             new ByteArrayInputStream(info));
+     return new MorfologikTagDictionary(morfologikDict);
+   }
+
+   /**
+    * Serializer that stores an artifact as plain bytes.
+    */
+   static class ByteArraySerializer implements ArtifactSerializer<byte[]> {
+
+     @Override
+     public byte[] create(InputStream in) throws IOException,
+         InvalidFormatException {
+
+       return ModelUtil.read(in);
+     }
+
+     @Override
+     public void serialize(byte[] artifact, OutputStream out) throws IOException {
+       out.write(artifact);
+     }
+   }
+
+ }
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
new file mode 100644
index 0000000..b34ca2b
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.tagdict;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.IStemmer;
+import morfologik.stemming.WordData;
+import opennlp.tools.postag.TagDictionary;
+
+/**
+ * A POS Tagger dictionary implementation based on Morfologik binary
+ * dictionaries
+ */
+public class MorfologikTagDictionary implements TagDictionary {
+
+ private IStemmer dictLookup;
+ private boolean isCaseSensitive;
+
+ /**
+ * Creates a case sensitive {@link MorfologikTagDictionary}
+ *
+ * @param dict
+ * a Morfologik FSA dictionary
+ * @throws IllegalArgumentException
+ * if FSA's root node cannot be acquired (dictionary is empty).
+ * @throws IOException
+ * could not read dictionary from dictURL
+ */
+ public MorfologikTagDictionary(Dictionary dict)
+ throws IllegalArgumentException, IOException {
+ this(dict, true);
+ }
+
+ /**
+ * Creates MorfologikLemmatizer
+ *
+ * @param dict
+ * a Morfologik FSA dictionary
+ * @param caseSensitive
+ * if true it performs case sensitive lookup
+ * @throws IllegalArgumentException
+ * if FSA's root node cannot be acquired (dictionary is empty).
+ * @throws IOException
+ * could not read dictionary from dictURL
+ */
+ public MorfologikTagDictionary(Dictionary dict, boolean caseSensitive)
+ throws IllegalArgumentException, IOException {
+ this.dictLookup = new DictionaryLookup(dict);
+ this.isCaseSensitive = caseSensitive;
+ }
+
+ @Override
+ public String[] getTags(String word) {
+ if (!isCaseSensitive) {
+ word = word.toLowerCase();
+ }
+
+ List<WordData> data = dictLookup.lookup(word);
+ if (data != null && data.size() > 0) {
+ List<String> tags = new ArrayList<String>(data.size());
+ for (int i = 0; i < data.size(); i++) {
+ tags.add(data.get(i).getTag().toString());
+ }
+ if (tags.size() > 0)
+ return tags.toArray(new String[tags.size()]);
+ return null;
+ }
+ return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
new file mode 100644
index 0000000..16d1dac
--- /dev/null
+++ b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.builder;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Properties;
+
+import junit.framework.TestCase;
+import opennlp.morfologik.lemmatizer.MorfologikLemmatizer;
+
+import org.junit.Test;
+
+ /**
+  * Tests for {@link MorfologikDictionayBuilder}: building a dictionary from
+  * the test resource and writing the dictionary properties file.
+  */
+ public class POSDictionayBuilderTest extends TestCase {
+
+   /**
+    * Builds a dictionary from the test resource and checks that a lemmatizer
+    * can be created from the result.
+    */
+   @Test
+   public void testBuildDictionary() throws Exception {
+     MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+     File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
+         "/dictionaryWithLemma.txt").getFile());
+
+     File dictOutFile = File.createTempFile(
+         POSDictionayBuilderTest.class.getName(), ".dict");
+     // do not leave the generated dictionary behind in the temp directory
+     dictOutFile.deleteOnExit();
+
+     builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
+         true);
+
+     MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI()
+         .toURL());
+
+     assertNotNull(ml);
+   }
+
+   /**
+    * Checks that encoding, separator and the prefix/infix flags are written
+    * to the properties file for several flag combinations.
+    */
+   @Test
+   public void testPropertiesCreation() throws Exception {
+
+     Charset c = Charset.forName("iso-8859-1");
+     String sep = "_";
+     boolean pref = true;
+     boolean inf = true;
+     Properties p = createPropertiesHelper(c, sep, pref, inf);
+
+     assertEquals(c.name(), p.getProperty("fsa.dict.encoding"));
+     assertEquals(sep, p.getProperty("fsa.dict.separator"));
+     assertEquals(pref,
+         Boolean.parseBoolean(p.getProperty("fsa.dict.uses-prefixes")));
+     assertEquals(inf,
+         Boolean.parseBoolean(p.getProperty("fsa.dict.uses-infixes")));
+
+     pref = false;
+     inf = true;
+     p = createPropertiesHelper(c, sep, pref, inf);
+     assertEquals(pref,
+         Boolean.parseBoolean(p.getProperty("fsa.dict.uses-prefixes")));
+     assertEquals(inf,
+         Boolean.parseBoolean(p.getProperty("fsa.dict.uses-infixes")));
+
+     pref = true;
+     inf = false;
+     p = createPropertiesHelper(c, sep, pref, inf);
+     assertEquals(pref,
+         Boolean.parseBoolean(p.getProperty("fsa.dict.uses-prefixes")));
+     assertEquals(inf,
+         Boolean.parseBoolean(p.getProperty("fsa.dict.uses-infixes")));
+   }
+
+   /**
+    * Lets the builder write a properties file to a temporary file and loads
+    * it back. The stream is closed and the temporary file deleted even when
+    * writing or loading fails.
+    */
+   private Properties createPropertiesHelper(Charset c, String sep,
+       boolean pref, boolean inf) throws IOException {
+     MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+     File f = File.createTempFile(POSDictionayBuilderTest.class.getName(),
+         ".info");
+     try {
+       builder.createProperties(c, sep, pref, inf, f);
+
+       InputStream is = new FileInputStream(f);
+       try {
+         Properties prop = new Properties();
+         prop.load(is);
+         return prop;
+       } finally {
+         is.close();
+       }
+     } finally {
+       f.delete();
+     }
+   }
+
+ }
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
new file mode 100644
index 0000000..6fd6ec1
--- /dev/null
+++ b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
@@ -0,0 +1,46 @@
+package opennlp.morfologik.lemmatizer;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.nio.charset.Charset;
+
+import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+
+import org.junit.Test;
+
+ /**
+  * Tests lemma lookup with a {@link MorfologikLemmatizer} built from the
+  * test dictionary resource.
+  */
+ public class MorfologikLemmatizerTest {
+
+   @Test
+   public void testLemmatizeInsensitive() throws Exception {
+     DictionaryLemmatizer lemmatizer = createDictionary(false);
+
+     // the same word form maps to different lemmas depending on the tag
+     assertEquals("casar", lemmatizer.lemmatize("casa", "V"));
+     assertEquals("casa", lemmatizer.lemmatize("casa", "NOUN"));
+
+     assertEquals("casa", lemmatizer.lemmatize("Casa", "PROP"));
+
+   }
+
+   /**
+    * Builds a binary dictionary from the test resource into a temporary file
+    * and creates a lemmatizer on top of it.
+    *
+    * @param caseSensitive
+    *          NOTE(review): currently not used by this helper
+    */
+   private MorfologikLemmatizer createDictionary(boolean caseSensitive)
+       throws Exception {
+
+     File source = new File(POSDictionayBuilderTest.class.getResource(
+         "/dictionaryWithLemma.txt").getFile());
+
+     File target = File.createTempFile(
+         POSDictionayBuilderTest.class.getName(), ".dict");
+
+     new MorfologikDictionayBuilder().build(source, target,
+         Charset.forName("UTF-8"), "+", true, true);
+
+     return new MorfologikLemmatizer(target.toURI().toURL());
+   }
+
+ }
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
new file mode 100644
index 0000000..def97b6
--- /dev/null
+++ b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
@@ -0,0 +1,92 @@
+package opennlp.morfologik.tagdict;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.List;
+
+import morfologik.stemming.Dictionary;
+import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.morfologik.tagdict.MorfologikTagDictionary;
+import opennlp.tools.postag.TagDictionary;
+
+import org.junit.Test;
+
+public class MorfologikTagDictionaryTest {
+
+ @Test
+ public void testNoLemma() throws Exception {
+ MorfologikTagDictionary dict = createDictionary(false);
+
+ List<String> tags = Arrays.asList(dict.getTags("carro"));
+ assertEquals(1, tags.size());
+ assertTrue(tags.contains("NOUN"));
+
+ }
+
+ @Test
+ public void testPOSDictionaryInsensitive() throws Exception {
+ TagDictionary dict = createDictionary(false);
+
+ List<String> tags = Arrays.asList(dict.getTags("casa"));
+ assertEquals(2, tags.size());
+ assertTrue(tags.contains("NOUN"));
+ assertTrue(tags.contains("V"));
+
+ // this is the behavior of case insensitive dictionary
+ // if we search it using case insensitive, Casa as a proper noun
+ // should be lower case in the dictionary
+ tags = Arrays.asList(dict.getTags("Casa"));
+ assertEquals(2, tags.size());
+ assertTrue(tags.contains("NOUN"));
+ assertTrue(tags.contains("V"));
+
+ }
+
+ @Test
+ public void testPOSDictionarySensitive() throws Exception {
+ TagDictionary dict = createDictionary(true);
+
+ List<String> tags = Arrays.asList(dict.getTags("casa"));
+ assertEquals(2, tags.size());
+ assertTrue(tags.contains("NOUN"));
+ assertTrue(tags.contains("V"));
+
+ // this is the behavior of case insensitive dictionary
+ // if we search it using case insensitive, Casa as a proper noun
+ // should be lower case in the dictionary
+ tags = Arrays.asList(dict.getTags("Casa"));
+ assertEquals(1, tags.size());
+ assertTrue(tags.contains("PROP"));
+
+ }
+
+ private MorfologikTagDictionary createDictionary(boolean caseSensitive)
+ throws Exception {
+ return this.createDictionary(caseSensitive, null);
+ }
+
+ private MorfologikTagDictionary createDictionary(boolean caseSensitive,
+ List<String> constant) throws Exception {
+
+ MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+ File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
+ "/dictionaryWithLemma.txt").getFile());
+
+ File dictOutFile = File.createTempFile(
+ POSDictionayBuilderTest.class.getName(), ".dict");
+
+ builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
+ true);
+
+ MorfologikTagDictionary ml = new MorfologikTagDictionary(
+ Dictionary.read(dictOutFile.toURI().toURL()), caseSensitive);
+
+ return ml;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/test/resources/dictionaryWithLemma.txt
----------------------------------------------------------------------
diff --git a/src/test/resources/dictionaryWithLemma.txt b/src/test/resources/dictionaryWithLemma.txt
new file mode 100644
index 0000000..5ac7111
--- /dev/null
+++ b/src/test/resources/dictionaryWithLemma.txt
@@ -0,0 +1,10 @@
+casa casa NOUN
+casa casar V
+Casa Casa PROP
+casinha casa NOUN
+casona casa NOUN
+menina menino NOUN
+menino menino NOUN
+meninão menino NOUN
+menininho menino NOUN
+carro NOUN
[14/16] opennlp git commit: OPENNLP-622 Preparing to migrate
morfologik-addon to main repository
Posted by co...@apache.org.
OPENNLP-622 Preparing to migrate morfologik-addon to main repository
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/772f31ff
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/772f31ff
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/772f31ff
Branch: refs/heads/trunk
Commit: 772f31ffe764afb675670735be556796781bda8d
Parents: 0cced84
Author: William Colen <wi...@gmail.com>
Authored: Wed Nov 9 18:23:28 2016 -0200
Committer: William Colen <wi...@gmail.com>
Committed: Wed Nov 9 18:23:28 2016 -0200
----------------------------------------------------------------------
bin/morfologik-addon | 20 --
bin/morfologik-addon.bat | 21 --
opennlp-morfologik-addon/bin/morfologik-addon | 20 ++
.../bin/morfologik-addon.bat | 21 ++
opennlp-morfologik-addon/pom.xml | 109 +++++++++
.../src/main/assembly/bin.xml | 91 ++++++++
.../src/main/assembly/src.xml | 39 ++++
.../src/main/bin/morfologik-addon | 35 +++
.../src/main/bin/morfologik-addon.bat | 47 ++++
.../src/main/bin/opennlp-cp | 35 +++
.../builder/MorfologikDictionayBuilder.java | 103 +++++++++
.../java/opennlp/morfologik/cmdline/CLI.java | 164 +++++++++++++
.../MorfologikDictionaryBuilderParams.java | 57 +++++
.../MorfologikDictionaryBuilderTool.java | 62 +++++
.../builder/XMLDictionaryToTableParams.java | 45 ++++
.../builder/XMLDictionaryToTableTool.java | 127 ++++++++++
.../lemmatizer/MorfologikLemmatizer.java | 96 ++++++++
.../tagdict/MorfologikPOSTaggerFactory.java | 170 ++++++++++++++
.../tagdict/MorfologikTagDictionary.java | 90 ++++++++
.../opennlp/morfologik/util/MorfologikUtil.java | 36 +++
.../src/main/readme/LICENSE | 230 +++++++++++++++++++
.../src/main/readme/MORFOLOGIK-LICENSE | 28 +++
opennlp-morfologik-addon/src/main/readme/NOTICE | 11 +
.../builder/POSDictionayBuilderTest.java | 58 +++++
.../lemmatizer/MorfologikLemmatizerTest.java | 35 +++
.../tagdict/MorfologikTagDictionaryTest.java | 78 +++++++
.../tagdict/POSTaggerFactoryTest.java | 88 +++++++
.../src/test/resources/AnnotatedSentences.txt | 136 +++++++++++
.../src/test/resources/dictionaryWithLemma.info | 15 ++
.../src/test/resources/dictionaryWithLemma.txt | 11 +
pom.xml | 109 ---------
src/main/assembly/bin.xml | 91 --------
src/main/assembly/src.xml | 39 ----
src/main/bin/morfologik-addon | 35 ---
src/main/bin/morfologik-addon.bat | 47 ----
src/main/bin/opennlp-cp | 35 ---
.../builder/MorfologikDictionayBuilder.java | 103 ---------
.../java/opennlp/morfologik/cmdline/CLI.java | 164 -------------
.../MorfologikDictionaryBuilderParams.java | 57 -----
.../MorfologikDictionaryBuilderTool.java | 62 -----
.../builder/XMLDictionaryToTableParams.java | 45 ----
.../builder/XMLDictionaryToTableTool.java | 127 ----------
.../lemmatizer/MorfologikLemmatizer.java | 96 --------
.../tagdict/MorfologikPOSTaggerFactory.java | 170 --------------
.../tagdict/MorfologikTagDictionary.java | 90 --------
.../opennlp/morfologik/util/MorfologikUtil.java | 36 ---
src/main/readme/LICENSE | 230 -------------------
src/main/readme/MORFOLOGIK-LICENSE | 28 ---
src/main/readme/NOTICE | 11 -
.../builder/POSDictionayBuilderTest.java | 58 -----
.../lemmatizer/MorfologikLemmatizerTest.java | 35 ---
.../tagdict/MorfologikTagDictionaryTest.java | 78 -------
.../tagdict/POSTaggerFactoryTest.java | 88 -------
src/test/resources/AnnotatedSentences.txt | 136 -----------
src/test/resources/dictionaryWithLemma.info | 15 --
src/test/resources/dictionaryWithLemma.txt | 11 -
56 files changed, 2037 insertions(+), 2037 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/bin/morfologik-addon
----------------------------------------------------------------------
diff --git a/bin/morfologik-addon b/bin/morfologik-addon
deleted file mode 100755
index ccc635e..0000000
--- a/bin/morfologik-addon
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/sh
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-mvn -e -q exec:java "-Dexec.mainClass=opennlp.morfologik.cmdline.CLI" "-Dexec.args=$*"
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/bin/morfologik-addon.bat
----------------------------------------------------------------------
diff --git a/bin/morfologik-addon.bat b/bin/morfologik-addon.bat
deleted file mode 100644
index 26a4778..0000000
--- a/bin/morfologik-addon.bat
+++ /dev/null
@@ -1,21 +0,0 @@
-@ECHO OFF
-
-REM # Licensed to the Apache Software Foundation (ASF) under one
-REM # or more contributor license agreements. See the NOTICE file
-REM # distributed with this work for additional information
-REM # regarding copyright ownership. The ASF licenses this file
-REM # to you under the Apache License, Version 2.0 (the
-REM # "License"); you may not use this file except in compliance
-REM # with the License. You may obtain a copy of the License at
-REM #
-REM # http://www.apache.org/licenses/LICENSE-2.0
-REM #
-REM # Unless required by applicable law or agreed to in writing,
-REM # software distributed under the License is distributed on an
-REM #
-REM # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-REM # KIND, either express or implied. See the License for the
-REM # specific language governing permissions and limitations
-REM # under the License.
-
-mvn -e -q exec:java "-Dexec.mainClass=opennlp.morfologik.cmdline.CLI" "-Dexec.args=%*"
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/bin/morfologik-addon
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/bin/morfologik-addon b/opennlp-morfologik-addon/bin/morfologik-addon
new file mode 100755
index 0000000..ccc635e
--- /dev/null
+++ b/opennlp-morfologik-addon/bin/morfologik-addon
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+mvn -e -q exec:java "-Dexec.mainClass=opennlp.morfologik.cmdline.CLI" "-Dexec.args=$*"
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/bin/morfologik-addon.bat
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/bin/morfologik-addon.bat b/opennlp-morfologik-addon/bin/morfologik-addon.bat
new file mode 100644
index 0000000..26a4778
--- /dev/null
+++ b/opennlp-morfologik-addon/bin/morfologik-addon.bat
@@ -0,0 +1,21 @@
+@ECHO OFF
+
+REM # Licensed to the Apache Software Foundation (ASF) under one
+REM # or more contributor license agreements. See the NOTICE file
+REM # distributed with this work for additional information
+REM # regarding copyright ownership. The ASF licenses this file
+REM # to you under the Apache License, Version 2.0 (the
+REM # "License"); you may not use this file except in compliance
+REM # with the License. You may obtain a copy of the License at
+REM #
+REM # http://www.apache.org/licenses/LICENSE-2.0
+REM #
+REM # Unless required by applicable law or agreed to in writing,
+REM # software distributed under the License is distributed on an
+REM #
+REM # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+REM # KIND, either express or implied. See the License for the
+REM # specific language governing permissions and limitations
+REM # under the License.
+
+mvn -e -q exec:java "-Dexec.mainClass=opennlp.morfologik.cmdline.CLI" "-Dexec.args=%*"
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/pom.xml b/opennlp-morfologik-addon/pom.xml
new file mode 100644
index 0000000..56d0e47
--- /dev/null
+++ b/opennlp-morfologik-addon/pom.xml
@@ -0,0 +1,109 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>morfologik-addon</artifactId>
+ <version>1.0-SNAPSHOT</version>
+ <packaging>jar</packaging>
+ <name>Morfologik Addon</name>
+
+ <url>http://maven.apache.org</url>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.3.2</version>
+ <configuration>
+ <source>1.7</source>
+ <target>1.7</target>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>bundle-project-sources</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ <configuration>
+ <descriptors>
+ <descriptor>src/main/assembly/bin.xml</descriptor>
+ <descriptor>src/main/assembly/src.xml</descriptor>
+ </descriptors>
+ <!-- Tar package is only compatible with gnu tar,
+ many file have more than 100 chars.
+ Right now only javadoc files are too long.
+ -->
+ <tarLongFileMode>gnu</tarLongFileMode>
+
+ <finalName>apache-opennlp-morfologik-addon-${project.version}</finalName>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-antrun-plugin</artifactId>
+ <version>1.6</version>
+ <executions>
+ <execution>
+ <id>generate checksums for binary artifacts</id>
+ <goals><goal>run</goal></goals>
+ <phase>verify</phase>
+ <configuration>
+ <target>
+ <checksum algorithm="sha1" format="MD5SUM">
+ <fileset dir="${project.build.directory}">
+ <include name="*.zip" />
+ <include name="*.gz" />
+ </fileset>
+ </checksum>
+ <checksum algorithm="md5" format="MD5SUM">
+ <fileset dir="${project.build.directory}">
+ <include name="*.zip" />
+ <include name="*.gz" />
+ </fileset>
+ </checksum>
+ </target>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.carrot2</groupId>
+ <artifactId>morfologik-stemming</artifactId>
+ <version>2.1.0</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.carrot2</groupId>
+ <artifactId>morfologik-tools</artifactId>
+ <version>2.1.0</version>
+ <scope>compile</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools</artifactId>
+ <version>1.6.0</version>
+ </dependency>
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.8.1</version>
+ <scope>test</scope>
+ </dependency>
+
+ </dependencies>
+</project>
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/assembly/bin.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/assembly/bin.xml b/opennlp-morfologik-addon/src/main/assembly/bin.xml
new file mode 100644
index 0000000..ab4f6da
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/assembly/bin.xml
@@ -0,0 +1,91 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<assembly>
+ <id>bin</id>
+ <formats>
+ <format>tar.gz</format>
+ <format>zip</format>
+ <format>dir</format>
+ </formats>
+
+ <includeBaseDirectory>true</includeBaseDirectory>
+ <baseDirectory>/apache-opennlp-morfologik-addon-${project.version}</baseDirectory>
+
+ <dependencySets>
+ <dependencySet>
+ <scope>runtime</scope>
+ <unpack>false</unpack>
+ <useProjectArtifact>false</useProjectArtifact>
+ <fileMode>644</fileMode>
+ <directoryMode>755</directoryMode>
+ <outputDirectory>lib</outputDirectory>
+ <useTransitiveDependencies>true</useTransitiveDependencies>
+ </dependencySet>
+ </dependencySets>
+
+ <fileSets>
+ <fileSet>
+ <directory>src/main/readme</directory>
+ <outputDirectory></outputDirectory>
+ <fileMode>644</fileMode>
+ <directoryMode>755</directoryMode>
+ </fileSet>
+
+ <fileSet>
+ <directory>.</directory>
+ <outputDirectory></outputDirectory>
+ <filtered>true</filtered>
+ <fileMode>644</fileMode>
+ <directoryMode>755</directoryMode>
+ <includes>
+ <include>README</include>
+ <include>RELEASE_NOTES.html</include>
+ </includes>
+ </fileSet>
+
+ <fileSet>
+ <directory>target</directory>
+ <outputDirectory></outputDirectory>
+ <fileMode>644</fileMode>
+ <directoryMode>755</directoryMode>
+ <includes>
+ <include>issuesFixed/**</include>
+ </includes>
+ </fileSet>
+
+ <fileSet>
+ <directory>src/main/bin</directory>
+ <fileMode>755</fileMode>
+ <directoryMode>755</directoryMode>
+ <outputDirectory>bin</outputDirectory>
+ </fileSet>
+
+ <fileSet>
+ <directory>target</directory>
+ <outputDirectory>lib</outputDirectory>
+ <includes>
+ <include>morfologik-addon-*.jar</include>
+ </includes>
+ </fileSet>
+
+ </fileSets>
+</assembly>
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/assembly/src.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/assembly/src.xml b/opennlp-morfologik-addon/src/main/assembly/src.xml
new file mode 100644
index 0000000..cdcc9d3
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/assembly/src.xml
@@ -0,0 +1,39 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<assembly>
+ <id>src</id>
+ <formats>
+ <format>tar.gz</format>
+ <format>zip</format>
+ </formats>
+
+ <baseDirectory>/apache-opennlp-${project.version}-src</baseDirectory>
+
+ <fileSets>
+ <fileSet>
+ <directory>../</directory>
+ <outputDirectory></outputDirectory>
+ <excludes>
+ <exclude>**/target/**</exclude>
+ <exclude>**/.*/**</exclude>
+ <exclude>**/pom.xml.releaseBackup</exclude>
+ <exclude>**/release.properties</exclude>
+ </excludes>
+ </fileSet>
+ </fileSets>
+</assembly>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/bin/morfologik-addon
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/bin/morfologik-addon b/opennlp-morfologik-addon/src/main/bin/morfologik-addon
new file mode 100755
index 0000000..9b0faf9
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/bin/morfologik-addon
@@ -0,0 +1,35 @@
+#!/bin/sh
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Note: Do not output anything in this script file, any output
+# may be inadvertantly placed in any output files if
+# output redirection is used.
+
+if [ -z "$JAVACMD" ] ; then
+ if [ -n "$JAVA_HOME" ] ; then
+ JAVACMD="$JAVA_HOME/bin/java"
+ else
+ JAVACMD="`which java`"
+ fi
+fi
+
+# Might fail if $0 is a link
+OPENNLP_HOME=`dirname "$0"`/..
+
+$JAVACMD -Xmx1024m -cp "lib/*" opennlp.morfologik.cmdline.CLI $@
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/bin/morfologik-addon.bat
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/bin/morfologik-addon.bat b/opennlp-morfologik-addon/src/main/bin/morfologik-addon.bat
new file mode 100644
index 0000000..aeec31f
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/bin/morfologik-addon.bat
@@ -0,0 +1,47 @@
+@ECHO off
+
+REM # Licensed to the Apache Software Foundation (ASF) under one
+REM # or more contributor license agreements. See the NOTICE file
+REM # distributed with this work for additional information
+REM # regarding copyright ownership. The ASF licenses this file
+REM # to you under the Apache License, Version 2.0 (the
+REM # "License"); you may not use this file except in compliance
+REM # with the License. You may obtain a copy of the License at
+REM #
+REM # http://www.apache.org/licenses/LICENSE-2.0
+REM #
+REM # Unless required by applicable law or agreed to in writing,
+REM # software distributed under the License is distributed on an
+REM # # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+REM # KIND, either express or implied. See the License for the
+REM # specific language governing permissions and limitations
+REM # under the License.
+
+REM # Note: Do not output anything in this script file, any output
+REM # may be inadvertantly placed in any output files if
+REM # output redirection is used.
+SETLOCAL
+
+IF "%JAVA_CMD%" == "" (
+ IF "%JAVA_HOME%" == "" (
+ SET JAVA_CMD=java
+ ) ELSE (
+ REM # Keep JAVA_HOME to short-name without spaces
+ FOR %%A IN ("%JAVA_HOME%") DO SET JAVA_CMD=%%~sfA\bin\java
+ )
+)
+
+REM # Should work with Windows XP and greater. If not, specify the path to where it is installed.
+IF "%OPENNLP_HOME%" == "" (
+ SET OPENNLP_HOME=%~sp0..
+) ELSE (
+ REM # Keep OPENNLP_HOME to short-name without spaces
+ FOR %%A IN ("%OPENNLP_HOME%") DO SET OPENNLP_HOME=%%~sfA
+)
+
+REM # Get the library JAR file name (JIRA OPENNLP-554)
+FOR %%A IN ("%OPENNLP_HOME%\lib\*.jar") DO SET JAR_FILE=%%A
+
+%JAVA_CMD% -Xmx1024m -jar %JAR_FILE% %*
+
+ENDLOCAL
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/bin/opennlp-cp
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/bin/opennlp-cp b/opennlp-morfologik-addon/src/main/bin/opennlp-cp
new file mode 100755
index 0000000..dff0d12
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/bin/opennlp-cp
@@ -0,0 +1,35 @@
+#!/bin/sh
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Note: Do not output anything in this script file, any output
+# may be inadvertantly placed in any output files if
+# output redirection is used.
+
+if [ -z "$JAVACMD" ] ; then
+ if [ -n "$JAVA_HOME" ] ; then
+ JAVACMD="$JAVA_HOME/bin/java"
+ else
+ JAVACMD="`which java`"
+ fi
+fi
+
+# Might fail if $0 is a link
+OPENNLP_HOME=`dirname "$0"`/..
+
+$JAVACMD -Xmx1024m -cp "lib/*" opennlp.tools.cmdline.CLI $@
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
new file mode 100644
index 0000000..dbbca4d
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.builder;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.file.Path;
+import java.util.Properties;
+
+import morfologik.stemming.DictionaryMetadata;
+import morfologik.stemming.EncoderType;
+import morfologik.tools.DictCompile;
+
+/**
+ * Utility class to build Morfologik dictionaries from a tab separated values
+ * file. The first column is the word, the second its lemma and the third a POS
+ * tag. If there is no lemma information leave the second column empty.
+ */
+public class MorfologikDictionayBuilder {
+
+ /**
+ * Helper to compile a morphological dictionary automaton.
+ *
+ * @param input
+ * The input file (base,inflected,tag). An associated metadata
+ * (*.info) file must exist.
+ * @param overwrite
+ * Overwrite the output file if it exists.
+ * @param validate
+ * Validate input to make sure it makes sense.
+ * @param acceptBom
+ * Accept leading BOM bytes (UTF-8).
+ * @param acceptCr
+ * Accept CR bytes in input sequences (\r).
+ * @param ignoreEmpty
+ * Ignore empty lines in the input.
+ * @return the dictionary path
+ *
+ * @throws Exception
+ */
+ public Path build(Path input, boolean overwrite, boolean validate,
+ boolean acceptBom, boolean acceptCr, boolean ignoreEmpty)
+ throws Exception {
+
+ DictCompile compiler = new DictCompile(input, overwrite, validate,
+ acceptBom, acceptCr, ignoreEmpty);
+ compiler.call();
+
+
+ Path metadataPath = DictionaryMetadata
+ .getExpectedMetadataLocation(input);
+
+ return metadataPath.resolveSibling(
+ metadataPath.getFileName().toString().replaceAll(
+ "\\." + DictionaryMetadata.METADATA_FILE_EXTENSION + "$", ".dict"));
+ }
+
+ /**
+ * Helper to compile a morphological dictionary automaton using default
+ * parameters.
+ *
+ * @param input
+ * The input file (base,inflected,tag). An associated metadata
+ * (*.info) file must exist.
+ *
+ * @return the dictionary path
+ *
+ * @throws Exception
+ */
+ public Path build(Path input) throws Exception {
+
+ return build(input, true, true, false, false, false);
+
+ }
+
+ Properties createProperties(Charset encoding, String separator,
+ EncoderType encoderType) throws FileNotFoundException, IOException {
+
+ Properties properties = new Properties();
+ properties.setProperty("fsa.dict.separator", separator);
+ properties.setProperty("fsa.dict.encoding", encoding.name());
+ properties.setProperty("fsa.dict.encoder", encoderType.name());
+
+ return properties;
+
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java
new file mode 100644
index 0000000..f92d178
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline;
+
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.morfologik.cmdline.builder.MorfologikDictionaryBuilderTool;
+import opennlp.morfologik.cmdline.builder.XMLDictionaryToTableTool;
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineTool;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.TypedCmdLineTool;
+import opennlp.tools.util.Version;
+
+/**
+ * Command line entry point of the OpenNLP Morfologik addon. The first argument
+ * names the tool to run (optionally followed by {@code .format}); the
+ * remaining arguments are handed to that tool.
+ */
+public final class CLI {
+
+  /** Command name shown in the usage text. */
+  public static final String CMD = "opennlp-morfologik-addon";
+
+  /** Registered tools keyed by tool name, in registration order; unmodifiable. */
+  private static final Map<String, CmdLineTool> toolLookupMap;
+
+  static {
+    List<CmdLineTool> tools = new LinkedList<CmdLineTool>();
+
+    tools.add(new MorfologikDictionaryBuilderTool());
+    tools.add(new XMLDictionaryToTableTool());
+
+    Map<String, CmdLineTool> lookup = new LinkedHashMap<String, CmdLineTool>();
+    for (CmdLineTool tool : tools) {
+      lookup.put(tool.getName(), tool);
+    }
+
+    toolLookupMap = Collections.unmodifiableMap(lookup);
+  }
+
+  /**
+   * @return a set which contains all tool names
+   */
+  public static Set<String> getToolNames() {
+    return toolLookupMap.keySet();
+  }
+
+  /**
+   * Prints the version banner, the list of registered tools with their short
+   * descriptions aligned in one column, and a usage example.
+   */
+  private static void usage() {
+    System.out.print("OpenNLP Morfologik Addon "
+        + Version.currentVersion().toString() + ". ");
+    System.out.println("Usage: " + CMD + " TOOL");
+    System.out.println("where TOOL is one of:");
+
+    // Descriptions start four columns after the end of the longest tool name.
+    int longestName = 0;
+    for (String toolName : toolLookupMap.keySet()) {
+      longestName = Math.max(longestName, toolName.length());
+    }
+    int descriptionColumn = longestName + 4;
+
+    for (CmdLineTool tool : toolLookupMap.values()) {
+      String toolName = tool.getName();
+
+      System.out.print(" " + toolName);
+
+      for (int i = toolName.length(); i < descriptionColumn; i++) {
+        System.out.print(" ");
+      }
+
+      System.out.println(tool.getShortDescription());
+    }
+
+    System.out
+        .println("All tools print help when invoked with help parameter");
+
+    // Take the example tool from the registry so the example always names a
+    // tool that can actually be invoked.
+    if (!toolLookupMap.isEmpty()) {
+      String exampleTool = toolLookupMap.keySet().iterator().next();
+      System.out.println("Example: " + CMD + " " + exampleTool + " help");
+    }
+  }
+
+  /**
+   * Runs the tool named by {@code args[0]}. Without arguments the usage text
+   * is printed. A {@link TerminateToolException} raised while resolving or
+   * running the tool is reported on standard error and its code becomes the
+   * process exit code.
+   *
+   * @param args
+   *          tool name (optionally {@code name.format}) followed by the
+   *          arguments of that tool
+   */
+  @SuppressWarnings("rawtypes")
+  public static void main(String[] args) {
+
+    if (args.length == 0) {
+      usage();
+      System.exit(0);
+    }
+
+    String[] toolArguments = new String[args.length - 1];
+    System.arraycopy(args, 1, toolArguments, 0, toolArguments.length);
+
+    String toolName = args[0];
+
+    // A "tool.format" argument selects a stream format for typed tools.
+    String formatName = StreamFactoryRegistry.DEFAULT_FORMAT;
+    int idx = toolName.indexOf(".");
+    if (-1 < idx) {
+      formatName = toolName.substring(idx + 1);
+      toolName = toolName.substring(0, idx);
+    }
+    CmdLineTool tool = toolLookupMap.get(toolName);
+
+    try {
+      if (null == tool) {
+        throw new TerminateToolException(1, "Tool " + toolName + " is not found.");
+      }
+
+      // Print help when a tool that has parameters is called without any, or
+      // when help is requested explicitly.
+      boolean missingArguments = 0 == toolArguments.length && tool.hasParams();
+      boolean helpRequested =
+          0 < toolArguments.length && "help".equals(toolArguments[0]);
+      if (missingArguments || helpRequested) {
+        if (tool instanceof TypedCmdLineTool) {
+          System.out.println(((TypedCmdLineTool) tool).getHelp(formatName));
+        } else if (tool instanceof BasicCmdLineTool) {
+          System.out.println(tool.getHelp());
+        }
+
+        System.exit(0);
+      }
+
+      if (tool instanceof TypedCmdLineTool) {
+        ((TypedCmdLineTool) tool).run(formatName, toolArguments);
+      } else if (tool instanceof BasicCmdLineTool) {
+        if (-1 == idx) {
+          ((BasicCmdLineTool) tool).run(toolArguments);
+        } else {
+          throw new TerminateToolException(1, "Tool " + toolName + " does not support formats.");
+        }
+      } else {
+        throw new TerminateToolException(1, "Tool " + toolName + " is not supported.");
+      }
+    }
+    catch (TerminateToolException e) {
+
+      if (e.getMessage() != null) {
+        System.err.println(e.getMessage());
+      }
+
+      if (e.getCause() != null) {
+        System.err.println(e.getCause().getMessage());
+        e.getCause().printStackTrace(System.err);
+      }
+
+      System.exit(e.getCode());
+    }
+  }
+
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
new file mode 100644
index 0000000..5ea2e4f
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.params.EncodingParameter;
+
+/**
+ * Command line parameters of the Morfologik dictionary builder tool. Each
+ * getter is described by its {@link ParameterDescription}; optional
+ * parameters carry their default in {@link OptionalParameter}.
+ */
+interface MorfologikDictionaryBuilderParams extends EncodingParameter {
+
+  @ParameterDescription(valueName = "in", description = "The input file (base,inflected,tag). An associated metadata (*.info) file must exist.")
+  File getInputFile();
+
+  @ParameterDescription(valueName = "true|false", description = "Accept leading BOM bytes (UTF-8).")
+  @OptionalParameter(defaultValue="false")
+  Boolean getAcceptBOM();
+
+  // The backslash is escaped so the help text shows the two characters "\r"
+  // rather than embedding a raw carriage return.
+  @ParameterDescription(valueName = "true|false", description = "Accept CR bytes in input sequences (\\r).")
+  @OptionalParameter(defaultValue="false")
+  Boolean getAcceptCR();
+
+  // NOTE(review): MorfologikDictionaryBuilderTool.run does not read this
+  // value - confirm whether it is still meant to be supported.
+  @ParameterDescription(valueName = "FSA5|CFSA2", description = "Automaton serialization format.")
+  @OptionalParameter(defaultValue="FSA5")
+  String getFormat();
+
+  @ParameterDescription(valueName = "true|false", description = "Ignore empty lines in the input.")
+  @OptionalParameter(defaultValue="false")
+  Boolean getIgnoreEmpty();
+
+  @ParameterDescription(valueName = "true|false", description = "Overwrite the output file if it exists.")
+  @OptionalParameter(defaultValue="false")
+  Boolean getOverwrite();
+
+  @ParameterDescription(valueName = "true|false", description = "Validate input to make sure it makes sense.")
+  @OptionalParameter(defaultValue="false")
+  Boolean getValidate();
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
new file mode 100644
index 0000000..eb9b51c
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+import java.nio.file.Path;
+
+import morfologik.stemming.DictionaryMetadata;
+import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+
+/**
+ * Command line tool that compiles a Morfologik binary dictionary from an
+ * input table and its associated metadata (.info) file.
+ */
+public class MorfologikDictionaryBuilderTool extends BasicCmdLineTool {
+
+  interface Params extends MorfologikDictionaryBuilderParams {
+  }
+
+  @Override
+  public String getShortDescription() {
+    return "builds a binary POS Dictionary using Morfologik";
+  }
+
+  @Override
+  public String getHelp() {
+    return getBasicHelp(Params.class);
+  }
+
+  /**
+   * Parses the arguments, checks that both the input file and its metadata
+   * file are usable as input, and runs the dictionary builder.
+   *
+   * @param args
+   *          the tool arguments, see {@link MorfologikDictionaryBuilderParams}
+   * @throws TerminateToolException
+   *           if the builder fails; the original exception is kept as cause
+   */
+  @Override
+  public void run(String[] args) {
+    Params params = validateAndParseParams(args, Params.class);
+
+    File dictInFile = params.getInputFile();
+    Path dictInPath = dictInFile.toPath();
+
+    CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
+    Path metadataPath = DictionaryMetadata.getExpectedMetadataLocation(dictInPath);
+    CmdLineUtil.checkInputFile("dictionary metadata (.info) input file", metadataPath.toFile());
+
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+    try {
+      builder.build(dictInPath, params.getOverwrite(),
+          params.getValidate(), params.getAcceptBOM(), params.getAcceptCR(),
+          params.getIgnoreEmpty());
+    } catch (Exception e) {
+      throw new TerminateToolException(-1,
+          "Error while creating Morfologik POS Dictionary: " + e.getMessage(), e);
+    }
+
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
new file mode 100644
index 0000000..4ee8cd4
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.params.EncodingParameter;
+
+/**
+ * Command line parameters of the tool that converts an OpenNLP XML tag
+ * dictionary into a Morfologik input table.
+ */
+interface XMLDictionaryToTableParams extends EncodingParameter {
+
+  @ParameterDescription(valueName = "in", description = "OpenNLP XML Tag Dictionary.")
+  File getInputFile();
+
+  @ParameterDescription(valueName = "out", description = "Output for Morfologik (.info will be also created).")
+  File getOutputFile();
+
+  @ParameterDescription(valueName = "char", description = "Column separator (must be a single character)")
+  @OptionalParameter(defaultValue=",")
+  String getSeparator();
+
+  @ParameterDescription(valueName = "value", description = "Type of lemma-inflected form encoding compression that precedes automaton construction. Allowed values: [suffix, infix, prefix, none].")
+  @OptionalParameter(defaultValue="prefix")
+  String getEncoder();
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
new file mode 100644
index 0000000..0e7f2d5
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Iterator;
+import java.util.Properties;
+
+import morfologik.stemming.DictionaryMetadata;
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.postag.POSDictionary;
+
+/**
+ * Command line tool that reads an OpenNLP XML tag dictionary and writes it as
+ * a separator delimited table (empty base form, word, tag) together with the
+ * metadata (.info) file Morfologik expects next to it.
+ */
+public class XMLDictionaryToTableTool extends BasicCmdLineTool {
+
+  interface Params extends XMLDictionaryToTableParams {
+  }
+
+  /** Column separator of the current run; assigned at the start of {@link #run}. */
+  private String separator;
+
+  @Override
+  public String getShortDescription() {
+    return "reads an OpenNLP XML tag dictionary and outputs it in a separator delimited table file";
+  }
+
+  @Override
+  public String getHelp() {
+    return getBasicHelp(Params.class);
+  }
+
+  /**
+   * Converts the XML tag dictionary given in the arguments into a table file
+   * and writes the matching metadata file.
+   *
+   * @param args
+   *          the tool arguments, see {@link XMLDictionaryToTableParams}
+   * @throws TerminateToolException
+   *           if the separator is not a single character, or if reading the
+   *           dictionary or writing either output file fails
+   */
+  @Override
+  public void run(String[] args) {
+    Params params = validateAndParseParams(args, Params.class);
+
+    File dictInFile = params.getInputFile();
+    File dictOutFile = params.getOutputFile();
+    Charset encoding = params.getEncoding();
+    separator = params.getSeparator();
+
+    // The parameter contract requires a single character; fail early instead
+    // of producing a table and metadata that disagree with that contract.
+    if (separator == null || separator.length() != 1) {
+      throw new TerminateToolException(1,
+          "The column separator must be a single character, but was: " + separator);
+    }
+
+    CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
+    CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
+
+    POSDictionary tagDictionary;
+    // try-with-resources: the input stream was previously never closed.
+    try (FileInputStream dictIn = new FileInputStream(dictInFile)) {
+      tagDictionary = POSDictionary.create(dictIn);
+    } catch (IOException e) {
+      throw new TerminateToolException(-1,
+          "Error while loading XML POS Dictionary: " + e.getMessage(), e);
+    }
+
+    try (BufferedWriter writer = Files.newBufferedWriter(dictOutFile.toPath(),
+        encoding)) {
+      Iterator<String> words = tagDictionary.iterator();
+      while (words.hasNext()) {
+        String word = words.next();
+        for (String tag : tagDictionary.getTags(word)) {
+          if (valid(word, tag)) {
+            writer.write(createEntry(word, tag));
+            writer.newLine();
+          }
+        }
+      }
+    } catch (IOException e) {
+      throw new TerminateToolException(-1, "Error while writing output: "
+          + e.getMessage(), e);
+    }
+    // Reported only after the writer has been flushed and closed successfully.
+    System.out.println("Created dictionary: " + dictOutFile.toPath());
+
+    Properties info = new Properties();
+    info.setProperty("fsa.dict.separator", separator);
+    info.setProperty("fsa.dict.encoding", encoding.name());
+    info.setProperty("fsa.dict.encoder", params.getEncoder());
+
+    Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictOutFile.toPath());
+
+    // Properties.store leaves the stream open, so it is closed here.
+    try (java.io.OutputStream metaOut = Files.newOutputStream(metaPath)) {
+      info.store(metaOut, "Info file for FSA Morfologik dictionary.");
+    } catch (IOException e) {
+      throw new TerminateToolException(-1, "Error while writing metadata output: "
+          + e.getMessage(), e);
+    }
+    System.out.println("Created metadata: " + metaPath);
+
+  }
+
+  /**
+   * Checks that neither the word nor the tag contains the column separator,
+   * printing a warning for rejected entries.
+   *
+   * @return true if the entry can be written to the table
+   */
+  private boolean valid(String word, String tag) {
+    if (word.contains(separator) || tag.contains(separator)) {
+      System.out
+          .println("Warn: invalid entry because contains separator - word: "
+              + word + " tag: " + tag);
+      return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * Formats one table line: an empty base column, the word and the tag,
+   * joined by the separator.
+   */
+  private String createEntry(String word, String tag) {
+    return "" + separator // base (left empty)
+        + word + separator
+        + tag;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
new file mode 100644
index 0000000..2090ce5
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.lemmatizer;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.IStemmer;
+import morfologik.stemming.WordData;
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+
+/**
+ * Lemmatizer backed by a Morfologik dictionary. A lemma is looked up by the
+ * pair (word, POS tag); words are lower-cased for the lookup unless the tag
+ * is one of {@link #constantTags}.
+ */
+public class MorfologikLemmatizer implements DictionaryLemmatizer {
+
+  private IStemmer dictLookup;
+
+  /** POS tags whose words are looked up, and returned, with their case unchanged. */
+  public final Set<String> constantTags = new HashSet<String>(Arrays.asList(
+      "NNP", "NP00000"));
+
+  /**
+   * Creates a lemmatizer reading the Morfologik dictionary at the given path.
+   *
+   * @param dictionaryPath
+   *          location of the dictionary, passed to {@code Dictionary.read}
+   * @throws IllegalArgumentException
+   *           declared for callers; raised by the underlying Morfologik calls
+   * @throws IOException
+   *           if the dictionary cannot be read
+   */
+  public MorfologikLemmatizer(Path dictionaryPath) throws IllegalArgumentException,
+      IOException {
+    dictLookup = new DictionaryLookup(Dictionary.read(dictionaryPath));
+  }
+
+  /**
+   * Looks the word up and maps every [word, tag] pair found to its stem.
+   */
+  private HashMap<List<String>, String> getLemmaTagsDict(String word) {
+    List<WordData> wdList = dictLookup.lookup(word);
+    HashMap<List<String>, String> dictMap = new HashMap<List<String>, String>();
+    for (WordData wd : wdList) {
+      CharSequence tag = wd.getTag();
+      CharSequence stem = wd.getStem();
+      // NOTE(review): WordData is documented to return null when an entry has
+      // no tag or stem; such entries cannot match a (word, tag) key - confirm.
+      if (tag == null || stem == null) {
+        continue;
+      }
+      List<String> wordLemmaTags = new ArrayList<String>();
+      wordLemmaTags.add(word);
+      wordLemmaTags.add(tag.toString());
+      dictMap.put(wordLemmaTags, stem.toString());
+    }
+    return dictMap;
+  }
+
+  /**
+   * Returns the form of the word used for the dictionary lookup: unchanged
+   * for constant tags, lower-cased otherwise.
+   */
+  private String lookupForm(String word, String postag) {
+    return constantTags.contains(postag) ? word : word.toLowerCase();
+  }
+
+  /** Builds the [lookup word, tag] key under which a lemma is stored. */
+  private List<String> getDictKeys(String word, String postag) {
+    List<String> keys = new ArrayList<String>();
+    keys.addAll(Arrays.asList(lookupForm(word, postag), postag));
+    return keys;
+  }
+
+  /** Fetches all [word, tag] to lemma entries for the lookup form of the word. */
+  private HashMap<List<String>, String> getDictMap(String word, String postag) {
+    return this.getLemmaTagsDict(lookupForm(word, postag));
+  }
+
+  /**
+   * Returns the lemma of the word for the given POS tag.
+   *
+   * @param word
+   *          the surface form
+   * @param postag
+   *          the POS tag of the word
+   * @return the dictionary lemma if one is found; otherwise the word itself
+   *         when the tag is a constant tag or the word is all upper case, and
+   *         the lower-cased word in every other case
+   */
+  public String lemmatize(String word, String postag) {
+    List<String> keys = this.getDictKeys(word, postag);
+    String lemma = this.getDictMap(word, postag).get(keys);
+    if (lemma != null) {
+      return lemma;
+    }
+    if (constantTags.contains(postag)) {
+      return word;
+    }
+    // Compare by value: a reference comparison only works if toUpperCase()
+    // happens to hand back the very same String instance.
+    if (word.toUpperCase().equals(word)) {
+      return word;
+    }
+    return word.toLowerCase();
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
new file mode 100644
index 0000000..93d6c61
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.tagdict;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Map;
+
+import morfologik.stemming.DictionaryMetadata;
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.postag.POSTaggerFactory;
+import opennlp.tools.postag.TagDictionary;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.ArtifactSerializer;
+import opennlp.tools.util.model.ModelUtil;
+
+public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
+
+ private static final String MORFOLOGIK_POSDICT_SUF = "morfologik_dict";
+ private static final String MORFOLOGIK_DICT_INFO_SUF = "morfologik_info";
+
+ private static final String MORFOLOGIK_POSDICT = "tagdict."
+ + MORFOLOGIK_POSDICT_SUF;
+ private static final String MORFOLOGIK_DICT_INFO = "tagdict."
+ + MORFOLOGIK_DICT_INFO_SUF;
+
+ private TagDictionary dict;
+
+ private byte[] dictInfo;
+ private byte[] dictData;
+
+ public MorfologikPOSTaggerFactory() {
+ }
+
+ public TagDictionary createTagDictionary(File dictionary)
+ throws InvalidFormatException, FileNotFoundException, IOException {
+
+ if(!dictionary.canRead()) {
+ throw new FileNotFoundException("Could not read dictionary: " + dictionary.getAbsolutePath());
+ }
+
+ Path dictionaryMeta = DictionaryMetadata.getExpectedMetadataLocation(dictionary.toPath());
+
+ if(dictionaryMeta == null || !dictionaryMeta.toFile().canRead()) {
+ throw new FileNotFoundException("Could not read dictionary metadata: " + dictionaryMeta.getFileName());
+ }
+
+ this.dictData = Files.readAllBytes(dictionary.toPath());
+ this.dictInfo = Files.readAllBytes(dictionaryMeta);
+
+ return createMorfologikDictionary(dictData, dictInfo);
+
+ }
+
+
+ @Override
+ protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) {
+ super.init(ngramDictionary, null);
+ this.dict = posDictionary;
+ }
+
+ @Override
+ public TagDictionary getTagDictionary() {
+ if (this.dict == null) {
+
+ if (artifactProvider != null) {
+ Object obj = artifactProvider.getArtifact(MORFOLOGIK_POSDICT);
+ if (obj != null) {
+ byte[] data = (byte[]) artifactProvider
+ .getArtifact(MORFOLOGIK_POSDICT);
+ byte[] info = (byte[]) artifactProvider
+ .getArtifact(MORFOLOGIK_DICT_INFO);
+
+ try {
+ this.dict = createMorfologikDictionary(data, info);
+ } catch (IllegalArgumentException e) {
+ throw new RuntimeException(
+ "Could not load the dictionary files to Morfologik.", e);
+ } catch (IOException e) {
+ throw new RuntimeException(
+ "IO error while reading the Morfologik dictionary files.", e);
+ }
+ }
+ }
+ }
+
+ return this.dict;
+ }
+
+ @Override
+ public void setTagDictionary(TagDictionary dictionary) {
+ this.dict = dictionary;
+ }
+
+ @Override
+ public TagDictionary createEmptyTagDictionary() {
+ throw new UnsupportedOperationException(
+ "Morfologik POS Tagger factory does not support this operation");
+ }
+
+ @Override
+ public TagDictionary createTagDictionary(InputStream in)
+ throws InvalidFormatException, IOException {
+ throw new UnsupportedOperationException(
+ "Morfologik POS Tagger factory does not support this operation");
+ }
+
+ @Override
+ @SuppressWarnings("rawtypes")
+ public Map<String, ArtifactSerializer> createArtifactSerializersMap() {
+ Map<String, ArtifactSerializer> serializers = super
+ .createArtifactSerializersMap();
+
+ serializers.put(MORFOLOGIK_POSDICT_SUF, new ByteArraySerializer());
+ serializers.put(MORFOLOGIK_DICT_INFO_SUF, new ByteArraySerializer());
+
+ return serializers;
+ }
+
+ @Override
+ public Map<String, Object> createArtifactMap() {
+ Map<String, Object> artifactMap = super.createArtifactMap();
+ artifactMap.put(MORFOLOGIK_POSDICT, this.dictData);
+ artifactMap.put(MORFOLOGIK_DICT_INFO, this.dictInfo);
+ return artifactMap;
+ }
+
+ private TagDictionary createMorfologikDictionary(byte[] data, byte[] info)
+ throws IOException {
+ morfologik.stemming.Dictionary dict = morfologik.stemming.Dictionary
+ .read(new ByteArrayInputStream(data), new ByteArrayInputStream(
+ info));
+ return new MorfologikTagDictionary(dict);
+ }
+
+ static class ByteArraySerializer implements ArtifactSerializer<byte[]> {
+
+ public byte[] create(InputStream in) throws IOException,
+ InvalidFormatException {
+
+ return ModelUtil.read(in);
+ }
+
+ public void serialize(byte[] artifact, OutputStream out) throws IOException {
+ out.write(artifact);
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
new file mode 100644
index 0000000..b34ca2b
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.tagdict;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.IStemmer;
+import morfologik.stemming.WordData;
+import opennlp.tools.postag.TagDictionary;
+
+/**
+ * A POS Tagger dictionary implementation based on Morfologik binary
+ * dictionaries
+ */
+public class MorfologikTagDictionary implements TagDictionary {
+
+  private final IStemmer dictLookup;
+  private final boolean isCaseSensitive;
+
+  /**
+   * Creates a case sensitive {@link MorfologikTagDictionary}
+   *
+   * @param dict
+   *          a Morfologik FSA dictionary
+   * @throws IllegalArgumentException
+   *           if FSA's root node cannot be acquired (dictionary is empty).
+   * @throws IOException
+   *           declared for callers; no I/O is performed by this constructor
+   *           itself
+   */
+  public MorfologikTagDictionary(Dictionary dict)
+      throws IllegalArgumentException, IOException {
+    this(dict, true);
+  }
+
+  /**
+   * Creates a {@link MorfologikTagDictionary}
+   *
+   * @param dict
+   *          a Morfologik FSA dictionary
+   * @param caseSensitive
+   *          if true it performs case sensitive lookup; if false the word is
+   *          lower-cased before the lookup
+   * @throws IllegalArgumentException
+   *           if FSA's root node cannot be acquired (dictionary is empty).
+   * @throws IOException
+   *           declared for callers; no I/O is performed by this constructor
+   *           itself
+   */
+  public MorfologikTagDictionary(Dictionary dict, boolean caseSensitive)
+      throws IllegalArgumentException, IOException {
+    this.dictLookup = new DictionaryLookup(dict);
+    this.isCaseSensitive = caseSensitive;
+  }
+
+  /**
+   * Returns the tags the dictionary lists for the word.
+   *
+   * @param word
+   *          the word to look up
+   * @return the tags in lookup order, or null if the word has no tags
+   */
+  @Override
+  public String[] getTags(String word) {
+    String lookupWord = isCaseSensitive ? word : word.toLowerCase();
+
+    List<WordData> data = dictLookup.lookup(lookupWord);
+    if (data == null || data.isEmpty()) {
+      return null;
+    }
+
+    List<String> tags = new ArrayList<String>(data.size());
+    for (WordData entry : data) {
+      CharSequence tag = entry.getTag();
+      // NOTE(review): WordData is documented to return null for entries
+      // without tag data; those are skipped instead of failing - confirm.
+      if (tag != null) {
+        tags.add(tag.toString());
+      }
+    }
+
+    return tags.isEmpty() ? null : tags.toArray(new String[tags.size()]);
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/util/MorfologikUtil.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
new file mode 100644
index 0000000..bd4d1a4
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.util;
+
+import java.io.File;
+
+import morfologik.stemming.DictionaryMetadata;
+
+public class MorfologikUtil {
+
+ public static File getExpectedPropertiesFile(File dictFile) {
+ return DictionaryMetadata.getExpectedMetadataLocation(dictFile.toPath())
+ .toFile();
+ }
+
+ public static File getExpectedPropertiesFile(String dictFile) {
+ File f = new File(dictFile);
+ return getExpectedPropertiesFile(f);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/readme/LICENSE
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/readme/LICENSE b/opennlp-morfologik-addon/src/main/readme/LICENSE
new file mode 100644
index 0000000..576b4cf
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/readme/LICENSE
@@ -0,0 +1,230 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+The following license applies to the Snowball stemmers:
+
+ Copyright (c) 2001, Dr Martin Porter
+ Copyright (c) 2002, Richard Boulton
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holders nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/readme/MORFOLOGIK-LICENSE
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/readme/MORFOLOGIK-LICENSE b/opennlp-morfologik-addon/src/main/readme/MORFOLOGIK-LICENSE
new file mode 100644
index 0000000..0554010
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/readme/MORFOLOGIK-LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2006 Dawid Weiss
+Copyright (c) 2007-2015 Dawid Weiss, Marcin Miłkowski
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ * Neither the name of Morfologik nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/readme/NOTICE
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/readme/NOTICE b/opennlp-morfologik-addon/src/main/readme/NOTICE
new file mode 100644
index 0000000..73fb1d7
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/readme/NOTICE
@@ -0,0 +1,11 @@
+Apache OpenNLP
+Copyright 2010, 2013 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+The snowball stemmers in
+opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball
+were developed by Martin Porter and Richard Boulton.
+The full snowball package is available from
+http://snowball.tartarus.org/
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
new file mode 100644
index 0000000..0a7ba48
--- /dev/null
+++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.builder;
+
+import java.io.File;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+
+import junit.framework.TestCase;
+import morfologik.stemming.DictionaryMetadata;
+import opennlp.morfologik.lemmatizer.MorfologikLemmatizer;
+
+import org.junit.Test;
+
+public class POSDictionayBuilderTest extends TestCase {
+
+ @Test
+ public void testBuildDictionary() throws Exception {
+
+ Path output = createMorfologikDictionary();
+
+ MorfologikLemmatizer ml = new MorfologikLemmatizer(output);
+
+ assertNotNull(ml);
+ }
+
+ public static Path createMorfologikDictionary() throws Exception {
+ Path tabFilePath = File.createTempFile(
+ POSDictionayBuilderTest.class.getName(), ".txt").toPath();
+ Path infoFilePath = DictionaryMetadata.getExpectedMetadataLocation(tabFilePath);
+
+ Files.copy(POSDictionayBuilderTest.class.getResourceAsStream(
+ "/dictionaryWithLemma.txt"), tabFilePath, StandardCopyOption.REPLACE_EXISTING);
+ Files.copy(POSDictionayBuilderTest.class.getResourceAsStream(
+ "/dictionaryWithLemma.info"), infoFilePath, StandardCopyOption.REPLACE_EXISTING);
+
+ MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+
+ return builder.build(tabFilePath);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
new file mode 100644
index 0000000..6b7525e
--- /dev/null
+++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
@@ -0,0 +1,35 @@
+package opennlp.morfologik.lemmatizer;
+
+import static org.junit.Assert.assertEquals;
+
+import java.nio.file.Path;
+
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+
+import org.junit.Test;
+
+public class MorfologikLemmatizerTest {
+
+ @Test
+ public void testLemmatizeInsensitive() throws Exception {
+ DictionaryLemmatizer dict = createDictionary(false);
+
+ assertEquals("casar", dict.lemmatize("casa", "V"));
+ assertEquals("casa", dict.lemmatize("casa", "NOUN"));
+
+ assertEquals("casa", dict.lemmatize("Casa", "PROP"));
+
+ }
+
+ private MorfologikLemmatizer createDictionary(boolean caseSensitive)
+ throws Exception {
+
+ Path output = POSDictionayBuilderTest.createMorfologikDictionary();
+
+ MorfologikLemmatizer ml = new MorfologikLemmatizer(output);
+
+ return ml;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
new file mode 100644
index 0000000..c6c9e04
--- /dev/null
+++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
@@ -0,0 +1,78 @@
+package opennlp.morfologik.tagdict;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.util.Arrays;
+import java.util.List;
+
+import morfologik.stemming.Dictionary;
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.tools.postag.TagDictionary;
+
+import org.junit.Test;
+
+public class MorfologikTagDictionaryTest {
+
+ @Test
+ public void testNoLemma() throws Exception {
+ MorfologikTagDictionary dict = createDictionary(false);
+
+ List<String> tags = Arrays.asList(dict.getTags("carro"));
+ assertEquals(1, tags.size());
+ assertTrue(tags.contains("NOUN"));
+
+ }
+
+ @Test
+ public void testPOSDictionaryInsensitive() throws Exception {
+ TagDictionary dict = createDictionary(false);
+
+ List<String> tags = Arrays.asList(dict.getTags("casa"));
+ assertEquals(2, tags.size());
+ assertTrue(tags.contains("NOUN"));
+ assertTrue(tags.contains("V"));
+
+ // this is the behavior of a case insensitive dictionary:
+ // the query is lower-cased before lookup, so for Casa to match as a
+ // proper noun it would have to be stored in lower case in the dictionary
+ tags = Arrays.asList(dict.getTags("Casa"));
+ assertEquals(2, tags.size());
+ assertTrue(tags.contains("NOUN"));
+ assertTrue(tags.contains("V"));
+
+ }
+
+ @Test
+ public void testPOSDictionarySensitive() throws Exception {
+ TagDictionary dict = createDictionary(true);
+
+ List<String> tags = Arrays.asList(dict.getTags("casa"));
+ assertEquals(2, tags.size());
+ assertTrue(tags.contains("NOUN"));
+ assertTrue(tags.contains("V"));
+
+ // this is the behavior of a case sensitive dictionary:
+ // Casa is looked up exactly as written, so only the proper noun
+ // entry matches, not the lower case casa entries
+ tags = Arrays.asList(dict.getTags("Casa"));
+ assertEquals(1, tags.size());
+ assertTrue(tags.contains("PROP"));
+
+ }
+
+ private MorfologikTagDictionary createDictionary(boolean caseSensitive)
+ throws Exception {
+ return this.createDictionary(caseSensitive, null);
+ }
+
+ private MorfologikTagDictionary createDictionary(boolean caseSensitive,
+ List<String> constant) throws Exception {
+
+ Dictionary dic = Dictionary.read(POSDictionayBuilderTest.createMorfologikDictionary());
+ MorfologikTagDictionary ml = new MorfologikTagDictionary(dic, caseSensitive);
+
+ return ml;
+ }
+
+}
[13/16] opennlp git commit: OPENNLP-622 Preparing to migrate
morfologik-addon to main repository
Posted by co...@apache.org.
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
new file mode 100644
index 0000000..7341a02
--- /dev/null
+++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.tagdict;
+
+import static org.junit.Assert.*;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.file.Path;
+
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.postag.POSTaggerFactory;
+import opennlp.tools.postag.POSTaggerME;
+import opennlp.tools.postag.TagDictionary;
+import opennlp.tools.postag.WordTagSampleStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.model.ModelType;
+
+import org.junit.Test;
+
+/**
+ * Tests for the {@link POSTaggerFactory} class.
+ */
+public class POSTaggerFactoryTest {
+
+ private static ObjectStream<POSSample> createSampleStream()
+ throws IOException {
+ InputStream in = POSTaggerFactoryTest.class.getClassLoader()
+ .getResourceAsStream("AnnotatedSentences.txt");
+
+ return new WordTagSampleStream((new InputStreamReader(in)));
+ }
+
+ static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory)
+ throws IOException {
+ return POSTaggerME.train("en", createSampleStream(),
+ TrainingParameters.defaultParams(), factory);
+ }
+
+ @Test
+ public void testPOSTaggerWithCustomFactory() throws Exception {
+
+ Path dictionary = POSDictionayBuilderTest.createMorfologikDictionary();
+ POSTaggerFactory inFactory = new MorfologikPOSTaggerFactory();
+ TagDictionary inDict = inFactory.createTagDictionary(dictionary.toFile());
+ inFactory.setTagDictionary(inDict);
+
+ POSModel posModel = trainPOSModel(ModelType.MAXENT, inFactory);
+
+ POSTaggerFactory factory = posModel.getFactory();
+ assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary);
+
+ factory = null;
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ posModel.serialize(out);
+ ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
+
+ POSModel fromSerialized = new POSModel(in);
+
+ factory = fromSerialized.getFactory();
+ assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary);
+
+ assertEquals(2, factory.getTagDictionary().getTags("casa").length);
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/resources/AnnotatedSentences.txt
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/resources/AnnotatedSentences.txt b/opennlp-morfologik-addon/src/test/resources/AnnotatedSentences.txt
new file mode 100644
index 0000000..b40be87
--- /dev/null
+++ b/opennlp-morfologik-addon/src/test/resources/AnnotatedSentences.txt
@@ -0,0 +1,136 @@
+Last_JJ September_NNP ,_, I_PRP tried_VBD to_TO find_VB out_RP the_DT address_NN of_IN an_DT old_JJ school_NN friend_NN whom_WP I_PRP had_VBD not_RB seen_VBN for_IN 15_CD years_NNS ._.
+I_PRP just_RB knew_VBD his_PRP$ name_NN ,_, Alan_NNP McKennedy_NNP ,_, and_CC I_PRP 'd_MD heard_VBD the_DT rumour_NN that_IN he_PRP 'd_MD moved_VBD to_TO Scotland_NNP ,_, the_DT country_NN of_IN his_PRP$ ancestors_NNS ._.
+So_IN I_PRP called_VBD Julie_NNP ,_, a_DT friend_NN who's_WDT still_RB in_IN contact_NN with_IN him_PRP ._.
+She_PRP told_VBD me_PRP that_IN he_PRP lived_VBD in_IN 23213_CD Edinburgh_NNP ,_, Worcesterstreet_NNP 12_CD ._.
+I_PRP wrote_VBD him_PRP a_DT letter_NN right_RB away_RB and_CC he_PRP answered_VBD soon_RB ,_, sounding_VBG very_RB happy_JJ and_CC delighted_JJ ._.
+
+Last_JJ year_NN ,_, I_PRP wanted_VBD to_TO write_VB a_DT letter_NN to_TO my_PRP$ grandaunt_NN ._.
+Her_PRP$ 86_CD th_NN birthday_NN was_VBD on_IN October_NNP 6_CD ,_, and_CC I_PRP no_RB longer_RB wanted_VBD to_TO be_VB hesitant_JJ to_TO get_VB in_IN touch_NN with_IN her_PRP ._.
+I_PRP did_VBD not_RB know_VB her_PRP face-to-face_RB ,_, and_CC so_RB it_PRP was_VBD not_RB easy_JJ for_IN me_PRP to_TO find_VB out_RP her_PRP$ address_NN ._.
+As_IN she_PRP had_VBD two_CD apartments_NNS in_IN different_JJ countries_NNS ,_, I_PRP decided_VBD to_TO write_VB to_TO both_DT ._.
+The_DT first_JJ was_VBD in_IN 12424_CD Paris_NNP in_IN Rue-de-Grandes-Illusions_NNP 5_CD ._.
+But_CC Marie_NNP Clara_NNP ,_, as_IN my_PRP$ aunt_NN is_VBZ called_VBN ,_, prefered_VBN her_PRP$ apartment_NN in_IN Berlin_NNP ._.
+It_PRP 's_VBZ postcode_JJ is_VBZ 30202_CD ._.
+She_PRP lived_VBD there_RB ,_, in_IN beautiful_JJ Kaiserstra\ufffde_NNP 13_CD ,_, particulary_NN in_IN summer_NN ._.
+
+Hi_UH my_PRP$ name_NN is_VBZ Stefanie_NNP Schmidt_NNP ,_, how_WRB much_RB is_VBZ a_DT taxi_NN from_IN Ostbahnhof_NNP to_TO Hauptbahnhof_NNP ?_.
+About_IN 10_CD Euro_NNP ,_, I_PRP reckon_VBP ._.
+That_DT sounds_VBZ good_JJ ._.
+So_RB please_VB call_VB a_DT driver_NN to_TO Leonardstra\ufffde_NNP 112_CD ,_, near_IN the_DT Ostbahnhof_NNP in_IN 56473_CD Hamburg_NNP ._.
+I_PRP 'd_MD like_VB to_TO be_VB at_IN Silberhornstra\ufffde_NNP 12_CD as_RB soon_RB as_IN possible_JJ ._.
+Thank_VB you_PRP very_RB much_RB !_.
+
+Hi_NNP Mike_NNP ,_, it_PRP 's_VBZ Stefanie_NNP Schmidt_NNP ._.
+I_PRP 'm_VBP in_IN N\ufffdrnberg_NNP at_IN the_DT moment_NN and_CC I_PRP 've_VBP got_VBD the_DT problem_NN that_IN my_PRP$ bike_NN has_VBZ broken_VBN ._.
+Could_MD you_PRP please_VB pick_VB me_PRP up_RP from_IN Seidlstra\ufffde_NNP 56_CD ,_, I_PRP 'm_VBP in_IN the_DT Caf\ufffd_NNP "Mondnacht"_NNP at_IN the_DT moment_NN ._.
+Please_VB hurry_VB up_RB ,_, I_PRP need_VBP to_TO be_VB back_RB in_IN Ulm_NNP at_IN 8_CD p.m._NN !_.
+
+My_PRP$ husband_NN George_NNP and_CC me_PRP recently_RB celebrated_VBD our_PRP$ 10_CD th_JJ wedding_NN anniversary_NN ._.
+We_PRP got_VBD married_VBN on_IN March_NNP 11_CD ,_, 1995_CD ._.
+Therefore_RB ,_, we_PRP found_VBD a_DT photo_NN album_NN with_IN pictures_NNS of_IN our_PRP$ first_JJ own_JJ apartment_NN ,_, which_WDT was_VBD in_IN 81234_CD Munich_NNP ._.
+As_IN a_DT young_JJ married_JJ couple_NN ,_, we_PRP did_VBD not_RB have_VB enough_JJ money_NN to_TO afford_VB a_DT bigger_JJR lodge_NN than_IN this_DT one_CD in_IN Blumenweg_NNP 1_CD ._.
+But_CC only_RB five_CD years_NNS later_RB ,_, my_PRP$ husband_NN was_VBD offered_VBN a_DT well-payed_JJ job_NN in_IN 17818_CD Hamburg_NNP ,_, so_IN we_PRP moved_VBD there_RB ._.
+Since_IN then_RB ,_, our_PRP$ guests_NNS have_VBP to_TO ring_VB at_IN Veilchenstra\ufffde_NNP 11_CD if_IN they_PRP want_VBP to_TO visit_VB us_PRP ,_, Luise_NNP and_CC George_NNP Bauer_NNP ._.
+
+I_PRP read_VBD your_PRP$ help-wanted_JJ ad_NN with_IN great_JJ attention_NN ._.
+I_PRP 'm_VBP a_DT student_NN of_IN informatics_NNS ,_, 6th_JJ semester,_NN and_CC I_PRP 'm_VBP very_RB interested_VBN in_IN your_PRP$ part-time_JJ job_NN offer_NN ._.
+I_PRP have_VBP a_DT competent_JJ knowledge_NN of_IN programming_NN and_CC foreign_JJ languages_NNS ,_, like_IN French_JJ and_CC Italian_JJ ._.
+I_PRP 'm_VBP looking_VBG forward_RB to_TO your_PRP$ reply_NN ._.
+
+Alisa_NNP Fernandes_NNP ,_, a_DT tourist_NN from_IN Spain_NNP ,_, went_VBD to_TO the_DT reception_NN desk_NN of_IN the_DT famous_JJ Highfly-Hotel_NNP in_IN 30303_CD Berlin_NNP ._.
+As_IN she_PRP felt_VBD quite_RB homesick_JJ ,_, she_PRP asked_VBD the_DT staff_NN if_IN they_PRP knew_VBD a_DT good_JJ Spanish_JJ restaurant_NN in_IN Berlin_NNP ._.
+The_DT concierge_NN told_VBD her_PRP to_TO go_VB to_TO the_DT "Tapasbar"_NN in_IN Chesterstr._NNP 2_CD ._.
+Alisa_NNP appreciated_VBD the_DT hint_NN and_CC enjoyed_VBD a_DT delicious_JJ traditional_JJ meal_NN ._.
+
+An_DT old_JJ friend_NN from_IN France_NNP is_VBZ currently_RB travelling_VBG around_IN Europe_NNP ._.
+Yesterday_NN ,_, she_PRP arrived_VBD in_IN Berlin_NNP and_CC we_PRP met_VBD up_RP spontaneously_RB ._.
+She_PRP wanted_VBD me_PRP to_TO show_VB her_PRP some_DT famous_JJ sights_NNS ,_, like_IN the_DT Brandenburger_NNP Tor_NNP and_CC the_DT Reichstag_NNP ._.
+But_CC it_PRP was_VBD not_RB easy_JJ to_TO meet_VB up_RP in_IN the_DT city_NN because_IN she_PRP hardly_RB knows_VBZ any_DT streetname_NN or_CC building_NN ._.
+So_IN I_PRP proposed_VBD to_TO meet_VB at_IN a_DT quite_RB local_JJ point:_NN the_DT caf\ufffd_NN "Daily's"_NN in_IN Unter-den-Linden_NNP 18,_CD 30291_CD Berlin_NNP ._.
+It_PRP is_VBZ five_CD minutes_NNS away_RB from_IN the_DT underground_JJ station_NN "Westbad"_NN ._.
+She_PRP found_VBD it_PRP instantly_RB and_CC we_PRP spent_VBD a_DT great_JJ day_NN in_IN the_DT capital_NN ._.
+
+Where_WRB did_VBD you_PRP get_VB those_DT great_JJ shoes_NNS ?_.
+They_PRP look_VBP amazing_JJ ,_, I_PRP love_VBP the_DT colour_NN ._.
+Are_VBP they_PRP made_VBN of_IN leather_NN ?_.
+No,_NNP that_DT 's_VBZ faked_VBN ._.
+But_CC anyway_RB ,_, I_PRP like_VBP them_PRP too_RB ._.
+I_PRP got_VBD them_PRP from_IN Hamburg._NNP
+Do_VBP not_RB you_PRP know_VB the_DT famous_JJ shop_NN in_IN Veilchenstra\ufffde_NNP ?_.
+It_PRP 's_VBZ called_VBN "Twentytwo"_NNP ._.
+I_PRP 've_VBP never_RB heard_VBN of_IN that_DT before_RB ._.
+Could_MD you_PRP give_VB me_PRP the_DT complete_JJ address_NN ?_.
+Sure_JJ ,_, it_PRP 's_VBZ in_IN Veilchenstra\ufffde_NNP 12_CD ,_, in_IN 78181_CD Hamburg_NNP ._.
+I_PRP deem_VBP it_PRP best_RB to_TO write_VB a_DT letter_NN to_TO the_DT owner_NN if_IN the_DT shoes_NNS are_VBP still_RB available_JJ ._.
+His_PRP$ name_NN is_VBZ Gerhard_NNP Fritsch_NNP ._.
+
+Hi_UH ,_, am_VBP I_PRP talking_VBG to_TO the_DT inquiries_NNS ?_.
+My_PRP$ name_NN is_VBZ Mike_NNP Sander_NNP and_CC I_PRP 'd_MD like_VB to_TO know_VB if_IN it_PRP is_VBZ possible_JJ to_TO get_VB information_NN about_IN an_DT address_NN if_IN I_PRP merely_RB know_VBP the_DT name_NN and_CC the_DT phone_NN number_NN of_IN a_DT person_NN !_.
+How_WRB is_VBZ he_PRP or_CC she_PRP called_VBD ?_.
+His_PRP$ name_NN is_VBZ Stefan_NNP Miller_NNP and_CC his_PRP$ number_NN is_VBZ the_DT 030/827234_CD ._.
+I'll_NNP have_VBP a_DT look_NN in_IN the_DT computer..._NN
+I_PRP found_VBD a_DT Stefan_NNP Miller_NNP who_WP lives_VBZ in_IN Leipzig._NNP
+Is_VBZ that_DT right_NN ?_.
+Yes_UH ,_, it_PRP definitely_RB is_VBZ ._.
+So_RB Stefan_NNP Miller_NNP lives_VBZ in_IN Heinrich-Heine-Stra\ufffde_NNP 112_CD ,_, in_IN 20193_CD Leipzig_NNP ._.
+Thank_VB you_PRP very_RB much_RB for_IN the_DT information_NN ._.
+Bye_NNP !_.
+
+On_IN July_NNP 14_CD ,_, the_DT father_NN of_IN a_DT family_NN got_VBD painfully_RB injured_VBN after_IN he_PRP had_VBD tried_VBN to_TO start_VB a_DT barbecue_NN ._.
+The_DT flaring_VBG flames_NNS burnt_VBP instantly_RB through_IN his_PRP$ jacket_NN ,_, which_WDT he_PRP managed_VBD to_TO pull_VB off_RP last-minute_JJ ._.
+Although_IN the_DT wounds_NNS were_VBD n't_RB life-threatening_JJ ,_, it_PRP was_VBD urgent_JJ to_TO bring_VB him_PRP directly_RB into_IN ambulance_NN ._.
+But_CC the_DT only_JJ hospital_NN that_WDT had_VBD opened_VBN that_IN Sunday_NNP was_VBD the_DT Paracelsus_NNP Hospital_NNP in_IN 83939_CD Weilheim_NNP ,_, which_WDT was_VBD 2_CD hours_NNS away_RB ._.
+Convulsed_JJ with_IN pain_NN ,_, the_DT man_NN finally_RB arrived_VBD in_IN Stifterstra\ufffde_NNP 15_CD ,_, where_WRB the_DT personal_NN immediately_RB took_VBD care_NN of_IN him_PRP ._.
+
+Last_JJ year_NN ,_, I_PRP worked_VBD as_IN a_DT delivery_NN boy_NN for_IN a_DT small_JJ local_JJ magazine_NN ._.
+I_PRP worked_VBD in_IN the_DT area_NN of_IN 83454_CD Ottobrunn_NNP ._.
+I_PRP had_VBD a_DT list_NN with_IN the_DT home_NN addresses_NNS of_IN our_PRP$ costumers_NNS whom_WP I_PRP brought_VBD their_PRP$ papers_NNS once_RB a_DT week_NN ._.
+An_DT elderly_JJ lady_NN ,_, who_WP was_VBD called_VBN Elenor_NNP Meier_NNP ,_, lived_VBD in_IN G\ufffdrtnerweg_NNP 6_CD ,_, and_CC I_PRP always_RB drove_VBD there_RB first_RB ,_, because_IN I_PRP liked_VBD her_PRP the_DT most_JJS ._.
+Afterwards_RB ,_, I_PRP went_VBD to_TO a_DT student_NN ,_, Gina_NNP Schneider_NNP ,_, who_WP lived_VBD still_RB in_IN her_PRP$ parent's_NNS house_NN in_IN G\ufffdrtnerweg_NNP 25_CD ._.
+The_DT last_JJ in_IN line_NN was_VBD the_DT retired_JJ teacher_NN Bruno_NNP Schulz_NNP in_IN Dramenstra\ufffde_NNP 15_CD ._.
+He_PRP was_VBD friendly_JJ enough_RB to_TO tip_VB sometimes_RB ._.
+
+Our_PRP$ business_NN company_NN was_VBD founded_VBN in_IN 1912_CD by_IN the_DT singer_NN and_CC entertainer_NN Michel_NNP Seile_NNP ._.
+He_PRP opened_VBD the_DT first_JJ agency_NN in_IN Erding_NNP ,_, a_DT small_JJ town_NN near_IN Munich_NNP ._.
+Now_RB ,_, more_JJR than_IN 90_CD years_NNS of_IN turbulent_JJ ups_NNS and_CC downs_NNS later_RB ,_, we_PRP finally_RB decided_VBD to_TO situate_VB our_PRP$ company_NN in_IN a_DT more_JJR central_JJ and_CC frequented_JJ area_NN ._.
+Last_JJ year_NN ,_, we_PRP moved_VBD into_IN an_DT empty_JJ factory_NN building_NN in_IN 30303_CD Berlin_NNP ._.
+It_PRP is_VBZ located_VBN in_IN Barmerstr._NNP 34_CD ._.
+
+When_WRB George_NNP Miller_NNP ,_, a_DT tourist_NN from_IN England_NNP ,_, came_VBD to_TO Munich_NNP ,_, he_PRP had_VBD no_DT idea_NN how_WRB to_TO read_VB the_DT city_NN maps_NNS ._.
+He_PRP depended_VBD completely_RB on_IN the_DT help_NN and_CC information_NN of_IN German_JJ pedestrians_NNS ._.
+One_CD day_NN ,_, he_PRP simply_RB could_MD not_RB find_VB the_DT famous_JJ Lenbachhaus_NNP ._.
+So_RB he_PRP asked_VBD a_DT young_JJ woman_NN for_IN help_NN ._.
+She_PRP pointed_VBD at_IN a_DT street_NN sign_NN and_CC explained_VBD to_TO him_PRP that_IN he_PRP 'd_MD find_VB the_DT Lenbachhaus_NNP in_IN Luisenstra\ufffde_NNP 33_CD ,_, which_WDT is_VBZ in_IN 80333_CD Munich_NNP ._.
+Miller_NNP was_VBD very_RB grateful_JJ and_CC could_MD finally_RB enjoy_VB the_DT exhibition_NN ._.
+
+On_IN March_NNP 15_CD ,_, there_EX was_VBD an_DT accident_NN near_IN Munich_NNP ._.
+The_DT driver_NN got_VBD badly_RB injured_VBN ._.
+Driving_VBG alone_RB not_RB far_RB from_IN her_PRP$ home_NN ,_, the_DT middle-aged_JJ woman_NN crashed_VBD at_IN high_JJ speed_NN into_IN a_DT tree_NN ._.
+A_DT resident_NN ,_, who_WP lives_VBZ near_IN the_DT street_NN where_WRB the_DT accident_NN took_VBD place_NN ,_, called_VBN instantly_RB the_DT police_NN ._.
+He_PRP reported_VBD what_WP had_VBD happened_VBN and_CC gave_VBD his_PRP$ name_NN and_CC address_NN to_TO the_DT officer_NN ._.
+He_PRP 's_VBZ called_VBN Peter_NNP Schubert_NNP and_CC he_PRP lives_VBZ at_IN Max-L\ufffdw-Stra\ufffde_NNP 13_CD in_IN 84630_CD Gauting_NNP ._.
+The_DT police_NN arrived_VBD ten_CD minutes_NNS later_RB and_CC brought_VBD the_DT woman_NN into_IN hospital_NN ._.
+Although_IN she_PRP had_VBD multiple_JJ trauma_NN ,_, she_PRP 's_VBZ out_IN of_IN mortal_JJ danger_NN ._.
+
+Hi_NNP ,_, how_WRB are_VBP you_PRP ?_.
+Are_VBP nt't_RB you_PRP a_DT friend_NN of_IN Natalie_NNP ?_.
+Yeah_UH for_IN sure_JJ ._.
+How_WRB did_VBD you_PRP know_VB that_DT ?_.
+I_PRP saw_VBD you_PRP sitting_VBG next_JJ to_TO her_PRP at_IN uni_JJ ._.
+Yeah_NNP she_PRP 's_VBZ my_PRP$ best_JJS friend_NN ._.
+Are_VBP you_PRP going_VBG to_TO her_PRP party_NN next_JJ friday_NN ?_.
+Oh_UH yes_UH ,_, I_PRP 'd_MD really_RB like_VB to_TO ._.
+But_CC in_IN fact_NN I_PRP do_VBP n't_RB know_VB yet_RB where_WRB it_PRP takes_VBZ place_NN ._.
+I_PRP can_MD tell_VB you_PRP :_: ring_NN at_IN Baumann,_NNP Meisenstra\ufffde_NNP 5_CD ,_, in_IN 81737_CD Munich_NNP ._.
+The_DT party_NN starts_VBZ at_IN 9_CD p.m._NN ._.
+I_PRP hope_VBP you_PRP 'll_MD find_VB it_PRP ._.
+Thank_VB you_PRP very_RB much_RB ,_, see_VBP you_PRP next_JJ friday_NN !_.
+
+My_PRP$ name_NN is_VBZ Michael_NNP Hinterhofer_NNP ._.
+When_WRB I_PRP was_VBD 21_CD ,_, I_PRP moved_VBD out_RP from_IN my_PRP$ parents_NNS home_NN into_IN my_PRP$ first_JJ own_JJ appartment_NN in_IN order_NN to_TO study_VB in_IN a_DT bigger_JJR city_NN ._.
+My_PRP$ new_JJ home_NN was_VBD in_IN Lilienstra\ufffde_NNP 1_CD in_IN 25334_CD Hamburg_NNP ._.
+But_CC I_PRP realized_VBD quickly_RB that_IN life_NN in_IN a_DT metropolis_NN was_VBD n't_RB relaxed_VBN enough_RB for_IN me_PRP ._.
+So_IN I_PRP decided_VBD to_TO move_VB into_IN a_DT smaller_JJR town_NN ._.
+Now_RB I_PRP 'm_VBP a_DT tenant_NN with_IN an_DT elderly_JJ widow_NN ._.
+We_PRP live_VBP in_IN B\ufffdrgerstra\ufffde_NNP 2_CD in_IN 63737_CD Heidelberg_NNP ._.
+I_PRP really_RB like_IN the_DT smalltown_JJ flair_NN and_CC my_PRP$ studies_NNS at_IN Heidelberg_NNP 's_POS notable_JJ university_NN ._.
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.info
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.info b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.info
new file mode 100644
index 0000000..ad5fe8d
--- /dev/null
+++ b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.info
@@ -0,0 +1,15 @@
+#
+# REQUIRED PROPERTIES
+#
+
+# Column (lemma, inflected, tag) separator. This must be a single byte in the target encoding.
+fsa.dict.separator=,
+
+# The charset in which the input is encoded. UTF-8 is strongly recommended.
+fsa.dict.encoding=UTF-8
+
+# The type of lemma-inflected form encoding compression that precedes automaton
+# construction. Allowed values: [suffix, infix, prefix, none].
+# Details are in Daciuk's paper and in the code.
+# Leave at 'prefix' if not sure.
+fsa.dict.encoder=prefix
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt
new file mode 100644
index 0000000..09d39e3
--- /dev/null
+++ b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt
@@ -0,0 +1,11 @@
+casa,casa,NOUN
+casar,casa,V
+casar,casar,V-INF
+Casa,Casa,PROP
+casa,casinha,NOUN
+casa,casona,NOUN
+menino,menina,NOUN
+menino,menino,NOUN
+menino,meninão,NOUN
+menino,menininho,NOUN
+carro,carro,NOUN
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
deleted file mode 100644
index 56d0e47..0000000
--- a/pom.xml
+++ /dev/null
@@ -1,109 +0,0 @@
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <groupId>org.apache.opennlp</groupId>
- <artifactId>morfologik-addon</artifactId>
- <version>1.0-SNAPSHOT</version>
- <packaging>jar</packaging>
- <name>Morfologik Addon</name>
-
- <url>http://maven.apache.org</url>
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-compiler-plugin</artifactId>
- <version>2.3.2</version>
- <configuration>
- <source>1.7</source>
- <target>1.7</target>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- <executions>
- <execution>
- <id>bundle-project-sources</id>
- <phase>package</phase>
- <goals>
- <goal>single</goal>
- </goals>
- <configuration>
- <descriptors>
- <descriptor>src/main/assembly/bin.xml</descriptor>
- <descriptor>src/main/assembly/src.xml</descriptor>
- </descriptors>
- <!-- Tar package is only compatible with gnu tar,
- many file have more than 100 chars.
- Right now only javadoc files are too long.
- -->
- <tarLongFileMode>gnu</tarLongFileMode>
-
- <finalName>apache-opennlp-morfologik-addon-${project.version}</finalName>
- </configuration>
- </execution>
- </executions>
- </plugin>
- <plugin>
- <artifactId>maven-antrun-plugin</artifactId>
- <version>1.6</version>
- <executions>
- <execution>
- <id>generate checksums for binary artifacts</id>
- <goals><goal>run</goal></goals>
- <phase>verify</phase>
- <configuration>
- <target>
- <checksum algorithm="sha1" format="MD5SUM">
- <fileset dir="${project.build.directory}">
- <include name="*.zip" />
- <include name="*.gz" />
- </fileset>
- </checksum>
- <checksum algorithm="md5" format="MD5SUM">
- <fileset dir="${project.build.directory}">
- <include name="*.zip" />
- <include name="*.gz" />
- </fileset>
- </checksum>
- </target>
- </configuration>
- </execution>
- </executions>
- </plugin>
- </plugins>
- </build>
- <properties>
- <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
- </properties>
-
- <dependencies>
- <dependency>
- <groupId>org.carrot2</groupId>
- <artifactId>morfologik-stemming</artifactId>
- <version>2.1.0</version>
- <scope>compile</scope>
- </dependency>
- <dependency>
- <groupId>org.carrot2</groupId>
- <artifactId>morfologik-tools</artifactId>
- <version>2.1.0</version>
- <scope>compile</scope>
- </dependency>
-
- <dependency>
- <groupId>org.apache.opennlp</groupId>
- <artifactId>opennlp-tools</artifactId>
- <version>1.6.0</version>
- </dependency>
-
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>4.8.1</version>
- <scope>test</scope>
- </dependency>
-
- </dependencies>
-</project>
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/assembly/bin.xml
----------------------------------------------------------------------
diff --git a/src/main/assembly/bin.xml b/src/main/assembly/bin.xml
deleted file mode 100644
index ab4f6da..0000000
--- a/src/main/assembly/bin.xml
+++ /dev/null
@@ -1,91 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-<assembly>
- <id>bin</id>
- <formats>
- <format>tar.gz</format>
- <format>zip</format>
- <format>dir</format>
- </formats>
-
- <includeBaseDirectory>true</includeBaseDirectory>
- <baseDirectory>/apache-opennlp-morfologik-addon-${project.version}</baseDirectory>
-
- <dependencySets>
- <dependencySet>
- <scope>runtime</scope>
- <unpack>false</unpack>
- <useProjectArtifact>false</useProjectArtifact>
- <fileMode>644</fileMode>
- <directoryMode>755</directoryMode>
- <outputDirectory>lib</outputDirectory>
- <useTransitiveDependencies>true</useTransitiveDependencies>
- </dependencySet>
- </dependencySets>
-
- <fileSets>
- <fileSet>
- <directory>src/main/readme</directory>
- <outputDirectory></outputDirectory>
- <fileMode>644</fileMode>
- <directoryMode>755</directoryMode>
- </fileSet>
-
- <fileSet>
- <directory>.</directory>
- <outputDirectory></outputDirectory>
- <filtered>true</filtered>
- <fileMode>644</fileMode>
- <directoryMode>755</directoryMode>
- <includes>
- <include>README</include>
- <include>RELEASE_NOTES.html</include>
- </includes>
- </fileSet>
-
- <fileSet>
- <directory>target</directory>
- <outputDirectory></outputDirectory>
- <fileMode>644</fileMode>
- <directoryMode>755</directoryMode>
- <includes>
- <include>issuesFixed/**</include>
- </includes>
- </fileSet>
-
- <fileSet>
- <directory>src/main/bin</directory>
- <fileMode>755</fileMode>
- <directoryMode>755</directoryMode>
- <outputDirectory>bin</outputDirectory>
- </fileSet>
-
- <fileSet>
- <directory>target</directory>
- <outputDirectory>lib</outputDirectory>
- <includes>
- <include>morfologik-addon-*.jar</include>
- </includes>
- </fileSet>
-
- </fileSets>
-</assembly>
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/assembly/src.xml
----------------------------------------------------------------------
diff --git a/src/main/assembly/src.xml b/src/main/assembly/src.xml
deleted file mode 100644
index cdcc9d3..0000000
--- a/src/main/assembly/src.xml
+++ /dev/null
@@ -1,39 +0,0 @@
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<assembly>
- <id>src</id>
- <formats>
- <format>tar.gz</format>
- <format>zip</format>
- </formats>
-
- <baseDirectory>/apache-opennlp-${project.version}-src</baseDirectory>
-
- <fileSets>
- <fileSet>
- <directory>../</directory>
- <outputDirectory></outputDirectory>
- <excludes>
- <exclude>**/target/**</exclude>
- <exclude>**/.*/**</exclude>
- <exclude>**/pom.xml.releaseBackup</exclude>
- <exclude>**/release.properties</exclude>
- </excludes>
- </fileSet>
- </fileSets>
-</assembly>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/bin/morfologik-addon
----------------------------------------------------------------------
diff --git a/src/main/bin/morfologik-addon b/src/main/bin/morfologik-addon
deleted file mode 100755
index 9b0faf9..0000000
--- a/src/main/bin/morfologik-addon
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/sh
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Note: Do not output anything in this script file, any output
-# may be inadvertantly placed in any output files if
-# output redirection is used.
-
-if [ -z "$JAVACMD" ] ; then
- if [ -n "$JAVA_HOME" ] ; then
- JAVACMD="$JAVA_HOME/bin/java"
- else
- JAVACMD="`which java`"
- fi
-fi
-
-# Might fail if $0 is a link
-OPENNLP_HOME=`dirname "$0"`/..
-
-$JAVACMD -Xmx1024m -cp "lib/*" opennlp.morfologik.cmdline.CLI $@
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/bin/morfologik-addon.bat
----------------------------------------------------------------------
diff --git a/src/main/bin/morfologik-addon.bat b/src/main/bin/morfologik-addon.bat
deleted file mode 100644
index aeec31f..0000000
--- a/src/main/bin/morfologik-addon.bat
+++ /dev/null
@@ -1,47 +0,0 @@
-@ECHO off
-
-REM # Licensed to the Apache Software Foundation (ASF) under one
-REM # or more contributor license agreements. See the NOTICE file
-REM # distributed with this work for additional information
-REM # regarding copyright ownership. The ASF licenses this file
-REM # to you under the Apache License, Version 2.0 (the
-REM # "License"); you may not use this file except in compliance
-REM # with the License. You may obtain a copy of the License at
-REM #
-REM # http://www.apache.org/licenses/LICENSE-2.0
-REM #
-REM # Unless required by applicable law or agreed to in writing,
-REM # software distributed under the License is distributed on an
-REM # # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-REM # KIND, either express or implied. See the License for the
-REM # specific language governing permissions and limitations
-REM # under the License.
-
-REM # Note: Do not output anything in this script file, any output
-REM # may be inadvertantly placed in any output files if
-REM # output redirection is used.
-SETLOCAL
-
-IF "%JAVA_CMD%" == "" (
- IF "%JAVA_HOME%" == "" (
- SET JAVA_CMD=java
- ) ELSE (
- REM # Keep JAVA_HOME to short-name without spaces
- FOR %%A IN ("%JAVA_HOME%") DO SET JAVA_CMD=%%~sfA\bin\java
- )
-)
-
-REM # Should work with Windows XP and greater. If not, specify the path to where it is installed.
-IF "%OPENNLP_HOME%" == "" (
- SET OPENNLP_HOME=%~sp0..
-) ELSE (
- REM # Keep OPENNLP_HOME to short-name without spaces
- FOR %%A IN ("%OPENNLP_HOME%") DO SET OPENNLP_HOME=%%~sfA
-)
-
-REM # Get the library JAR file name (JIRA OPENNLP-554)
-FOR %%A IN ("%OPENNLP_HOME%\lib\*.jar") DO SET JAR_FILE=%%A
-
-%JAVA_CMD% -Xmx1024m -jar %JAR_FILE% %*
-
-ENDLOCAL
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/bin/opennlp-cp
----------------------------------------------------------------------
diff --git a/src/main/bin/opennlp-cp b/src/main/bin/opennlp-cp
deleted file mode 100755
index dff0d12..0000000
--- a/src/main/bin/opennlp-cp
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/sh
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Note: Do not output anything in this script file, any output
-# may be inadvertantly placed in any output files if
-# output redirection is used.
-
-if [ -z "$JAVACMD" ] ; then
- if [ -n "$JAVA_HOME" ] ; then
- JAVACMD="$JAVA_HOME/bin/java"
- else
- JAVACMD="`which java`"
- fi
-fi
-
-# Might fail if $0 is a link
-OPENNLP_HOME=`dirname "$0"`/..
-
-$JAVACMD -Xmx1024m -cp "lib/*" opennlp.tools.cmdline.CLI $@
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
deleted file mode 100644
index dbbca4d..0000000
--- a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.builder;
-
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.nio.file.Path;
-import java.util.Properties;
-
-import morfologik.stemming.DictionaryMetadata;
-import morfologik.stemming.EncoderType;
-import morfologik.tools.DictCompile;
-
-/**
- * Utility class to build Morfologik dictionaries from a tab separated values
- * file. The first column is the word, the second its lemma and the third a POS
- * tag. If there is no lemma information leave the second column empty.
- */
-public class MorfologikDictionayBuilder {
-
- /**
- * Helper to compile a morphological dictionary automaton.
- *
- * @param input
- * The input file (base,inflected,tag). An associated metadata
- * (*.info) file must exist.
- * @param overwrite
- * Overwrite the output file if it exists.
- * @param validate
- * Validate input to make sure it makes sense.
- * @param acceptBom
- * Accept leading BOM bytes (UTF-8).
- * @param acceptCr
- * Accept CR bytes in input sequences (\r).
- * @param ignoreEmpty
- * Ignore empty lines in the input.
- * @return the dictionary path
- *
- * @throws Exception
- */
- public Path build(Path input, boolean overwrite, boolean validate,
- boolean acceptBom, boolean acceptCr, boolean ignoreEmpty)
- throws Exception {
-
- DictCompile compiler = new DictCompile(input, overwrite, validate,
- acceptBom, acceptCr, ignoreEmpty);
- compiler.call();
-
-
- Path metadataPath = DictionaryMetadata
- .getExpectedMetadataLocation(input);
-
- return metadataPath.resolveSibling(
- metadataPath.getFileName().toString().replaceAll(
- "\\." + DictionaryMetadata.METADATA_FILE_EXTENSION + "$", ".dict"));
- }
-
- /**
- * Helper to compile a morphological dictionary automaton using default
- * parameters.
- *
- * @param input
- * The input file (base,inflected,tag). An associated metadata
- * (*.info) file must exist.
- *
- * @return the dictionary path
- *
- * @throws Exception
- */
- public Path build(Path input) throws Exception {
-
- return build(input, true, true, false, false, false);
-
- }
-
- Properties createProperties(Charset encoding, String separator,
- EncoderType encoderType) throws FileNotFoundException, IOException {
-
- Properties properties = new Properties();
- properties.setProperty("fsa.dict.separator", separator);
- properties.setProperty("fsa.dict.encoding", encoding.name());
- properties.setProperty("fsa.dict.encoder", encoderType.name());
-
- return properties;
-
- }
-}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/cmdline/CLI.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/CLI.java b/src/main/java/opennlp/morfologik/cmdline/CLI.java
deleted file mode 100644
index f92d178..0000000
--- a/src/main/java/opennlp/morfologik/cmdline/CLI.java
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.cmdline;
-
-import java.util.Collections;
-import java.util.LinkedHashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import opennlp.morfologik.cmdline.builder.MorfologikDictionaryBuilderTool;
-import opennlp.morfologik.cmdline.builder.XMLDictionaryToTableTool;
-import opennlp.tools.cmdline.BasicCmdLineTool;
-import opennlp.tools.cmdline.CmdLineTool;
-import opennlp.tools.cmdline.StreamFactoryRegistry;
-import opennlp.tools.cmdline.TerminateToolException;
-import opennlp.tools.cmdline.TypedCmdLineTool;
-import opennlp.tools.util.Version;
-
-/**
- * Command line entry point of the OpenNLP Morfologik addon. The first
- * argument names the tool to run, the remaining arguments are handed to it.
- */
-public final class CLI {
-
-  public static final String CMD = "opennlp-morfologik-addon";
-
-  // Tool name -> tool, in registration order; unmodifiable once the static
-  // initializer has run.
-  private static Map<String, CmdLineTool> toolLookupMap;
-
-  static {
-    List<CmdLineTool> registered = new LinkedList<CmdLineTool>();
-    registered.add(new MorfologikDictionaryBuilderTool());
-    registered.add(new XMLDictionaryToTableTool());
-
-    Map<String, CmdLineTool> byName = new LinkedHashMap<String, CmdLineTool>();
-    for (CmdLineTool registeredTool : registered) {
-      byName.put(registeredTool.getName(), registeredTool);
-    }
-
-    toolLookupMap = Collections.unmodifiableMap(byName);
-  }
-
-  /**
-   * @return a set which contains all tool names
-   */
-  public static Set<String> getToolNames() {
-    return toolLookupMap.keySet();
-  }
-
-  /**
-   * Prints the addon version, the usage line and one line per registered
-   * tool (name followed by its short description) to standard output.
-   */
-  private static void usage() {
-    System.out.print("OpenNLP Morfologik Addon "
-        + Version.currentVersion().toString() + ". ");
-    System.out.println("Usage: " + CMD + " TOOL");
-    System.out.println("where TOOL is one of:");
-
-    // width of the name column: longest tool name plus four
-    int longestName = -1;
-    for (String name : toolLookupMap.keySet()) {
-      longestName = Math.max(longestName, name.length());
-    }
-    int columnWidth = longestName + 4;
-
-    for (CmdLineTool tool : toolLookupMap.values()) {
-      String name = tool.getName();
-      System.out.print(" " + name);
-
-      // pad so that all descriptions start in the same column
-      int padding = Math.abs(name.length() - columnWidth);
-      for (int i = 0; i < padding; i++) {
-        System.out.print(" ");
-      }
-
-      System.out.println(tool.getShortDescription());
-    }
-
-    System.out
-        .println("All tools print help when invoked with help parameter");
-    System.out
-        .println("Example: opennlp-morfologik-addon POSDictionaryBuilder help");
-  }
-
-  /**
-   * Runs the tool named by {@code args[0]}. Without arguments the usage text
-   * is printed. A tool name of the form {@code TOOL.format} selects a format
-   * for typed tools. The process exits with the code carried by a
-   * {@link TerminateToolException} when a tool terminates that way.
-   *
-   * @param args
-   *          the tool name followed by the tool arguments
-   */
-  @SuppressWarnings("rawtypes")
-  public static void main(String[] args) {
-
-    if (args.length == 0) {
-      usage();
-      System.exit(0);
-    }
-
-    String[] toolArguments = new String[args.length - 1];
-    System.arraycopy(args, 1, toolArguments, 0, toolArguments.length);
-
-    String toolName = args[0];
-
-    // split an optional ".format" suffix off the tool name
-    String formatName = StreamFactoryRegistry.DEFAULT_FORMAT;
-    int idx = toolName.indexOf(".");
-    if (idx > -1) {
-      formatName = toolName.substring(idx + 1);
-      toolName = toolName.substring(0, idx);
-    }
-    CmdLineTool tool = toolLookupMap.get(toolName);
-
-    try {
-      if (tool == null) {
-        throw new TerminateToolException(1, "Tool " + toolName + " is not found.");
-      }
-
-      // help is shown when a tool with parameters gets no arguments or when
-      // the first argument is "help"
-      boolean missingArguments = toolArguments.length == 0 && tool.hasParams();
-      boolean helpRequested = toolArguments.length > 0
-          && "help".equals(toolArguments[0]);
-      if (missingArguments || helpRequested) {
-        if (tool instanceof TypedCmdLineTool) {
-          System.out.println(((TypedCmdLineTool) tool).getHelp(formatName));
-        } else if (tool instanceof BasicCmdLineTool) {
-          System.out.println(tool.getHelp());
-        }
-
-        System.exit(0);
-      }
-
-      if (tool instanceof TypedCmdLineTool) {
-        ((TypedCmdLineTool) tool).run(formatName, toolArguments);
-      } else if (!(tool instanceof BasicCmdLineTool)) {
-        throw new TerminateToolException(1, "Tool " + toolName + " is not supported.");
-      } else if (idx != -1) {
-        throw new TerminateToolException(1, "Tool " + toolName + " does not support formats.");
-      } else {
-        ((BasicCmdLineTool) tool).run(toolArguments);
-      }
-    } catch (TerminateToolException e) {
-
-      String message = e.getMessage();
-      if (message != null) {
-        System.err.println(message);
-      }
-
-      Throwable cause = e.getCause();
-      if (cause != null) {
-        System.err.println(cause.getMessage());
-        cause.printStackTrace(System.err);
-      }
-
-      System.exit(e.getCode());
-    }
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
deleted file mode 100644
index 5ea2e4f..0000000
--- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.cmdline.builder;
-
-import java.io.File;
-
-import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
-import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
-import opennlp.tools.cmdline.params.EncodingParameter;
-
-/**
- * Command line parameters of the Morfologik dictionary builder tool. All
- * boolean options are optional and default to {@code false}.
- */
-interface MorfologikDictionaryBuilderParams extends EncodingParameter {
-
-  @ParameterDescription(valueName = "in", description = "The input file (base,inflected,tag). An associated metadata (*.info) file must exist.")
-  File getInputFile();
-
-  @ParameterDescription(valueName = "true|false", description = "Accept leading BOM bytes (UTF-8).")
-  @OptionalParameter(defaultValue="false")
-  Boolean getAcceptBOM();
-
-  // The backslash is escaped so the help text shows the two characters "\r";
-  // an unescaped \r would put a real carriage return into the description.
-  @ParameterDescription(valueName = "true|false", description = "Accept CR bytes in input sequences (\\r).")
-  @OptionalParameter(defaultValue="false")
-  Boolean getAcceptCR();
-
-  // NOTE(review): MorfologikDictionaryBuilderTool.run does not read this
-  // value - confirm whether the option is still meant to have an effect.
-  @ParameterDescription(valueName = "FSA5|CFSA2", description = "Automaton serialization format.")
-  @OptionalParameter(defaultValue="FSA5")
-  String getFormat();
-
-  @ParameterDescription(valueName = "true|false", description = "Ignore empty lines in the input.")
-  @OptionalParameter(defaultValue="false")
-  Boolean getIgnoreEmpty();
-
-  @ParameterDescription(valueName = "true|false", description = "Overwrite the output file if it exists.")
-  @OptionalParameter(defaultValue="false")
-  Boolean getOverwrite();
-
-  @ParameterDescription(valueName = "true|false", description = "Validate input to make sure it makes sense.")
-  @OptionalParameter(defaultValue="false")
-  Boolean getValidate();
-}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
deleted file mode 100644
index eb9b51c..0000000
--- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.cmdline.builder;
-
-import java.io.File;
-import java.nio.file.Path;
-
-import morfologik.stemming.DictionaryMetadata;
-import opennlp.morfologik.builder.MorfologikDictionayBuilder;
-import opennlp.tools.cmdline.BasicCmdLineTool;
-import opennlp.tools.cmdline.CmdLineUtil;
-import opennlp.tools.cmdline.TerminateToolException;
-
-/**
- * Command line tool that builds a binary Morfologik POS dictionary from an
- * input file (base,inflected,tag) and its associated metadata (.info) file.
- */
-public class MorfologikDictionaryBuilderTool extends BasicCmdLineTool {
-
-  interface Params extends MorfologikDictionaryBuilderParams {
-  }
-
-  @Override
-  public String getShortDescription() {
-    return "builds a binary POS Dictionary using Morfologik";
-  }
-
-  @Override
-  public String getHelp() {
-    return getBasicHelp(Params.class);
-  }
-
-  /**
-   * Parses the arguments, checks that the input file and its metadata file
-   * can be used as input, and builds the dictionary.
-   *
-   * @param args
-   *          the tool arguments
-   *
-   * @throws TerminateToolException
-   *           with code -1 if the dictionary build fails
-   */
-  @Override
-  public void run(String[] args) {
-    Params params = validateAndParseParams(args, Params.class);
-
-    File dictInFile = params.getInputFile();
-
-    CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
-    Path metadataPath = DictionaryMetadata.getExpectedMetadataLocation(dictInFile.toPath());
-    CmdLineUtil.checkInputFile("dictionary metadata (.info) input file", metadataPath.toFile());
-
-    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
-    try {
-      builder.build(dictInFile.toPath(), params.getOverwrite(),
-          params.getValidate(), params.getAcceptBOM(), params.getAcceptCR(),
-          params.getIgnoreEmpty());
-    } catch (Exception e) {
-      // build declares "throws Exception", so the broad catch is required;
-      // the cause is preserved for the caller's error reporting.
-      throw new TerminateToolException(-1,
-          "Error while creating Morfologik POS Dictionary: " + e.getMessage(), e);
-    }
-
-  }
-}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
deleted file mode 100644
index 4ee8cd4..0000000
--- a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.cmdline.builder;
-
-import java.io.File;
-
-import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
-import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
-import opennlp.tools.cmdline.params.EncodingParameter;
-
-/**
- * Command line parameters of the tool that converts an OpenNLP XML tag
- * dictionary into the table format used as Morfologik input.
- */
-interface XMLDictionaryToTableParams extends EncodingParameter {
-
-  @ParameterDescription(valueName = "in", description = "OpenNLP XML Tag Dictionary.")
-  File getInputFile();
-
-  @ParameterDescription(valueName = "out", description = "Output for Morfologik (.info will be also created).")
-  File getOutputFile();
-
-  @ParameterDescription(valueName = "char", description = "Column separator (must be a single character)")
-  @OptionalParameter(defaultValue=",")
-  String getSeparator();
-
-  @ParameterDescription(valueName = "value", description = "Type of lemma-inflected form encoding compression that precedes automaton construction. Allowed values: [suffix, infix, prefix, none].")
-  @OptionalParameter(defaultValue="prefix")
-  String getEncoder();
-
-}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
deleted file mode 100644
index 0e7f2d5..0000000
--- a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.cmdline.builder;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Iterator;
-import java.util.Properties;
-
-import morfologik.stemming.DictionaryMetadata;
-import opennlp.tools.cmdline.BasicCmdLineTool;
-import opennlp.tools.cmdline.CmdLineUtil;
-import opennlp.tools.cmdline.TerminateToolException;
-import opennlp.tools.postag.POSDictionary;
-
-/**
- * Command line tool that reads an OpenNLP XML tag dictionary and writes its
- * entries as separator-delimited lines, together with the metadata (.info)
- * file describing separator, encoding and encoder.
- */
-public class XMLDictionaryToTableTool extends BasicCmdLineTool {
-
-  interface Params extends XMLDictionaryToTableParams {
-  }
-
-  // Column separator of the current run; assigned from the parameters in run().
-  private String SEPARATOR;
-
-  @Override
-  public String getShortDescription() {
-    return "reads an OpenNLP XML tag dictionary and outputs it in a tab separated file";
-  }
-
-  @Override
-  public String getHelp() {
-    return getBasicHelp(Params.class);
-  }
-
-  /**
-   * Converts the XML tag dictionary named by the parameters into the output
-   * table and writes the accompanying metadata file.
-   *
-   * @param args
-   *          the tool arguments
-   *
-   * @throws TerminateToolException
-   *           with code -1 if the dictionary cannot be loaded or either
-   *           output file cannot be written
-   */
-  @Override
-  public void run(String[] args) {
-    Params params = validateAndParseParams(args, Params.class);
-
-    File dictInFile = params.getInputFile();
-    File dictOutFile = params.getOutputFile();
-    Charset encoding = params.getEncoding();
-    SEPARATOR = params.getSeparator();
-
-    CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
-    CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
-
-    POSDictionary tagDictionary = null;
-    // try-with-resources closes the input stream once the dictionary is loaded
-    try (FileInputStream dictIn = new FileInputStream(dictInFile)) {
-      tagDictionary = POSDictionary.create(dictIn);
-    } catch (IOException e) {
-      throw new TerminateToolException(-1,
-          "Error while loading XML POS Dictionary: " + e.getMessage(), e);
-    }
-    Iterator<String> iterator = tagDictionary.iterator();
-
-    try (BufferedWriter writer = Files.newBufferedWriter(dictOutFile.toPath(),
-        encoding)) {
-      while (iterator.hasNext()) {
-        String word = iterator.next();
-        for (String tag : tagDictionary.getTags(word)) {
-          if (valid(word, tag)) {
-            writer.write(createEntry(word, tag));
-            writer.newLine();
-          }
-        }
-      }
-    } catch (IOException e) {
-      throw new TerminateToolException(-1, "Error while writing output: "
-          + e.getMessage(), e);
-    }
-    // reported only after the writer has been flushed and closed successfully
-    System.out.println("Created dictionary: " + dictOutFile.toPath());
-
-    Properties info = new Properties();
-    info.setProperty("fsa.dict.separator", SEPARATOR);
-    info.setProperty("fsa.dict.encoding", encoding.name());
-    info.setProperty("fsa.dict.encoder", params.getEncoder());
-
-    Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictOutFile.toPath());
-
-    // Properties.store does not close the stream, so it is closed here
-    try (java.io.OutputStream metaOut = Files.newOutputStream(metaPath)) {
-      info.store(metaOut, "Info file for FSA Morfologik dictionary.");
-    } catch (IOException e) {
-      throw new TerminateToolException(-1, "Error while writing metadata output: "
-          + e.getMessage(), e);
-    }
-    System.out.println("Created metadata: " + metaPath);
-
-  }
-
-  /**
-   * Checks that neither the word nor the tag contains the separator; prints
-   * a warning for entries that do.
-   *
-   * @return {@code true} if the entry can be written
-   */
-  private boolean valid(String word, String tag) {
-    if (word.contains(SEPARATOR) || tag.contains(SEPARATOR)) {
-      System.out
-          .println("Warn: invalid entry because contains separator - word: "
-              + word + " tag: " + tag);
-      return false;
-    }
-
-    return true;
-  }
-
-  /**
-   * Builds one output line: an empty base column, the word and the tag,
-   * joined by the separator.
-   */
-  private String createEntry(String word, String tag) {
-    return "" + SEPARATOR // base
-        + word + SEPARATOR
-        + tag;
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
deleted file mode 100644
index 2090ce5..0000000
--- a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.lemmatizer;
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import morfologik.stemming.Dictionary;
-import morfologik.stemming.DictionaryLookup;
-import morfologik.stemming.IStemmer;
-import morfologik.stemming.WordData;
-import opennlp.tools.lemmatizer.DictionaryLemmatizer;
-
-/**
- * Dictionary lemmatizer that looks words up in a Morfologik dictionary.
- */
-public class MorfologikLemmatizer implements DictionaryLemmatizer {
-
-  private final IStemmer dictLookup;
-
-  /** POS tags whose words are looked up, and returned, without lower-casing. */
-  public final Set<String> constantTags = new HashSet<String>(Arrays.asList(
-      "NNP", "NP00000"));
-
-  /**
-   * @param dictionaryPath
-   *          path of the Morfologik dictionary to read
-   *
-   * @throws IllegalArgumentException
-   *           declared by the signature; presumably raised by Morfologik for
-   *           an unusable dictionary - confirm
-   * @throws IOException
-   *           declared by the signature; presumably raised when the
-   *           dictionary cannot be read - confirm
-   */
-  public MorfologikLemmatizer(Path dictionaryPath) throws IllegalArgumentException,
-      IOException {
-    dictLookup = new DictionaryLookup(Dictionary.read(dictionaryPath));
-  }
-
-  /**
-   * Looks the word up and maps each (word, tag) pair returned by the
-   * dictionary to its stem. A later reading with the same tag replaces an
-   * earlier one.
-   */
-  private HashMap<List<String>, String> getLemmaTagsDict(String word) {
-    List<WordData> wdList = dictLookup.lookup(word);
-    HashMap<List<String>, String> dictMap = new HashMap<List<String>, String>();
-    for (WordData wd : wdList) {
-      List<String> wordLemmaTags = new ArrayList<String>();
-      wordLemmaTags.add(word);
-      wordLemmaTags.add(wd.getTag().toString());
-      dictMap.put(wordLemmaTags, wd.getStem().toString());
-    }
-    return dictMap;
-  }
-
-  /** Returns the word as used for lookup: unchanged for constant tags, lower-cased otherwise. */
-  private String lookupForm(String word, String postag) {
-    return constantTags.contains(postag) ? word : word.toLowerCase();
-  }
-
-  /** Builds the (lookup word, tag) key used to query the map of readings. */
-  private List<String> getDictKeys(String word, String postag) {
-    return new ArrayList<String>(Arrays.asList(lookupForm(word, postag), postag));
-  }
-
-  /** Returns the (word, tag) to stem map for the lookup form of the word. */
-  private HashMap<List<String>, String> getDictMap(String word, String postag) {
-    return getLemmaTagsDict(lookupForm(word, postag));
-  }
-
-  /**
-   * Returns the dictionary stem for the word with the given POS tag. When the
-   * dictionary has no matching entry, the word itself is returned for
-   * constant tags and for words that equal their upper-case form; any other
-   * word is returned lower-cased.
-   *
-   * @param word
-   *          the word to lemmatize
-   * @param postag
-   *          the POS tag of the word
-   *
-   * @return the lemma, never {@code null} for a non-null word
-   */
-  public String lemmatize(String word, String postag) {
-    List<String> keys = getDictKeys(word, postag);
-    String lemma = getDictMap(word, postag).get(keys);
-    if (lemma != null) {
-      return lemma;
-    }
-    // equals() instead of ==: the all-upper-case check must compare content,
-    // not rely on toUpperCase() returning the same String instance
-    if (constantTags.contains(postag) || word.toUpperCase().equals(word)) {
-      return word;
-    }
-    return word.toLowerCase();
-  }
-}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
deleted file mode 100644
index 93d6c61..0000000
--- a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.tagdict;
-
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Map;
-
-import morfologik.stemming.DictionaryMetadata;
-import opennlp.tools.dictionary.Dictionary;
-import opennlp.tools.postag.POSTaggerFactory;
-import opennlp.tools.postag.TagDictionary;
-import opennlp.tools.util.InvalidFormatException;
-import opennlp.tools.util.model.ArtifactSerializer;
-import opennlp.tools.util.model.ModelUtil;
-
-/**
- * {@link POSTaggerFactory} whose tag dictionary is a Morfologik dictionary.
- * The raw dictionary and metadata bytes are kept so that
- * {@link #createArtifactMap()} can store them as artifacts.
- */
-public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
-
-  private static final String MORFOLOGIK_POSDICT_SUF = "morfologik_dict";
-  private static final String MORFOLOGIK_DICT_INFO_SUF = "morfologik_info";
-
-  private static final String MORFOLOGIK_POSDICT = "tagdict."
-      + MORFOLOGIK_POSDICT_SUF;
-  private static final String MORFOLOGIK_DICT_INFO = "tagdict."
-      + MORFOLOGIK_DICT_INFO_SUF;
-
-  private TagDictionary dict;
-
-  private byte[] dictInfo;
-  private byte[] dictData;
-
-  public MorfologikPOSTaggerFactory() {
-  }
-
-  /**
-   * Reads a Morfologik dictionary file and its metadata file, remembers the
-   * bytes of both and creates the tag dictionary from them.
-   *
-   * @param dictionary
-   *          the Morfologik dictionary file
-   *
-   * @return the tag dictionary backed by the given file
-   *
-   * @throws FileNotFoundException
-   *           if the dictionary or its metadata file cannot be read
-   * @throws IOException
-   *           if reading the files fails
-   */
-  public TagDictionary createTagDictionary(File dictionary)
-      throws InvalidFormatException, FileNotFoundException, IOException {
-
-    if (!dictionary.canRead()) {
-      throw new FileNotFoundException("Could not read dictionary: " + dictionary.getAbsolutePath());
-    }
-
-    Path dictionaryMeta = DictionaryMetadata.getExpectedMetadataLocation(dictionary.toPath());
-
-    if (dictionaryMeta == null || !dictionaryMeta.toFile().canRead()) {
-      // dictionaryMeta may be null in this branch, so it is not dereferenced
-      // unconditionally while building the message
-      String metaName = (dictionaryMeta == null)
-          ? "<unknown>" : String.valueOf(dictionaryMeta.getFileName());
-      throw new FileNotFoundException("Could not read dictionary metadata: " + metaName);
-    }
-
-    this.dictData = Files.readAllBytes(dictionary.toPath());
-    this.dictInfo = Files.readAllBytes(dictionaryMeta);
-
-    return createMorfologikDictionary(dictData, dictInfo);
-
-  }
-
-
-  @Override
-  protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) {
-    // the tag dictionary is held by this class; the superclass receives null
-    super.init(ngramDictionary, null);
-    this.dict = posDictionary;
-  }
-
-  /**
-   * Returns the tag dictionary, creating it on first use from the dictionary
-   * and metadata artifacts when an artifact provider supplies them.
-   *
-   * @return the tag dictionary, or {@code null} if none is set and none can
-   *         be created from artifacts
-   */
-  @Override
-  public TagDictionary getTagDictionary() {
-    if (this.dict == null && artifactProvider != null) {
-      Object obj = artifactProvider.getArtifact(MORFOLOGIK_POSDICT);
-      if (obj != null) {
-        byte[] data = (byte[]) obj;
-        byte[] info = (byte[]) artifactProvider
-            .getArtifact(MORFOLOGIK_DICT_INFO);
-
-        try {
-          this.dict = createMorfologikDictionary(data, info);
-        } catch (IllegalArgumentException e) {
-          throw new RuntimeException(
-              "Could not load the dictionary files to Morfologik.", e);
-        } catch (IOException e) {
-          throw new RuntimeException(
-              "IO error while reading the Morfologik dictionary files.", e);
-        }
-      }
-    }
-
-    return this.dict;
-  }
-
-  @Override
-  public void setTagDictionary(TagDictionary dictionary) {
-    this.dict = dictionary;
-  }
-
-  @Override
-  public TagDictionary createEmptyTagDictionary() {
-    throw new UnsupportedOperationException(
-        "Morfologik POS Tagger factory does not support this operation");
-  }
-
-  @Override
-  public TagDictionary createTagDictionary(InputStream in)
-      throws InvalidFormatException, IOException {
-    throw new UnsupportedOperationException(
-        "Morfologik POS Tagger factory does not support this operation");
-  }
-
-  @Override
-  @SuppressWarnings("rawtypes")
-  public Map<String, ArtifactSerializer> createArtifactSerializersMap() {
-    Map<String, ArtifactSerializer> serializers = super
-        .createArtifactSerializersMap();
-
-    serializers.put(MORFOLOGIK_POSDICT_SUF, new ByteArraySerializer());
-    serializers.put(MORFOLOGIK_DICT_INFO_SUF, new ByteArraySerializer());
-
-    return serializers;
-  }
-
-  @Override
-  public Map<String, Object> createArtifactMap() {
-    Map<String, Object> artifactMap = super.createArtifactMap();
-    artifactMap.put(MORFOLOGIK_POSDICT, this.dictData);
-    artifactMap.put(MORFOLOGIK_DICT_INFO, this.dictInfo);
-    return artifactMap;
-  }
-
-  /** Creates the tag dictionary from in-memory dictionary and metadata bytes. */
-  private TagDictionary createMorfologikDictionary(byte[] data, byte[] info)
-      throws IOException {
-    morfologik.stemming.Dictionary dict = morfologik.stemming.Dictionary
-        .read(new ByteArrayInputStream(data), new ByteArrayInputStream(
-            info));
-    return new MorfologikTagDictionary(dict);
-  }
-
-  /** Serializer that stores and loads an artifact as its raw bytes. */
-  static class ByteArraySerializer implements ArtifactSerializer<byte[]> {
-
-    public byte[] create(InputStream in) throws IOException,
-        InvalidFormatException {
-
-      return ModelUtil.read(in);
-    }
-
-    public void serialize(byte[] artifact, OutputStream out) throws IOException {
-      out.write(artifact);
-    }
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
deleted file mode 100644
index b34ca2b..0000000
--- a/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.tagdict;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import morfologik.stemming.Dictionary;
-import morfologik.stemming.DictionaryLookup;
-import morfologik.stemming.IStemmer;
-import morfologik.stemming.WordData;
-import opennlp.tools.postag.TagDictionary;
-
-/**
- * A POS Tagger dictionary implementation based on Morfologik binary
- * dictionaries
- */
-public class MorfologikTagDictionary implements TagDictionary {
-
- private IStemmer dictLookup;
- private boolean isCaseSensitive;
-
- /**
- * Creates a case sensitive {@link MorfologikTagDictionary}
- *
- * @param dict
- * a Morfologik FSA dictionary
- * @throws IllegalArgumentException
- * if FSA's root node cannot be acquired (dictionary is empty).
- * @throws IOException
- * could not read dictionary from dictURL
- */
- public MorfologikTagDictionary(Dictionary dict)
- throws IllegalArgumentException, IOException {
- this(dict, true);
- }
-
- /**
- * Creates MorfologikLemmatizer
- *
- * @param dict
- * a Morfologik FSA dictionary
- * @param caseSensitive
- * if true it performs case sensitive lookup
- * @throws IllegalArgumentException
- * if FSA's root node cannot be acquired (dictionary is empty).
- * @throws IOException
- * could not read dictionary from dictURL
- */
- public MorfologikTagDictionary(Dictionary dict, boolean caseSensitive)
- throws IllegalArgumentException, IOException {
- this.dictLookup = new DictionaryLookup(dict);
- this.isCaseSensitive = caseSensitive;
- }
-
- @Override
- public String[] getTags(String word) {
- if (!isCaseSensitive) {
- word = word.toLowerCase();
- }
-
- List<WordData> data = dictLookup.lookup(word);
- if (data != null && data.size() > 0) {
- List<String> tags = new ArrayList<String>(data.size());
- for (int i = 0; i < data.size(); i++) {
- tags.add(data.get(i).getTag().toString());
- }
- if (tags.size() > 0)
- return tags.toArray(new String[tags.size()]);
- return null;
- }
- return null;
- }
-}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/util/MorfologikUtil.java b/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
deleted file mode 100644
index bd4d1a4..0000000
--- a/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.util;
-
-import java.io.File;
-
-import morfologik.stemming.DictionaryMetadata;
-
-public class MorfologikUtil {
-
- public static File getExpectedPropertiesFile(File dictFile) {
- return DictionaryMetadata.getExpectedMetadataLocation(dictFile.toPath())
- .toFile();
- }
-
- public static File getExpectedPropertiesFile(String dictFile) {
- File f = new File(dictFile);
- return getExpectedPropertiesFile(f);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/readme/LICENSE
----------------------------------------------------------------------
diff --git a/src/main/readme/LICENSE b/src/main/readme/LICENSE
deleted file mode 100644
index 576b4cf..0000000
--- a/src/main/readme/LICENSE
+++ /dev/null
@@ -1,230 +0,0 @@
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
-The following license applies to the Snowball stemmers:
-
- Copyright (c) 2001, Dr Martin Porter
- Copyright (c) 2002, Richard Boulton
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * Neither the name of the copyright holders nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/readme/MORFOLOGIK-LICENSE
----------------------------------------------------------------------
diff --git a/src/main/readme/MORFOLOGIK-LICENSE b/src/main/readme/MORFOLOGIK-LICENSE
deleted file mode 100644
index 0554010..0000000
--- a/src/main/readme/MORFOLOGIK-LICENSE
+++ /dev/null
@@ -1,28 +0,0 @@
-Copyright (c) 2006 Dawid Weiss
-Copyright (c) 2007-2015 Dawid Weiss, Marcin Mi\u0142kowski
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
- * Neither the name of Morfologik nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/readme/NOTICE
----------------------------------------------------------------------
diff --git a/src/main/readme/NOTICE b/src/main/readme/NOTICE
deleted file mode 100644
index 73fb1d7..0000000
--- a/src/main/readme/NOTICE
+++ /dev/null
@@ -1,11 +0,0 @@
-Apache OpenNLP
-Copyright 2010, 2013 The Apache Software Foundation
-
-This product includes software developed at
-The Apache Software Foundation (http://www.apache.org/).
-
-The snowball stemmers in
-opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball
-were developed by Martin Porter and Richard Boulton.
-The full snowball package is available from
-http://snowball.tartarus.org/
[16/16] opennlp git commit: OPENNLP-622 Added Morfologik to the root
pom.xml / Changed artifact id to opennlp-morfologik-addon.
Posted by co...@apache.org.
OPENNLP-622 Added Morfologik to the root pom.xml / Changed artifact id to opennlp-morfologik-addon.
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/49f8e25a
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/49f8e25a
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/49f8e25a
Branch: refs/heads/trunk
Commit: 49f8e25a1443b7338f8161a2e9c8e333d7a43d2b
Parents: 9b44804
Author: William Colen <wi...@gmail.com>
Authored: Wed Nov 9 19:10:26 2016 -0200
Committer: William Colen <wi...@gmail.com>
Committed: Wed Nov 9 19:10:26 2016 -0200
----------------------------------------------------------------------
opennlp-morfologik-addon/pom.xml | 4 ++--
pom.xml | 1 +
2 files changed, 3 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/49f8e25a/opennlp-morfologik-addon/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/pom.xml b/opennlp-morfologik-addon/pom.xml
index 56d0e47..71d2c53 100644
--- a/opennlp-morfologik-addon/pom.xml
+++ b/opennlp-morfologik-addon/pom.xml
@@ -3,10 +3,10 @@
<modelVersion>4.0.0</modelVersion>
<groupId>org.apache.opennlp</groupId>
- <artifactId>morfologik-addon</artifactId>
+ <artifactId>opennlp-morfologik-addon</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
- <name>Morfologik Addon</name>
+ <name>Apache OpenNLP Morfologik Addon</name>
<url>http://maven.apache.org</url>
<build>
http://git-wip-us.apache.org/repos/asf/opennlp/blob/49f8e25a/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 34e53a1..91e5043 100644
--- a/pom.xml
+++ b/pom.xml
@@ -218,6 +218,7 @@
<module>opennlp-tools</module>
<module>opennlp-uima</module>
<module>opennlp-brat-annotator</module>
+ <module>opennlp-morfologik-addon</module>
<module>opennlp-docs</module>
<module>opennlp-distr</module>
</modules>
[03/16] opennlp git commit: OPENNLP-622 Updated to OpenNLP 1.6.0 and
Morfologik 2.1.0
Posted by co...@apache.org.
OPENNLP-622 Updated to OpenNLP 1.6.0 and Morfologik 2.1.0
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/15c3fb72
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/15c3fb72
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/15c3fb72
Branch: refs/heads/trunk
Commit: 15c3fb720fcde96328e5c20e6a8994b7d4f7abc8
Parents: 78dd579
Author: William Colen <co...@apache.org>
Authored: Wed Jul 6 21:22:38 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Wed Jul 6 21:22:38 2016 +0000
----------------------------------------------------------------------
pom.xml | 6 +-
.../builder/MorfologikDictionayBuilder.java | 52 ++++----
.../java/opennlp/morfologik/cmdline/CLI.java | 128 +++++++++----------
.../MorfologikDictionaryBuilderParams.java | 13 +-
.../MorfologikDictionaryBuilderTool.java | 12 +-
.../tagdict/MorfologikPOSTaggerFactory.java | 8 +-
.../opennlp/morfologik/util/MorfologikUtil.java | 36 ++++++
.../builder/POSDictionayBuilderTest.java | 42 +++---
.../lemmatizer/MorfologikLemmatizerTest.java | 4 +-
.../tagdict/MorfologikTagDictionaryTest.java | 4 +-
10 files changed, 158 insertions(+), 147 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 51854f6..60f201e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -30,20 +30,20 @@
<dependency>
<groupId>org.carrot2</groupId>
<artifactId>morfologik-stemming</artifactId>
- <version>1.6.0</version>
+ <version>2.1.0</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.carrot2</groupId>
<artifactId>morfologik-tools</artifactId>
- <version>1.6.0</version>
+ <version>2.1.0</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
- <version>1.6.0-SNAPSHOT</version>
+ <version>1.6.0</version>
</dependency>
<dependency>
http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
index b8bcfbf..0131318 100644
--- a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
+++ b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
@@ -23,12 +23,14 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;
+import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
-import morfologik.stemming.Dictionary;
-import morfologik.tools.FSABuildTool;
+import morfologik.stemming.DictionaryMetadata;
+import morfologik.stemming.EncoderType;
+import morfologik.tools.FSACompile;
import morfologik.tools.Launcher;
/**
@@ -50,20 +52,20 @@ public class MorfologikDictionayBuilder {
* @param separator
* a field separator, the default is '+'. If your tags contains '+'
* change to something else
- * @param isUsePrefixes
- * if to compact using prefixes
+ * @param encoderType
+ * the Morfologik encoder type
* @param isUseInfixes
* if to compact using infixes
* @throws Exception
*/
public void build(File dictInFile, File dictOutFile, Charset encoding,
- String separator, boolean isUsePrefixes, boolean isUseInfixes)
+ String separator, EncoderType encoderType)
throws Exception {
-
- File propertiesFile = new File(
- Dictionary.getExpectedFeaturesName(dictOutFile.getAbsolutePath()));
- this.build(dictInFile, dictOutFile, propertiesFile, encoding, separator,
- isUsePrefixes, isUseInfixes);
+ Path propertiesPath = DictionaryMetadata
+ .getExpectedMetadataLocation(dictOutFile.toPath());
+
+ this.build(dictInFile, dictOutFile, propertiesPath.toFile(), encoding, separator,
+ encoderType);
}
/**
@@ -87,33 +89,29 @@ public class MorfologikDictionayBuilder {
* @throws Exception
*/
public void build(File dictInFile, File dictOutFile, File propertiesOutFile,
- Charset encoding, String separator, boolean isUsePrefixes,
- boolean isUseInfixes) throws Exception {
+ Charset encoding, String separator, EncoderType encoderType) throws Exception {
// we need to execute tab2morph followed by fsa_build
- File morph = tab2morph(dictInFile, separator, isUsePrefixes, isUseInfixes);
+ File morph = tab2morph(dictInFile, separator, encoderType);
fsaBuild(morph, dictOutFile);
morph.delete();
// now we create the properties files using the passed parameters
- createProperties(encoding, separator, isUsePrefixes, isUseInfixes,
+ createProperties(encoding, separator, encoderType,
propertiesOutFile);
}
void createProperties(Charset encoding, String separator,
- boolean isUsePrefixes, boolean isUseInfixes, File propertiesFile)
+ EncoderType encoderType, File propertiesFile)
throws FileNotFoundException, IOException {
Properties properties = new Properties();
properties.setProperty("fsa.dict.separator", separator);
properties.setProperty("fsa.dict.encoding", encoding.name());
- properties.setProperty("fsa.dict.uses-prefixes",
- Boolean.toString(isUsePrefixes));
- properties.setProperty("fsa.dict.uses-infixes",
- Boolean.toString(isUseInfixes));
+ properties.setProperty("fsa.dict.encoder", encoderType.name());
OutputStream os = new FileOutputStream(propertiesFile);
properties.store(os, "Morfologik POS Dictionary properties");
@@ -124,11 +122,12 @@ public class MorfologikDictionayBuilder {
private void fsaBuild(File morph, File dictOutFile) throws Exception {
String[] params = { "-f", "cfsa2", "-i", morph.getAbsolutePath(), "-o",
dictOutFile.getAbsolutePath() };
- FSABuildTool.main(params);
+ FSACompile.main(params);
+ // FSABuildTool.main(params);
}
private File tab2morph(File dictInFile, String separator,
- boolean isUsePrefixes, boolean isUseInfixes) throws Exception {
+ EncoderType encoderType) throws Exception {
// create tab2morph parameters
List<String> tag2morphParams = new ArrayList<String>();
@@ -136,14 +135,9 @@ public class MorfologikDictionayBuilder {
tag2morphParams.add("--annotation");
tag2morphParams.add(separator);
-
- if (isUsePrefixes) {
- tag2morphParams.add("-pre");
- }
-
- if (isUseInfixes) {
- tag2morphParams.add("-inf");
- }
+
+ tag2morphParams.add("--e");
+ tag2morphParams.add(encoderType.name());
tag2morphParams.add("-i");
tag2morphParams.add(dictInFile.getAbsolutePath());
http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/main/java/opennlp/morfologik/cmdline/CLI.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/CLI.java b/src/main/java/opennlp/morfologik/cmdline/CLI.java
index 66a5151..f92d178 100644
--- a/src/main/java/opennlp/morfologik/cmdline/CLI.java
+++ b/src/main/java/opennlp/morfologik/cmdline/CLI.java
@@ -94,71 +94,71 @@ public final class CLI {
.println("Example: opennlp-morfologik-addon POSDictionaryBuilder help");
}
- public static void main(String[] args) {
- if (args.length == 0) {
- usage();
- System.exit(0);
- }
-
- String toolArguments[] = new String[args.length - 1];
- System.arraycopy(args, 1, toolArguments, 0, toolArguments.length);
-
- String toolName = args[0];
-
- // check for format
- String formatName = StreamFactoryRegistry.DEFAULT_FORMAT;
- int idx = toolName.indexOf(".");
- if (-1 < idx) {
- formatName = toolName.substring(idx + 1);
- toolName = toolName.substring(0, idx);
- }
- CmdLineTool tool = toolLookupMap.get(toolName);
-
- try {
- if (null == tool) {
- throw new TerminateToolException(1, "Tool " + toolName
- + " is not found.");
- }
-
- if ((0 == toolArguments.length && tool.hasParams())
- || 0 < toolArguments.length
- && "help".equals(toolArguments[0])) {
- if (tool instanceof TypedCmdLineTool) {
- System.out.println(((TypedCmdLineTool) tool)
- .getHelp(formatName));
- } else if (tool instanceof BasicCmdLineTool) {
- System.out.println(tool.getHelp());
- }
-
- System.exit(0);
- }
-
- if (tool instanceof TypedCmdLineTool) {
- ((TypedCmdLineTool) tool).run(formatName, toolArguments);
- } else if (tool instanceof BasicCmdLineTool) {
- if (-1 == idx) {
- ((BasicCmdLineTool) tool).run(toolArguments);
- } else {
- throw new TerminateToolException(1, "Tool " + toolName
- + " does not support formats.");
- }
- } else {
- throw new TerminateToolException(1, "Tool " + toolName
- + " is not supported.");
- }
- } catch (TerminateToolException e) {
-
- if (e.getMessage() != null) {
- System.err.println(e.getMessage());
- }
+ @SuppressWarnings("rawtypes")
+ public static void main(String[] args) {
+
+ if (args.length == 0) {
+ usage();
+ System.exit(0);
+ }
+
+ String toolArguments[] = new String[args.length -1];
+ System.arraycopy(args, 1, toolArguments, 0, toolArguments.length);
+
+ String toolName = args[0];
+
+ //check for format
+ String formatName = StreamFactoryRegistry.DEFAULT_FORMAT;
+ int idx = toolName.indexOf(".");
+ if (-1 < idx) {
+ formatName = toolName.substring(idx + 1);
+ toolName = toolName.substring(0, idx);
+ }
+ CmdLineTool tool = toolLookupMap.get(toolName);
+
+ try {
+ if (null == tool) {
+ throw new TerminateToolException(1, "Tool " + toolName + " is not found.");
+ }
+
+ if ((0 == toolArguments.length && tool.hasParams()) ||
+ 0 < toolArguments.length && "help".equals(toolArguments[0])) {
+ if (tool instanceof TypedCmdLineTool) {
+ System.out.println(((TypedCmdLineTool) tool).getHelp(formatName));
+ } else if (tool instanceof BasicCmdLineTool) {
+ System.out.println(tool.getHelp());
+ }
+
+ System.exit(0);
+ }
+
+ if (tool instanceof TypedCmdLineTool) {
+ ((TypedCmdLineTool) tool).run(formatName, toolArguments);
+ } else if (tool instanceof BasicCmdLineTool) {
+ if (-1 == idx) {
+ ((BasicCmdLineTool) tool).run(toolArguments);
+ } else {
+ throw new TerminateToolException(1, "Tool " + toolName + " does not support formats.");
+ }
+ } else {
+ throw new TerminateToolException(1, "Tool " + toolName + " is not supported.");
+ }
+ }
+ catch (TerminateToolException e) {
+
+ if (e.getMessage() != null) {
+ System.err.println(e.getMessage());
+ }
+
+ if (e.getCause() != null) {
+ System.err.println(e.getCause().getMessage());
+ e.getCause().printStackTrace(System.err);
+ }
+
+ System.exit(e.getCode());
+ }
+ }
- if (e.getCause() != null) {
- System.err.println(e.getCause().getMessage());
- e.getCause().printStackTrace(System.err);
- }
- System.exit(e.getCode());
- }
- }
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
index 0b1e896..193599b 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
@@ -19,6 +19,7 @@ package opennlp.morfologik.cmdline.builder;
import java.io.File;
+import morfologik.stemming.EncoderType;
import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.params.EncodingParameter;
@@ -37,13 +38,9 @@ interface MorfologikDictionaryBuilderParams extends EncodingParameter {
@ParameterDescription(valueName = "sep", description = "The FSA dictionary separator. Default is '+'.")
@OptionalParameter(defaultValue = "+")
String getFSADictSeparator();
-
- @ParameterDescription(valueName = "true|false", description = "Compact using prefixes.")
- @OptionalParameter(defaultValue = "true")
- Boolean getUsesPrefixes();
-
- @ParameterDescription(valueName = "true|false", description = "Compact using infixes.")
- @OptionalParameter(defaultValue = "true")
- Boolean getUsesInfixes();
+
+ @ParameterDescription(valueName = "sep", description = "The type of lemma-inflected form encoding compression that precedes automaton construction. Allowed values: [suffix, infix, prefix, none]. Details are in Daciuk's paper and in the code. ")
+ @OptionalParameter(defaultValue = "prefix")
+ EncoderType getEncoderType();
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
index 9da7e7d..741515e 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
@@ -17,10 +17,11 @@
package opennlp.morfologik.cmdline.builder;
+import static opennlp.morfologik.util.MorfologikUtil.getExpectedPropertiesFile;
+
import java.io.File;
import java.nio.charset.Charset;
-import morfologik.stemming.Dictionary;
import opennlp.morfologik.builder.MorfologikDictionayBuilder;
import opennlp.tools.cmdline.BasicCmdLineTool;
import opennlp.tools.cmdline.CmdLineUtil;
@@ -54,18 +55,11 @@ public class MorfologikDictionaryBuilderTool extends BasicCmdLineTool {
MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
try {
builder.build(dictInFile, dictOutFile, propertiesFile, encoding,
- params.getFSADictSeparator(), params.getUsesPrefixes(),
- params.getUsesInfixes());
+ params.getFSADictSeparator(), params.getEncoderType());
} catch (Exception e) {
throw new TerminateToolException(-1,
"Error while creating Morfologik POS Dictionay: " + e.getMessage(), e);
}
}
-
- private File getExpectedPropertiesFile(File dictFile) {
- return new File(Dictionary.getExpectedFeaturesName(dictFile
- .getAbsolutePath()));
- }
-
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
index 9b74ae5..f022a86 100644
--- a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
+++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
@@ -17,6 +17,8 @@
package opennlp.morfologik.tagdict;
+import static opennlp.morfologik.util.MorfologikUtil.getExpectedPropertiesFile;
+
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
@@ -72,8 +74,8 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
// now we try to load it...
try {
this.dictData = Files.readAllBytes(Paths.get(path));
- this.dictInfo = Files.readAllBytes(Paths
- .get(morfologik.stemming.Dictionary.getExpectedFeaturesName(path)));
+ this.dictInfo = Files.readAllBytes(getExpectedPropertiesFile(path)
+ .toPath());
this.dict = createMorfologikDictionary(dictData, dictInfo);
@@ -163,7 +165,7 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
private TagDictionary createMorfologikDictionary(byte[] data, byte[] info)
throws IOException {
morfologik.stemming.Dictionary dict = morfologik.stemming.Dictionary
- .readAndClose(new ByteArrayInputStream(data), new ByteArrayInputStream(
+ .read(new ByteArrayInputStream(data), new ByteArrayInputStream(
info));
return new MorfologikTagDictionary(dict);
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/util/MorfologikUtil.java b/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
new file mode 100644
index 0000000..bd4d1a4
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.util;
+
+import java.io.File;
+
+import morfologik.stemming.DictionaryMetadata;
+
+public class MorfologikUtil {
+
+ public static File getExpectedPropertiesFile(File dictFile) {
+ return DictionaryMetadata.getExpectedMetadataLocation(dictFile.toPath())
+ .toFile();
+ }
+
+ public static File getExpectedPropertiesFile(String dictFile) {
+ File f = new File(dictFile);
+ return getExpectedPropertiesFile(f);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
index 16d1dac..730025c 100644
--- a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
+++ b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
@@ -25,6 +25,7 @@ import java.nio.charset.Charset;
import java.util.Properties;
import junit.framework.TestCase;
+import morfologik.stemming.EncoderType;
import opennlp.morfologik.lemmatizer.MorfologikLemmatizer;
import org.junit.Test;
@@ -40,8 +41,7 @@ public class POSDictionayBuilderTest extends TestCase {
File dictOutFile = File.createTempFile(
POSDictionayBuilderTest.class.getName(), ".dict");
- builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
- true);
+ builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX);
MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI()
.toURL());
@@ -54,40 +54,28 @@ public class POSDictionayBuilderTest extends TestCase {
Charset c = Charset.forName("iso-8859-1");
String sep = "_";
- boolean pref = true;
- boolean inf = true;
- Properties p = createPropertiesHelper(c, sep, pref, inf);
+
+ EncoderType encoderType = EncoderType.PREFIX;
+ Properties p = createPropertiesHelper(c, sep, encoderType);
assertEquals(c.name(), p.getProperty("fsa.dict.encoding"));
assertEquals(sep, p.getProperty("fsa.dict.separator"));
- assertEquals(pref,
- Boolean.parseBoolean(p.getProperty("fsa.dict.uses-prefixes")));
- assertEquals(inf,
- Boolean.parseBoolean(p.getProperty("fsa.dict.uses-infixes")));
-
- pref = false;
- inf = true;
- p = createPropertiesHelper(c, sep, pref, inf);
- assertEquals(pref,
- Boolean.parseBoolean(p.getProperty("fsa.dict.uses-prefixes")));
- assertEquals(inf,
- Boolean.parseBoolean(p.getProperty("fsa.dict.uses-infixes")));
-
- pref = true;
- inf = false;
- p = createPropertiesHelper(c, sep, pref, inf);
- assertEquals(pref,
- Boolean.parseBoolean(p.getProperty("fsa.dict.uses-prefixes")));
- assertEquals(inf,
- Boolean.parseBoolean(p.getProperty("fsa.dict.uses-infixes")));
+ assertEquals(encoderType,
+ EncoderType.valueOf(p.getProperty("fsa.dict.encoder")));
+
+ encoderType = EncoderType.SUFFIX;
+ p = createPropertiesHelper(c, sep, encoderType);
+ assertEquals(encoderType,
+ EncoderType.valueOf(p.getProperty("fsa.dict.encoder")));
+
}
private Properties createPropertiesHelper(Charset c, String sep,
- boolean pref, boolean inf) throws IOException {
+ EncoderType encoderType) throws IOException {
MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
File f = File.createTempFile(POSDictionayBuilderTest.class.getName(),
".info");
- builder.createProperties(c, sep, pref, inf, f);
+ builder.createProperties(c, sep, encoderType, f);
InputStream is = new FileInputStream(f);
http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
index 6fd6ec1..87fc2cc 100644
--- a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
+++ b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
@@ -5,6 +5,7 @@ import static org.junit.Assert.assertEquals;
import java.io.File;
import java.nio.charset.Charset;
+import morfologik.stemming.EncoderType;
import opennlp.morfologik.builder.MorfologikDictionayBuilder;
import opennlp.morfologik.builder.POSDictionayBuilderTest;
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
@@ -34,8 +35,7 @@ public class MorfologikLemmatizerTest {
File dictOutFile = File.createTempFile(
POSDictionayBuilderTest.class.getName(), ".dict");
- builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
- true);
+ builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX);
MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI()
.toURL());
http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
index def97b6..d605e15 100644
--- a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
+++ b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
@@ -9,6 +9,7 @@ import java.util.Arrays;
import java.util.List;
import morfologik.stemming.Dictionary;
+import morfologik.stemming.EncoderType;
import opennlp.morfologik.builder.MorfologikDictionayBuilder;
import opennlp.morfologik.builder.POSDictionayBuilderTest;
import opennlp.morfologik.tagdict.MorfologikTagDictionary;
@@ -80,8 +81,7 @@ public class MorfologikTagDictionaryTest {
File dictOutFile = File.createTempFile(
POSDictionayBuilderTest.class.getName(), ".dict");
- builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
- true);
+ builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX);
MorfologikTagDictionary ml = new MorfologikTagDictionary(
Dictionary.read(dictOutFile.toURI().toURL()), caseSensitive);
[09/16] opennlp git commit: OPENNLP-622 Included transitive
dependencies
Posted by co...@apache.org.
OPENNLP-622 Included transitive dependencies
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/be7e6bab
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/be7e6bab
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/be7e6bab
Branch: refs/heads/trunk
Commit: be7e6bab698d2a6fab35e254cf39970584208361
Parents: 6ada5de
Author: William Colen <co...@apache.org>
Authored: Thu Jul 14 16:27:40 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Thu Jul 14 16:27:40 2016 +0000
----------------------------------------------------------------------
src/main/assembly/bin.xml | 12 ++++++++++++
1 file changed, 12 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/be7e6bab/src/main/assembly/bin.xml
----------------------------------------------------------------------
diff --git a/src/main/assembly/bin.xml b/src/main/assembly/bin.xml
index bbc1607..ab4f6da 100644
--- a/src/main/assembly/bin.xml
+++ b/src/main/assembly/bin.xml
@@ -24,11 +24,23 @@
<formats>
<format>tar.gz</format>
<format>zip</format>
+ <format>dir</format>
</formats>
<includeBaseDirectory>true</includeBaseDirectory>
<baseDirectory>/apache-opennlp-morfologik-addon-${project.version}</baseDirectory>
+ <dependencySets>
+ <dependencySet>
+ <scope>runtime</scope>
+ <unpack>false</unpack>
+ <useProjectArtifact>false</useProjectArtifact>
+ <fileMode>644</fileMode>
+ <directoryMode>755</directoryMode>
+ <outputDirectory>lib</outputDirectory>
+ <useTransitiveDependencies>true</useTransitiveDependencies>
+ </dependencySet>
+ </dependencySets>
<fileSets>
<fileSet>
[12/16] opennlp git commit: OPENNLP-622 Preparing to migrate
morfologik-addon to main repository
Posted by co...@apache.org.
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
deleted file mode 100644
index 0a7ba48..0000000
--- a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.builder;
-
-import java.io.File;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardCopyOption;
-
-import junit.framework.TestCase;
-import morfologik.stemming.DictionaryMetadata;
-import opennlp.morfologik.lemmatizer.MorfologikLemmatizer;
-
-import org.junit.Test;
-
-public class POSDictionayBuilderTest extends TestCase {
-
- @Test
- public void testBuildDictionary() throws Exception {
-
- Path output = createMorfologikDictionary();
-
- MorfologikLemmatizer ml = new MorfologikLemmatizer(output);
-
- assertNotNull(ml);
- }
-
- public static Path createMorfologikDictionary() throws Exception {
- Path tabFilePath = File.createTempFile(
- POSDictionayBuilderTest.class.getName(), ".txt").toPath();
- Path infoFilePath = DictionaryMetadata.getExpectedMetadataLocation(tabFilePath);
-
- Files.copy(POSDictionayBuilderTest.class.getResourceAsStream(
- "/dictionaryWithLemma.txt"), tabFilePath, StandardCopyOption.REPLACE_EXISTING);
- Files.copy(POSDictionayBuilderTest.class.getResourceAsStream(
- "/dictionaryWithLemma.info"), infoFilePath, StandardCopyOption.REPLACE_EXISTING);
-
- MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
-
- return builder.build(tabFilePath);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
deleted file mode 100644
index 6b7525e..0000000
--- a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
+++ /dev/null
@@ -1,35 +0,0 @@
-package opennlp.morfologik.lemmatizer;
-
-import static org.junit.Assert.assertEquals;
-
-import java.nio.file.Path;
-
-import opennlp.morfologik.builder.POSDictionayBuilderTest;
-import opennlp.tools.lemmatizer.DictionaryLemmatizer;
-
-import org.junit.Test;
-
-public class MorfologikLemmatizerTest {
-
- @Test
- public void testLemmatizeInsensitive() throws Exception {
- DictionaryLemmatizer dict = createDictionary(false);
-
- assertEquals("casar", dict.lemmatize("casa", "V"));
- assertEquals("casa", dict.lemmatize("casa", "NOUN"));
-
- assertEquals("casa", dict.lemmatize("Casa", "PROP"));
-
- }
-
- private MorfologikLemmatizer createDictionary(boolean caseSensitive)
- throws Exception {
-
- Path output = POSDictionayBuilderTest.createMorfologikDictionary();
-
- MorfologikLemmatizer ml = new MorfologikLemmatizer(output);
-
- return ml;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
deleted file mode 100644
index c6c9e04..0000000
--- a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
+++ /dev/null
@@ -1,78 +0,0 @@
-package opennlp.morfologik.tagdict;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.util.Arrays;
-import java.util.List;
-
-import morfologik.stemming.Dictionary;
-import opennlp.morfologik.builder.POSDictionayBuilderTest;
-import opennlp.tools.postag.TagDictionary;
-
-import org.junit.Test;
-
-public class MorfologikTagDictionaryTest {
-
- @Test
- public void testNoLemma() throws Exception {
- MorfologikTagDictionary dict = createDictionary(false);
-
- List<String> tags = Arrays.asList(dict.getTags("carro"));
- assertEquals(1, tags.size());
- assertTrue(tags.contains("NOUN"));
-
- }
-
- @Test
- public void testPOSDictionaryInsensitive() throws Exception {
- TagDictionary dict = createDictionary(false);
-
- List<String> tags = Arrays.asList(dict.getTags("casa"));
- assertEquals(2, tags.size());
- assertTrue(tags.contains("NOUN"));
- assertTrue(tags.contains("V"));
-
- // this is the behavior of case insensitive dictionary
- // if we search it using case insensitive, Casa as a proper noun
- // should be lower case in the dictionary
- tags = Arrays.asList(dict.getTags("Casa"));
- assertEquals(2, tags.size());
- assertTrue(tags.contains("NOUN"));
- assertTrue(tags.contains("V"));
-
- }
-
- @Test
- public void testPOSDictionarySensitive() throws Exception {
- TagDictionary dict = createDictionary(true);
-
- List<String> tags = Arrays.asList(dict.getTags("casa"));
- assertEquals(2, tags.size());
- assertTrue(tags.contains("NOUN"));
- assertTrue(tags.contains("V"));
-
- // this is the behavior of case insensitive dictionary
- // if we search it using case insensitive, Casa as a proper noun
- // should be lower case in the dictionary
- tags = Arrays.asList(dict.getTags("Casa"));
- assertEquals(1, tags.size());
- assertTrue(tags.contains("PROP"));
-
- }
-
- private MorfologikTagDictionary createDictionary(boolean caseSensitive)
- throws Exception {
- return this.createDictionary(caseSensitive, null);
- }
-
- private MorfologikTagDictionary createDictionary(boolean caseSensitive,
- List<String> constant) throws Exception {
-
- Dictionary dic = Dictionary.read(POSDictionayBuilderTest.createMorfologikDictionary());
- MorfologikTagDictionary ml = new MorfologikTagDictionary(dic, caseSensitive);
-
- return ml;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
deleted file mode 100644
index 7341a02..0000000
--- a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.tagdict;
-
-import static org.junit.Assert.*;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.file.Path;
-
-import opennlp.morfologik.builder.POSDictionayBuilderTest;
-import opennlp.tools.postag.POSModel;
-import opennlp.tools.postag.POSSample;
-import opennlp.tools.postag.POSTaggerFactory;
-import opennlp.tools.postag.POSTaggerME;
-import opennlp.tools.postag.TagDictionary;
-import opennlp.tools.postag.WordTagSampleStream;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.TrainingParameters;
-import opennlp.tools.util.model.ModelType;
-
-import org.junit.Test;
-
-/**
- * Tests for the {@link POSTaggerFactory} class.
- */
-public class POSTaggerFactoryTest {
-
- private static ObjectStream<POSSample> createSampleStream()
- throws IOException {
- InputStream in = POSTaggerFactoryTest.class.getClassLoader()
- .getResourceAsStream("AnnotatedSentences.txt");
-
- return new WordTagSampleStream((new InputStreamReader(in)));
- }
-
- static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory)
- throws IOException {
- return POSTaggerME.train("en", createSampleStream(),
- TrainingParameters.defaultParams(), factory);
- }
-
- @Test
- public void testPOSTaggerWithCustomFactory() throws Exception {
-
- Path dictionary = POSDictionayBuilderTest.createMorfologikDictionary();
- POSTaggerFactory inFactory = new MorfologikPOSTaggerFactory();
- TagDictionary inDict = inFactory.createTagDictionary(dictionary.toFile());
- inFactory.setTagDictionary(inDict);
-
- POSModel posModel = trainPOSModel(ModelType.MAXENT, inFactory);
-
- POSTaggerFactory factory = posModel.getFactory();
- assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary);
-
- factory = null;
-
- ByteArrayOutputStream out = new ByteArrayOutputStream();
- posModel.serialize(out);
- ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
-
- POSModel fromSerialized = new POSModel(in);
-
- factory = fromSerialized.getFactory();
- assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary);
-
- assertEquals(2, factory.getTagDictionary().getTags("casa").length);
- }
-
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/test/resources/AnnotatedSentences.txt
----------------------------------------------------------------------
diff --git a/src/test/resources/AnnotatedSentences.txt b/src/test/resources/AnnotatedSentences.txt
deleted file mode 100644
index b40be87..0000000
--- a/src/test/resources/AnnotatedSentences.txt
+++ /dev/null
@@ -1,136 +0,0 @@
-Last_JJ September_NNP ,_, I_PRP tried_VBD to_TO find_VB out_RP the_DT address_NN of_IN an_DT old_JJ school_NN friend_NN whom_WP I_PRP had_VBD not_RB seen_VBN for_IN 15_CD years_NNS ._.
-I_PRP just_RB knew_VBD his_PRP$ name_NN ,_, Alan_NNP McKennedy_NNP ,_, and_CC I_PRP 'd_MD heard_VBD the_DT rumour_NN that_IN he_PRP 'd_MD moved_VBD to_TO Scotland_NNP ,_, the_DT country_NN of_IN his_PRP$ ancestors_NNS ._.
-So_IN I_PRP called_VBD Julie_NNP ,_, a_DT friend_NN who's_WDT still_RB in_IN contact_NN with_IN him_PRP ._.
-She_PRP told_VBD me_PRP that_IN he_PRP lived_VBD in_IN 23213_CD Edinburgh_NNP ,_, Worcesterstreet_NNP 12_CD ._.
-I_PRP wrote_VBD him_PRP a_DT letter_NN right_RB away_RB and_CC he_PRP answered_VBD soon_RB ,_, sounding_VBG very_RB happy_JJ and_CC delighted_JJ ._.
-
-Last_JJ year_NN ,_, I_PRP wanted_VBD to_TO write_VB a_DT letter_NN to_TO my_PRP$ grandaunt_NN ._.
-Her_PRP$ 86_CD th_NN birthday_NN was_VBD on_IN October_NNP 6_CD ,_, and_CC I_PRP no_RB longer_RB wanted_VBD to_TO be_VB hesitant_JJ to_TO get_VB in_IN touch_NN with_IN her_PRP ._.
-I_PRP did_VBD not_RB know_VB her_PRP face-to-face_RB ,_, and_CC so_RB it_PRP was_VBD not_RB easy_JJ for_IN me_PRP to_TO find_VB out_RP her_PRP$ address_NN ._.
-As_IN she_PRP had_VBD two_CD apartments_NNS in_IN different_JJ countries_NNS ,_, I_PRP decided_VBD to_TO write_VB to_TO both_DT ._.
-The_DT first_JJ was_VBD in_IN 12424_CD Paris_NNP in_IN Rue-de-Grandes-Illusions_NNP 5_CD ._.
-But_CC Marie_NNP Clara_NNP ,_, as_IN my_PRP$ aunt_NN is_VBZ called_VBN ,_, prefered_VBN her_PRP$ apartment_NN in_IN Berlin_NNP ._.
-It_PRP 's_VBZ postcode_JJ is_VBZ 30202_CD ._.
-She_PRP lived_VBD there_RB ,_, in_IN beautiful_JJ Kaiserstra\ufffde_NNP 13_CD ,_, particulary_NN in_IN summer_NN ._.
-
-Hi_UH my_PRP$ name_NN is_VBZ Stefanie_NNP Schmidt_NNP ,_, how_WRB much_RB is_VBZ a_DT taxi_NN from_IN Ostbahnhof_NNP to_TO Hauptbahnhof_NNP ?_.
-About_IN 10_CD Euro_NNP ,_, I_PRP reckon_VBP ._.
-That_DT sounds_VBZ good_JJ ._.
-So_RB please_VB call_VB a_DT driver_NN to_TO Leonardstra\ufffde_NNP 112_CD ,_, near_IN the_DT Ostbahnhof_NNP in_IN 56473_CD Hamburg_NNP ._.
-I_PRP 'd_MD like_VB to_TO be_VB at_IN Silberhornstra\ufffde_NNP 12_CD as_RB soon_RB as_IN possible_JJ ._.
-Thank_VB you_PRP very_RB much_RB !_.
-
-Hi_NNP Mike_NNP ,_, it_PRP 's_VBZ Stefanie_NNP Schmidt_NNP ._.
-I_PRP 'm_VBP in_IN N\ufffdrnberg_NNP at_IN the_DT moment_NN and_CC I_PRP 've_VBP got_VBD the_DT problem_NN that_IN my_PRP$ bike_NN has_VBZ broken_VBN ._.
-Could_MD you_PRP please_VB pick_VB me_PRP up_RP from_IN Seidlstra\ufffde_NNP 56_CD ,_, I_PRP 'm_VBP in_IN the_DT Caf\ufffd_NNP "Mondnacht"_NNP at_IN the_DT moment_NN ._.
-Please_VB hurry_VB up_RB ,_, I_PRP need_VBP to_TO be_VB back_RB in_IN Ulm_NNP at_IN 8_CD p.m._NN !_.
-
-My_PRP$ husband_NN George_NNP and_CC me_PRP recently_RB celebrated_VBD our_PRP$ 10_CD th_JJ wedding_NN anniversary_NN ._.
-We_PRP got_VBD married_VBN on_IN March_NNP 11_CD ,_, 1995_CD ._.
-Therefore_RB ,_, we_PRP found_VBD a_DT photo_NN album_NN with_IN pictures_NNS of_IN our_PRP$ first_JJ own_JJ apartment_NN ,_, which_WDT was_VBD in_IN 81234_CD Munich_NNP ._.
-As_IN a_DT young_JJ married_JJ couple_NN ,_, we_PRP did_VBD not_RB have_VB enough_JJ money_NN to_TO afford_VB a_DT bigger_JJR lodge_NN than_IN this_DT one_CD in_IN Blumenweg_NNP 1_CD ._.
-But_CC only_RB five_CD years_NNS later_RB ,_, my_PRP$ husband_NN was_VBD offered_VBN a_DT well-payed_JJ job_NN in_IN 17818_CD Hamburg_NNP ,_, so_IN we_PRP moved_VBD there_RB ._.
-Since_IN then_RB ,_, our_PRP$ guests_NNS have_VBP to_TO ring_VB at_IN Veilchenstra\ufffde_NNP 11_CD if_IN they_PRP want_VBP to_TO visit_VB us_PRP ,_, Luise_NNP and_CC George_NNP Bauer_NNP ._.
-
-I_PRP read_VBD your_PRP$ help-wanted_JJ ad_NN with_IN great_JJ attention_NN ._.
-I_PRP 'm_VBP a_DT student_NN of_IN informatics_NNS ,_, 6th_JJ semester,_NN and_CC I_PRP 'm_VBP very_RB interested_VBN in_IN your_PRP$ part-time_JJ job_NN offer_NN ._.
-I_PRP have_VBP a_DT competent_JJ knowledge_NN of_IN programming_NN and_CC foreign_JJ languages_NNS ,_, like_IN French_JJ and_CC Italian_JJ ._.
-I_PRP 'm_VBP looking_VBG forward_RB to_TO your_PRP$ reply_NN ._.
-
-Alisa_NNP Fernandes_NNP ,_, a_DT tourist_NN from_IN Spain_NNP ,_, went_VBD to_TO the_DT reception_NN desk_NN of_IN the_DT famous_JJ Highfly-Hotel_NNP in_IN 30303_CD Berlin_NNP ._.
-As_IN she_PRP felt_VBD quite_RB homesick_JJ ,_, she_PRP asked_VBD the_DT staff_NN if_IN they_PRP knew_VBD a_DT good_JJ Spanish_JJ restaurant_NN in_IN Berlin_NNP ._.
-The_DT concierge_NN told_VBD her_PRP to_TO go_VB to_TO the_DT "Tapasbar"_NN in_IN Chesterstr._NNP 2_CD ._.
-Alisa_NNP appreciated_VBD the_DT hint_NN and_CC enjoyed_VBD a_DT delicious_JJ traditional_JJ meal_NN ._.
-
-An_DT old_JJ friend_NN from_IN France_NNP is_VBZ currently_RB travelling_VBG around_IN Europe_NNP ._.
-Yesterday_NN ,_, she_PRP arrived_VBD in_IN Berlin_NNP and_CC we_PRP met_VBD up_RP spontaneously_RB ._.
-She_PRP wanted_VBD me_PRP to_TO show_VB her_PRP some_DT famous_JJ sights_NNS ,_, like_IN the_DT Brandenburger_NNP Tor_NNP and_CC the_DT Reichstag_NNP ._.
-But_CC it_PRP was_VBD not_RB easy_JJ to_TO meet_VB up_RP in_IN the_DT city_NN because_IN she_PRP hardly_RB knows_VBZ any_DT streetname_NN or_CC building_NN ._.
-So_IN I_PRP proposed_VBD to_TO meet_VB at_IN a_DT quite_RB local_JJ point:_NN the_DT caf\ufffd_NN "Daily's"_NN in_IN Unter-den-Linden_NNP 18,_CD 30291_CD Berlin_NNP ._.
-It_PRP is_VBZ five_CD minutes_NNS away_RB from_IN the_DT underground_JJ station_NN "Westbad"_NN ._.
-She_PRP found_VBD it_PRP instantly_RB and_CC we_PRP spent_VBD a_DT great_JJ day_NN in_IN the_DT capital_NN ._.
-
-Where_WRB did_VBD you_PRP get_VB those_DT great_JJ shoes_NNS ?_.
-They_PRP look_VBP amazing_JJ ,_, I_PRP love_VBP the_DT colour_NN ._.
-Are_VBP they_PRP made_VBN of_IN leather_NN ?_.
-No,_NNP that_DT 's_VBZ faked_VBN ._.
-But_CC anyway_RB ,_, I_PRP like_VBP them_PRP too_RB ._.
-I_PRP got_VBD them_PRP from_IN Hamburg._NNP
-Do_VBP not_RB you_PRP know_VB the_DT famous_JJ shop_NN in_IN Veilchenstra\ufffde_NNP ?_.
-It_PRP 's_VBZ called_VBN "Twentytwo"_NNP ._.
-I_PRP 've_VBP never_RB heard_VBN of_IN that_DT before_RB ._.
-Could_MD you_PRP give_VB me_PRP the_DT complete_JJ address_NN ?_.
-Sure_JJ ,_, it_PRP 's_VBZ in_IN Veilchenstra\ufffde_NNP 12_CD ,_, in_IN 78181_CD Hamburg_NNP ._.
-I_PRP deem_VBP it_PRP best_RB to_TO write_VB a_DT letter_NN to_TO the_DT owner_NN if_IN the_DT shoes_NNS are_VBP still_RB available_JJ ._.
-His_PRP$ name_NN is_VBZ Gerhard_NNP Fritsch_NNP ._.
-
-Hi_UH ,_, am_VBP I_PRP talking_VBG to_TO the_DT inquiries_NNS ?_.
-My_PRP$ name_NN is_VBZ Mike_NNP Sander_NNP and_CC I_PRP 'd_MD like_VB to_TO know_VB if_IN it_PRP is_VBZ possible_JJ to_TO get_VB information_NN about_IN an_DT address_NN if_IN I_PRP merely_RB know_VBP the_DT name_NN and_CC the_DT phone_NN number_NN of_IN a_DT person_NN !_.
-How_WRB is_VBZ he_PRP or_CC she_PRP called_VBD ?_.
-His_PRP$ name_NN is_VBZ Stefan_NNP Miller_NNP and_CC his_PRP$ number_NN is_VBZ the_DT 030/827234_CD ._.
-I'll_NNP have_VBP a_DT look_NN in_IN the_DT computer..._NN
-I_PRP found_VBD a_DT Stefan_NNP Miller_NNP who_WP lives_VBZ in_IN Leipzig._NNP
-Is_VBZ that_DT right_NN ?_.
-Yes_UH ,_, it_PRP definitely_RB is_VBZ ._.
-So_RB Stefan_NNP Miller_NNP lives_VBZ in_IN Heinrich-Heine-Stra\ufffde_NNP 112_CD ,_, in_IN 20193_CD Leipzig_NNP ._.
-Thank_VB you_PRP very_RB much_RB for_IN the_DT information_NN ._.
-Bye_NNP !_.
-
-On_IN July_NNP 14_CD ,_, the_DT father_NN of_IN a_DT family_NN got_VBD painfully_RB injured_VBN after_IN he_PRP had_VBD tried_VBN to_TO start_VB a_DT barbecue_NN ._.
-The_DT flaring_VBG flames_NNS burnt_VBP instantly_RB through_IN his_PRP$ jacket_NN ,_, which_WDT he_PRP managed_VBD to_TO pull_VB off_RP last-minute_JJ ._.
-Although_IN the_DT wounds_NNS were_VBD n't_RB life-threatening_JJ ,_, it_PRP was_VBD urgent_JJ to_TO bring_VB him_PRP directly_RB into_IN ambulance_NN ._.
-But_CC the_DT only_JJ hospital_NN that_WDT had_VBD opened_VBN that_IN Sunday_NNP was_VBD the_DT Paracelsus_NNP Hospital_NNP in_IN 83939_CD Weilheim_NNP ,_, which_WDT was_VBD 2_CD hours_NNS away_RB ._.
-Convulsed_JJ with_IN pain_NN ,_, the_DT man_NN finally_RB arrived_VBD in_IN Stifterstra\ufffde_NNP 15_CD ,_, where_WRB the_DT personal_NN immediately_RB took_VBD care_NN of_IN him_PRP ._.
-
-Last_JJ year_NN ,_, I_PRP worked_VBD as_IN a_DT delivery_NN boy_NN for_IN a_DT small_JJ local_JJ magazine_NN ._.
-I_PRP worked_VBD in_IN the_DT area_NN of_IN 83454_CD Ottobrunn_NNP ._.
-I_PRP had_VBD a_DT list_NN with_IN the_DT home_NN addresses_NNS of_IN our_PRP$ costumers_NNS whom_WP I_PRP brought_VBD their_PRP$ papers_NNS once_RB a_DT week_NN ._.
-An_DT elderly_JJ lady_NN ,_, who_WP was_VBD called_VBN Elenor_NNP Meier_NNP ,_, lived_VBD in_IN G\ufffdrtnerweg_NNP 6_CD ,_, and_CC I_PRP always_RB drove_VBD there_RB first_RB ,_, because_IN I_PRP liked_VBD her_PRP the_DT most_JJS ._.
-Afterwards_RB ,_, I_PRP went_VBD to_TO a_DT student_NN ,_, Gina_NNP Schneider_NNP ,_, who_WP lived_VBD still_RB in_IN her_PRP$ parent's_NNS house_NN in_IN G\ufffdrtnerweg_NNP 25_CD ._.
-The_DT last_JJ in_IN line_NN was_VBD the_DT retired_JJ teacher_NN Bruno_NNP Schulz_NNP in_IN Dramenstra\ufffde_NNP 15_CD ._.
-He_PRP was_VBD friendly_JJ enough_RB to_TO tip_VB sometimes_RB ._.
-
-Our_PRP$ business_NN company_NN was_VBD founded_VBN in_IN 1912_CD by_IN the_DT singer_NN and_CC entertainer_NN Michel_NNP Seile_NNP ._.
-He_PRP opened_VBD the_DT first_JJ agency_NN in_IN Erding_NNP ,_, a_DT small_JJ town_NN near_IN Munich_NNP ._.
-Now_RB ,_, more_JJR than_IN 90_CD years_NNS of_IN turbulent_JJ ups_NNS and_CC downs_NNS later_RB ,_, we_PRP finally_RB decided_VBD to_TO situate_VB our_PRP$ company_NN in_IN a_DT more_JJR central_JJ and_CC frequented_JJ area_NN ._.
-Last_JJ year_NN ,_, we_PRP moved_VBD into_IN an_DT empty_JJ factory_NN building_NN in_IN 30303_CD Berlin_NNP ._.
-It_PRP is_VBZ located_VBN in_IN Barmerstr._NNP 34_CD ._.
-
-When_WRB George_NNP Miller_NNP ,_, a_DT tourist_NN from_IN England_NNP ,_, came_VBD to_TO Munich_NNP ,_, he_PRP had_VBD no_DT idea_NN how_WRB to_TO read_VB the_DT city_NN maps_NNS ._.
-He_PRP depended_VBD completely_RB on_IN the_DT help_NN and_CC information_NN of_IN German_JJ pedestrians_NNS ._.
-One_CD day_NN ,_, he_PRP simply_RB could_MD not_RB find_VB the_DT famous_JJ Lenbachhaus_NNP ._.
-So_RB he_PRP asked_VBD a_DT young_JJ woman_NN for_IN help_NN ._.
-She_PRP pointed_VBD at_IN a_DT street_NN sign_NN and_CC explained_VBD to_TO him_PRP that_IN he_PRP 'd_MD find_VB the_DT Lenbachhaus_NNP in_IN Luisenstraße_NNP 33_CD ,_, which_WDT is_VBZ in_IN 80333_CD Munich_NNP ._.
-Miller_NNP was_VBD very_RB grateful_JJ and_CC could_MD finally_RB enjoy_VB the_DT exhibition_NN ._.
-
-On_IN March_NNP 15_CD ,_, there_EX was_VBD an_DT accident_NN near_IN Munich_NNP ._.
-The_DT driver_NN got_VBD badly_RB injured_VBN ._.
-Driving_VBG alone_RB not_RB far_RB from_IN her_PRP$ home_NN ,_, the_DT middle-aged_JJ woman_NN crashed_VBD at_IN high_JJ speed_NN into_IN a_DT tree_NN ._.
-A_DT resident_NN ,_, who_WP lives_VBZ near_IN the_DT street_NN where_WRB the_DT accident_NN took_VBD place_NN ,_, called_VBN instantly_RB the_DT police_NN ._.
-He_PRP reported_VBD what_WP had_VBD happened_VBN and_CC gave_VBD his_PRP$ name_NN and_CC address_NN to_TO the_DT officer_NN ._.
-He_PRP 's_VBZ called_VBN Peter_NNP Schubert_NNP and_CC he_PRP lives_VBZ at_IN Max-Löw-Straße_NNP 13_CD in_IN 84630_CD Gauting_NNP ._.
-The_DT police_NN arrived_VBD ten_CD minutes_NNS later_RB and_CC brought_VBD the_DT woman_NN into_IN hospital_NN ._.
-Although_IN she_PRP had_VBD multiple_JJ trauma_NN ,_, she_PRP 's_VBZ out_IN of_IN mortal_JJ danger_NN ._.
-
-Hi_NNP ,_, how_WRB are_VBP you_PRP ?_.
-Are_VBP nt't_RB you_PRP a_DT friend_NN of_IN Natalie_NNP ?_.
-Yeah_UH for_IN sure_JJ ._.
-How_WRB did_VBD you_PRP know_VB that_DT ?_.
-I_PRP saw_VBD you_PRP sitting_VBG next_JJ to_TO her_PRP at_IN uni_JJ ._.
-Yeah_NNP she_PRP 's_VBZ my_PRP$ best_JJS friend_NN ._.
-Are_VBP you_PRP going_VBG to_TO her_PRP party_NN next_JJ friday_NN ?_.
-Oh_UH yes_UH ,_, I_PRP 'd_MD really_RB like_VB to_TO ._.
-But_CC in_IN fact_NN I_PRP do_VBP n't_RB know_VB yet_RB where_WRB it_PRP takes_VBZ place_NN ._.
-I_PRP can_MD tell_VB you_PRP :_: ring_NN at_IN Baumann,_NNP Meisenstraße_NNP 5_CD ,_, in_IN 81737_CD Munich_NNP ._.
-The_DT party_NN starts_VBZ at_IN 9_CD p.m._NN ._.
-I_PRP hope_VBP you_PRP 'll_MD find_VB it_PRP ._.
-Thank_VB you_PRP very_RB much_RB ,_, see_VBP you_PRP next_JJ friday_NN !_.
-
-My_PRP$ name_NN is_VBZ Michael_NNP Hinterhofer_NNP ._.
-When_WRB I_PRP was_VBD 21_CD ,_, I_PRP moved_VBD out_RP from_IN my_PRP$ parents_NNS home_NN into_IN my_PRP$ first_JJ own_JJ appartment_NN in_IN order_NN to_TO study_VB in_IN a_DT bigger_JJR city_NN ._.
-My_PRP$ new_JJ home_NN was_VBD in_IN Lilienstraße_NNP 1_CD in_IN 25334_CD Hamburg_NNP ._.
-But_CC I_PRP realized_VBD quickly_RB that_IN life_NN in_IN a_DT metropolis_NN was_VBD n't_RB relaxed_VBN enough_RB for_IN me_PRP ._.
-So_IN I_PRP decided_VBD to_TO move_VB into_IN a_DT smaller_JJR town_NN ._.
-Now_RB I_PRP 'm_VBP a_DT tenant_NN with_IN an_DT elderly_JJ widow_NN ._.
-We_PRP live_VBP in_IN Bürgerstraße_NNP 2_CD in_IN 63737_CD Heidelberg_NNP ._.
-I_PRP really_RB like_IN the_DT smalltown_JJ flair_NN and_CC my_PRP$ studies_NNS at_IN Heidelberg_NNP 's_POS notable_JJ university_NN ._.
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/test/resources/dictionaryWithLemma.info
----------------------------------------------------------------------
diff --git a/src/test/resources/dictionaryWithLemma.info b/src/test/resources/dictionaryWithLemma.info
deleted file mode 100644
index ad5fe8d..0000000
--- a/src/test/resources/dictionaryWithLemma.info
+++ /dev/null
@@ -1,15 +0,0 @@
-#
-# REQUIRED PROPERTIES
-#
-
-# Column (lemma, inflected, tag) separator. This must be a single byte in the target encoding.
-fsa.dict.separator=,
-
-# The charset in which the input is encoded. UTF-8 is strongly recommended.
-fsa.dict.encoding=UTF-8
-
-# The type of lemma-inflected form encoding compression that precedes automaton
-# construction. Allowed values: [suffix, infix, prefix, none].
-# Details are in Daciuk's paper and in the code.
-# Leave at 'prefix' if not sure.
-fsa.dict.encoder=prefix
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/test/resources/dictionaryWithLemma.txt
----------------------------------------------------------------------
diff --git a/src/test/resources/dictionaryWithLemma.txt b/src/test/resources/dictionaryWithLemma.txt
deleted file mode 100644
index 09d39e3..0000000
--- a/src/test/resources/dictionaryWithLemma.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-casa,casa,NOUN
-casar,casa,V
-casar,casar,V-INF
-Casa,Casa,PROP
-casa,casinha,NOUN
-casa,casona,NOUN
-menino,menina,NOUN
-menino,menino,NOUN
-menino,meninão,NOUN
-menino,menininho,NOUN
-carro,carro,NOUN
\ No newline at end of file
[08/16] opennlp git commit: OPENNLP-622 Fixed CLI launcher
Posted by co...@apache.org.
OPENNLP-622 Fixed CLI launcher
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/6ada5de2
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/6ada5de2
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/6ada5de2
Branch: refs/heads/trunk
Commit: 6ada5de24aa39ce90733a477d8a947d2b3b60568
Parents: d1fab8c
Author: William Colen <co...@apache.org>
Authored: Thu Jul 14 16:26:43 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Thu Jul 14 16:26:43 2016 +0000
----------------------------------------------------------------------
src/main/bin/morfologik-addon | 2 +-
src/main/bin/morfologik-addon.bat | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6ada5de2/src/main/bin/morfologik-addon
----------------------------------------------------------------------
diff --git a/src/main/bin/morfologik-addon b/src/main/bin/morfologik-addon
index 70fb1d7..9b0faf9 100755
--- a/src/main/bin/morfologik-addon
+++ b/src/main/bin/morfologik-addon
@@ -32,4 +32,4 @@ fi
# Might fail if $0 is a link
OPENNLP_HOME=`dirname "$0"`/..
-$JAVACMD -Xmx1024m -jar $OPENNLP_HOME/lib/apache-opennlp-morfologik-addon-*.jar $@
+$JAVACMD -Xmx1024m -cp "lib/*" opennlp.morfologik.cmdline.CLI $@
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6ada5de2/src/main/bin/morfologik-addon.bat
----------------------------------------------------------------------
diff --git a/src/main/bin/morfologik-addon.bat b/src/main/bin/morfologik-addon.bat
index a69fbd6..aeec31f 100644
--- a/src/main/bin/morfologik-addon.bat
+++ b/src/main/bin/morfologik-addon.bat
@@ -40,7 +40,7 @@ IF "%OPENNLP_HOME%" == "" (
)
REM # Get the library JAR file name (JIRA OPENNLP-554)
-FOR %%A IN ("%OPENNLP_HOME%\lib\apache-opennlp-morfologik-addon-*.jar") DO SET JAR_FILE=%%A
+FOR %%A IN ("%OPENNLP_HOME%\lib\*.jar") DO SET JAR_FILE=%%A
%JAVA_CMD% -Xmx1024m -jar %JAR_FILE% %*
[06/16] opennlp git commit: OPENNLP-622 Added distribution assembly
files
Posted by co...@apache.org.
OPENNLP-622 Added distribution assembly files
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/f588858a
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/f588858a
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/f588858a
Branch: refs/heads/trunk
Commit: f588858a45c8992330beb171f8da079a0820961b
Parents: 3ceb554
Author: William Colen <co...@apache.org>
Authored: Fri Jul 8 03:53:06 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Fri Jul 8 03:53:06 2016 +0000
----------------------------------------------------------------------
bin/morfologik-addon | 20 +++
bin/morfologik-addon.bat | 21 +++
pom.xml | 150 ++++++++++++++-------
src/main/assembly/bin.xml | 79 +++++++++++
src/main/assembly/src.xml | 39 ++++++
src/main/bin/morfologik-addon | 35 +++++
src/main/bin/morfologik-addon.bat | 47 +++++++
src/main/readme/LICENSE | 230 +++++++++++++++++++++++++++++++++
src/main/readme/NOTICE | 11 ++
9 files changed, 583 insertions(+), 49 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/bin/morfologik-addon
----------------------------------------------------------------------
diff --git a/bin/morfologik-addon b/bin/morfologik-addon
new file mode 100755
index 0000000..ccc635e
--- /dev/null
+++ b/bin/morfologik-addon
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+mvn -e -q exec:java "-Dexec.mainClass=opennlp.morfologik.cmdline.CLI" "-Dexec.args=$*"
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/bin/morfologik-addon.bat
----------------------------------------------------------------------
diff --git a/bin/morfologik-addon.bat b/bin/morfologik-addon.bat
new file mode 100644
index 0000000..26a4778
--- /dev/null
+++ b/bin/morfologik-addon.bat
@@ -0,0 +1,21 @@
+@ECHO OFF
+
+REM # Licensed to the Apache Software Foundation (ASF) under one
+REM # or more contributor license agreements. See the NOTICE file
+REM # distributed with this work for additional information
+REM # regarding copyright ownership. The ASF licenses this file
+REM # to you under the Apache License, Version 2.0 (the
+REM # "License"); you may not use this file except in compliance
+REM # with the License. You may obtain a copy of the License at
+REM #
+REM # http://www.apache.org/licenses/LICENSE-2.0
+REM #
+REM # Unless required by applicable law or agreed to in writing,
+REM # software distributed under the License is distributed on an
+REM #
+REM # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+REM # KIND, either express or implied. See the License for the
+REM # specific language governing permissions and limitations
+REM # under the License.
+
+mvn -e -q exec:java "-Dexec.mainClass=opennlp.morfologik.cmdline.CLI" "-Dexec.args=%*"
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 60f201e..56d0e47 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,57 +1,109 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
- <groupId>org.apache.opennlp</groupId>
- <artifactId>morfologik-addon</artifactId>
- <version>1.0-SNAPSHOT</version>
- <packaging>jar</packaging>
- <name>Morfologik Addon</name>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>morfologik-addon</artifactId>
+ <version>1.0-SNAPSHOT</version>
+ <packaging>jar</packaging>
+ <name>Morfologik Addon</name>
- <url>http://maven.apache.org</url>
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-compiler-plugin</artifactId>
- <version>2.3.2</version>
- <configuration>
- <source>1.7</source>
- <target>1.7</target>
- </configuration>
- </plugin>
- </plugins>
- </build>
- <properties>
- <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
- </properties>
+ <url>http://maven.apache.org</url>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.3.2</version>
+ <configuration>
+ <source>1.7</source>
+ <target>1.7</target>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>bundle-project-sources</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ <configuration>
+ <descriptors>
+ <descriptor>src/main/assembly/bin.xml</descriptor>
+ <descriptor>src/main/assembly/src.xml</descriptor>
+ </descriptors>
+ <!-- Tar package is only compatible with gnu tar,
+ many files have more than 100 chars.
+ Right now only javadoc files are too long.
+ -->
+ <tarLongFileMode>gnu</tarLongFileMode>
+
+ <finalName>apache-opennlp-morfologik-addon-${project.version}</finalName>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-antrun-plugin</artifactId>
+ <version>1.6</version>
+ <executions>
+ <execution>
+ <id>generate checksums for binary artifacts</id>
+ <goals><goal>run</goal></goals>
+ <phase>verify</phase>
+ <configuration>
+ <target>
+ <checksum algorithm="sha1" format="MD5SUM">
+ <fileset dir="${project.build.directory}">
+ <include name="*.zip" />
+ <include name="*.gz" />
+ </fileset>
+ </checksum>
+ <checksum algorithm="md5" format="MD5SUM">
+ <fileset dir="${project.build.directory}">
+ <include name="*.zip" />
+ <include name="*.gz" />
+ </fileset>
+ </checksum>
+ </target>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
- <dependencies>
- <dependency>
- <groupId>org.carrot2</groupId>
- <artifactId>morfologik-stemming</artifactId>
- <version>2.1.0</version>
- <scope>compile</scope>
- </dependency>
- <dependency>
- <groupId>org.carrot2</groupId>
- <artifactId>morfologik-tools</artifactId>
- <version>2.1.0</version>
- <scope>compile</scope>
- </dependency>
+ <dependencies>
+ <dependency>
+ <groupId>org.carrot2</groupId>
+ <artifactId>morfologik-stemming</artifactId>
+ <version>2.1.0</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.carrot2</groupId>
+ <artifactId>morfologik-tools</artifactId>
+ <version>2.1.0</version>
+ <scope>compile</scope>
+ </dependency>
- <dependency>
- <groupId>org.apache.opennlp</groupId>
- <artifactId>opennlp-tools</artifactId>
- <version>1.6.0</version>
- </dependency>
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools</artifactId>
+ <version>1.6.0</version>
+ </dependency>
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>4.8.1</version>
- <scope>test</scope>
- </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.8.1</version>
+ <scope>test</scope>
+ </dependency>
- </dependencies>
+ </dependencies>
</project>
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/src/main/assembly/bin.xml
----------------------------------------------------------------------
diff --git a/src/main/assembly/bin.xml b/src/main/assembly/bin.xml
new file mode 100644
index 0000000..bbc1607
--- /dev/null
+++ b/src/main/assembly/bin.xml
@@ -0,0 +1,79 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<assembly>
+ <id>bin</id>
+ <formats>
+ <format>tar.gz</format>
+ <format>zip</format>
+ </formats>
+
+ <includeBaseDirectory>true</includeBaseDirectory>
+ <baseDirectory>/apache-opennlp-morfologik-addon-${project.version}</baseDirectory>
+
+
+ <fileSets>
+ <fileSet>
+ <directory>src/main/readme</directory>
+ <outputDirectory></outputDirectory>
+ <fileMode>644</fileMode>
+ <directoryMode>755</directoryMode>
+ </fileSet>
+
+ <fileSet>
+ <directory>.</directory>
+ <outputDirectory></outputDirectory>
+ <filtered>true</filtered>
+ <fileMode>644</fileMode>
+ <directoryMode>755</directoryMode>
+ <includes>
+ <include>README</include>
+ <include>RELEASE_NOTES.html</include>
+ </includes>
+ </fileSet>
+
+ <fileSet>
+ <directory>target</directory>
+ <outputDirectory></outputDirectory>
+ <fileMode>644</fileMode>
+ <directoryMode>755</directoryMode>
+ <includes>
+ <include>issuesFixed/**</include>
+ </includes>
+ </fileSet>
+
+ <fileSet>
+ <directory>src/main/bin</directory>
+ <fileMode>755</fileMode>
+ <directoryMode>755</directoryMode>
+ <outputDirectory>bin</outputDirectory>
+ </fileSet>
+
+ <fileSet>
+ <directory>target</directory>
+ <outputDirectory>lib</outputDirectory>
+ <includes>
+ <include>morfologik-addon-*.jar</include>
+ </includes>
+ </fileSet>
+
+ </fileSets>
+</assembly>
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/src/main/assembly/src.xml
----------------------------------------------------------------------
diff --git a/src/main/assembly/src.xml b/src/main/assembly/src.xml
new file mode 100644
index 0000000..cdcc9d3
--- /dev/null
+++ b/src/main/assembly/src.xml
@@ -0,0 +1,39 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<assembly>
+ <id>src</id>
+ <formats>
+ <format>tar.gz</format>
+ <format>zip</format>
+ </formats>
+
+ <baseDirectory>/apache-opennlp-${project.version}-src</baseDirectory>
+
+ <fileSets>
+ <fileSet>
+ <directory>../</directory>
+ <outputDirectory></outputDirectory>
+ <excludes>
+ <exclude>**/target/**</exclude>
+ <exclude>**/.*/**</exclude>
+ <exclude>**/pom.xml.releaseBackup</exclude>
+ <exclude>**/release.properties</exclude>
+ </excludes>
+ </fileSet>
+ </fileSets>
+</assembly>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/src/main/bin/morfologik-addon
----------------------------------------------------------------------
diff --git a/src/main/bin/morfologik-addon b/src/main/bin/morfologik-addon
new file mode 100755
index 0000000..70fb1d7
--- /dev/null
+++ b/src/main/bin/morfologik-addon
@@ -0,0 +1,35 @@
+#!/bin/sh
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Note: Do not output anything in this script file, any output
+# may be inadvertently placed in any output files if
+# output redirection is used.
+
+if [ -z "$JAVACMD" ] ; then
+ if [ -n "$JAVA_HOME" ] ; then
+ JAVACMD="$JAVA_HOME/bin/java"
+ else
+ JAVACMD="`which java`"
+ fi
+fi
+
+# Might fail if $0 is a link
+OPENNLP_HOME=`dirname "$0"`/..
+
+$JAVACMD -Xmx1024m -jar $OPENNLP_HOME/lib/apache-opennlp-morfologik-addon-*.jar $@
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/src/main/bin/morfologik-addon.bat
----------------------------------------------------------------------
diff --git a/src/main/bin/morfologik-addon.bat b/src/main/bin/morfologik-addon.bat
new file mode 100644
index 0000000..a69fbd6
--- /dev/null
+++ b/src/main/bin/morfologik-addon.bat
@@ -0,0 +1,47 @@
+@ECHO off
+
+REM # Licensed to the Apache Software Foundation (ASF) under one
+REM # or more contributor license agreements. See the NOTICE file
+REM # distributed with this work for additional information
+REM # regarding copyright ownership. The ASF licenses this file
+REM # to you under the Apache License, Version 2.0 (the
+REM # "License"); you may not use this file except in compliance
+REM # with the License. You may obtain a copy of the License at
+REM #
+REM # http://www.apache.org/licenses/LICENSE-2.0
+REM #
+REM # Unless required by applicable law or agreed to in writing,
+REM # software distributed under the License is distributed on an
+REM # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+REM # KIND, either express or implied. See the License for the
+REM # specific language governing permissions and limitations
+REM # under the License.
+
+REM # Note: Do not output anything in this script file, any output
+REM # may be inadvertently placed in any output files if
+REM # output redirection is used.
+SETLOCAL
+
+IF "%JAVA_CMD%" == "" (
+ IF "%JAVA_HOME%" == "" (
+ SET JAVA_CMD=java
+ ) ELSE (
+ REM # Keep JAVA_HOME to short-name without spaces
+ FOR %%A IN ("%JAVA_HOME%") DO SET JAVA_CMD=%%~sfA\bin\java
+ )
+)
+
+REM # Should work with Windows XP and greater. If not, specify the path to where it is installed.
+IF "%OPENNLP_HOME%" == "" (
+ SET OPENNLP_HOME=%~sp0..
+) ELSE (
+ REM # Keep OPENNLP_HOME to short-name without spaces
+ FOR %%A IN ("%OPENNLP_HOME%") DO SET OPENNLP_HOME=%%~sfA
+)
+
+REM # Get the library JAR file name (JIRA OPENNLP-554)
+FOR %%A IN ("%OPENNLP_HOME%\lib\apache-opennlp-morfologik-addon-*.jar") DO SET JAR_FILE=%%A
+
+%JAVA_CMD% -Xmx1024m -jar %JAR_FILE% %*
+
+ENDLOCAL
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/src/main/readme/LICENSE
----------------------------------------------------------------------
diff --git a/src/main/readme/LICENSE b/src/main/readme/LICENSE
new file mode 100644
index 0000000..576b4cf
--- /dev/null
+++ b/src/main/readme/LICENSE
@@ -0,0 +1,230 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+The following license applies to the Snowball stemmers:
+
+ Copyright (c) 2001, Dr Martin Porter
+ Copyright (c) 2002, Richard Boulton
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holders nor the names of its contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/src/main/readme/NOTICE
----------------------------------------------------------------------
diff --git a/src/main/readme/NOTICE b/src/main/readme/NOTICE
new file mode 100644
index 0000000..73fb1d7
--- /dev/null
+++ b/src/main/readme/NOTICE
@@ -0,0 +1,11 @@
+Apache OpenNLP
+Copyright 2010, 2013 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+The snowball stemmers in
+opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball
+were developed by Martin Porter and Richard Boulton.
+The full snowball package is available from
+http://snowball.tartarus.org/
[11/16] opennlp git commit: OPENNLP-622 Added Morfologik license
Posted by co...@apache.org.
OPENNLP-622 Added Morfologik license
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/0cced84d
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/0cced84d
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/0cced84d
Branch: refs/heads/trunk
Commit: 0cced84d1e364959616235f87742e28353e81779
Parents: 60a3b24
Author: William Colen <co...@apache.org>
Authored: Thu Jul 14 22:09:05 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Thu Jul 14 22:09:05 2016 +0000
----------------------------------------------------------------------
src/main/readme/MORFOLOGIK-LICENSE | 28 ++++++++++++++++++++++++++++
1 file changed, 28 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/0cced84d/src/main/readme/MORFOLOGIK-LICENSE
----------------------------------------------------------------------
diff --git a/src/main/readme/MORFOLOGIK-LICENSE b/src/main/readme/MORFOLOGIK-LICENSE
new file mode 100644
index 0000000..0554010
--- /dev/null
+++ b/src/main/readme/MORFOLOGIK-LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2006 Dawid Weiss
+Copyright (c) 2007-2015 Dawid Weiss, Marcin Miłkowski
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ * Neither the name of Morfologik nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
[07/16] opennlp git commit: OPENNLP-622 Fixed issues related to
command line.
Posted by co...@apache.org.
OPENNLP-622 Fixed issues related to command line.
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/d1fab8cd
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/d1fab8cd
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/d1fab8cd
Branch: refs/heads/trunk
Commit: d1fab8cd4215ddf65ce98ef6aae2bc06720be742
Parents: f588858
Author: William Colen <co...@apache.org>
Authored: Fri Jul 8 19:18:54 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Fri Jul 8 19:18:54 2016 +0000
----------------------------------------------------------------------
.../builder/XMLDictionaryToTableParams.java | 11 ++++-
.../builder/XMLDictionaryToTableTool.java | 51 ++++++++++++++++++--
.../tagdict/MorfologikPOSTaggerFactory.java | 26 ----------
.../tagdict/POSTaggerFactoryTest.java | 6 ++-
4 files changed, 63 insertions(+), 31 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
index b88cc5d..4ee8cd4 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
@@ -19,6 +19,7 @@ package opennlp.morfologik.cmdline.builder;
import java.io.File;
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.params.EncodingParameter;
@@ -30,7 +31,15 @@ interface XMLDictionaryToTableParams extends EncodingParameter {
@ParameterDescription(valueName = "in", description = "OpenNLP XML Tag Dictionary.")
File getInputFile();
- @ParameterDescription(valueName = "out", description = "Tab separated format.")
+ @ParameterDescription(valueName = "out", description = "Output for Morfologik (.info will be also created).")
File getOutputFile();
+ @ParameterDescription(valueName = "char", description = "Column separator (must be a single character)")
+ @OptionalParameter(defaultValue=",")
+ String getSeparator(); // column separator for the generated table; "," when the option is omitted
+
+ @ParameterDescription(valueName = "value", description = " Type of lemma-inflected form encoding compression that precedes automaton construction. Allowed values: [suffix, infix, prefix, none].")
+ @OptionalParameter(defaultValue="prefix")
+ String getEncoder();
+
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
index c87f016..0e7f2d5 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
@@ -23,8 +23,11 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
+import java.nio.file.Path;
import java.util.Iterator;
+import java.util.Properties;
+import morfologik.stemming.DictionaryMetadata;
import opennlp.tools.cmdline.BasicCmdLineTool;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.TerminateToolException;
@@ -35,6 +38,8 @@ public class XMLDictionaryToTableTool extends BasicCmdLineTool {
interface Params extends XMLDictionaryToTableParams {
}
+ private String SEPARATOR;
+
public String getShortDescription() {
return "reads an OpenNLP XML tag dictionary and outputs it in a tab separated file";
}
@@ -49,6 +54,7 @@ public class XMLDictionaryToTableTool extends BasicCmdLineTool {
File dictInFile = params.getInputFile();
File dictOutFile = params.getOutputFile();
Charset encoding = params.getEncoding();
+ SEPARATOR = params.getSeparator();
CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
@@ -66,17 +72,56 @@ public class XMLDictionaryToTableTool extends BasicCmdLineTool {
encoding)) {
while (iterator.hasNext()) {
String word = iterator.next();
- String wordAndLemma = word + "\t\t"; // lemma is empty
for (String tag : tagDictionary.getTags(word)) {
- writer.write(wordAndLemma + tag);
- writer.newLine();
+ if(valid(word,tag)) {
+ String entry = createEntry(word, tag);
+ writer.write(entry);
+ writer.newLine();
+ }
}
}
writer.close();
+ System.out.println("Created dictionary: " + dictOutFile.toPath());
} catch (IOException e) {
throw new TerminateToolException(-1, "Error while writing output: "
+ e.getMessage(), e);
}
+
+ Properties info = new Properties();
+ info.setProperty("fsa.dict.separator", SEPARATOR);
+ info.setProperty("fsa.dict.encoding", params.getEncoding().name());
+ info.setProperty("fsa.dict.encoder", params.getEncoder());
+
+ Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictOutFile.toPath());
+
+ try {
+ info.store(Files.newOutputStream(metaPath), "Info file for FSA Morfologik dictionary.");
+ } catch (IOException e) {
+ throw new TerminateToolException(-1, "Error while writing metadata output: "
+ + e.getMessage(), e);
+ }
+ System.out.println("Created metadata: " + dictOutFile.toPath());
+
+ }
+
+ private boolean valid(String word, String tag) {
+ if(word.contains(SEPARATOR) || tag.contains(SEPARATOR)) {
+ System.out
+ .println("Warn: invalid entry because contains separator - word: "
+ + word + " tag: " + tag);
+ return false;
+ }
+
+ return true;
+ }
+
+ private String createEntry(String word, String tag) {
+
+ String entry = "" + SEPARATOR +// base
+ word + SEPARATOR +
+ tag;
+
+ return entry;
}
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
index dcb6554..93d6c61 100644
--- a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
+++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
@@ -17,8 +17,6 @@
package opennlp.morfologik.tagdict;
-import static opennlp.morfologik.util.MorfologikUtil.getExpectedPropertiesFile;
-
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
@@ -27,7 +25,6 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.nio.file.Paths;
import java.util.Map;
import morfologik.stemming.DictionaryMetadata;
@@ -81,29 +78,6 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) {
super.init(ngramDictionary, null);
this.dict = posDictionary;
-
- // get the dictionary path
- String path = System.getProperty("morfologik.dict");
- if (path == null) {
- throw new IllegalArgumentException(
- "The property fsa.dict is missing! -Dmorfologik.dict=path");
- }
-
- // now we try to load it...
- try {
- this.dictData = Files.readAllBytes(Paths.get(path));
- this.dictInfo = Files.readAllBytes(getExpectedPropertiesFile(path)
- .toPath());
-
- this.dict = createMorfologikDictionary(dictData, dictInfo);
-
- } catch (IllegalArgumentException e) {
- throw new IllegalArgumentException(
- "The file is not a Morfologik dictionary!", e);
- } catch (IOException e) {
- throw new IllegalArgumentException(
- "Could not open the Morfologik dictionary or the .info file", e);
- }
}
@Override
http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
index 9233979..7341a02 100644
--- a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
+++ b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
@@ -17,7 +17,7 @@
package opennlp.morfologik.tagdict;
-import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.*;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
@@ -71,6 +71,8 @@ public class POSTaggerFactoryTest {
POSTaggerFactory factory = posModel.getFactory();
assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary);
+ factory = null;
+
ByteArrayOutputStream out = new ByteArrayOutputStream();
posModel.serialize(out);
ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
@@ -79,6 +81,8 @@ public class POSTaggerFactoryTest {
factory = fromSerialized.getFactory();
assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary);
+
+ assertEquals(2, factory.getTagDictionary().getTags("casa").length);
}
}
\ No newline at end of file
[10/16] opennlp git commit: OPENNLP-622 Added a different OpenNLP CLI
loader that includes all jars in lib folder to classpath.
Posted by co...@apache.org.
OPENNLP-622 Added a different OpenNLP CLI loader that includes all jars in lib folder to classpath.
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/60a3b24f
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/60a3b24f
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/60a3b24f
Branch: refs/heads/trunk
Commit: 60a3b24f186cc12ee9f053d3530055933bb2a3d9
Parents: be7e6ba
Author: William Colen <co...@apache.org>
Authored: Thu Jul 14 21:36:48 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Thu Jul 14 21:36:48 2016 +0000
----------------------------------------------------------------------
src/main/bin/opennlp-cp | 35 +++++++++++++++++++++++++++++++++++
1 file changed, 35 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/60a3b24f/src/main/bin/opennlp-cp
----------------------------------------------------------------------
diff --git a/src/main/bin/opennlp-cp b/src/main/bin/opennlp-cp
new file mode 100755
index 0000000..dff0d12
--- /dev/null
+++ b/src/main/bin/opennlp-cp
@@ -0,0 +1,35 @@
+#!/bin/sh
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Note: Do not output anything in this script file, any output
+# may be inadvertently placed in any output files if
+# output redirection is used.
+
+if [ -z "$JAVACMD" ] ; then
+ if [ -n "$JAVA_HOME" ] ; then
+ JAVACMD="$JAVA_HOME/bin/java"
+ else
+ JAVACMD="`which java`"
+ fi
+fi
+
+# Resolve the install root from this script's location (might fail if $0 is a link).
+OPENNLP_HOME=`dirname "$0"`/..
+# Load jars from the install's lib folder, not the caller's cwd; "$@" keeps quoted arguments intact.
+"$JAVACMD" -Xmx1024m -cp "$OPENNLP_HOME/lib/*" opennlp.tools.cmdline.CLI "$@"
[05/16] opennlp git commit: OPENNLP-622 Fixed PosTaggerFactory and
restored test.
Posted by co...@apache.org.
OPENNLP-622 Fixed PosTaggerFactory and restored test.
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/3ceb5540
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/3ceb5540
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/3ceb5540
Branch: refs/heads/trunk
Commit: 3ceb5540ced842875c010bb81169afcb544f203e
Parents: 1314887
Author: William Colen <co...@apache.org>
Authored: Fri Jul 8 03:52:14 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Fri Jul 8 03:52:14 2016 +0000
----------------------------------------------------------------------
.../tagdict/MorfologikPOSTaggerFactory.java | 46 +++--
.../tagdict/POSTaggerFactoryTest.java | 192 ++++++++-----------
2 files changed, 106 insertions(+), 132 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ceb5540/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
index 723b1ce..dcb6554 100644
--- a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
+++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
@@ -26,9 +26,11 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
+import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Map;
+import morfologik.stemming.DictionaryMetadata;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.postag.POSTaggerFactory;
import opennlp.tools.postag.TagDictionary;
@@ -53,23 +55,27 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
public MorfologikPOSTaggerFactory() {
}
-
- /**
- * Creates a new {@link POSTaggerFactory} that uses the a Morfologik based {@link TagDictionary}.
- *
- * @param ngramDictionary a ngramDictionary
- * @param morfologikDictionary a Morfologik dictionary
- * @param morfologikDictionaryMetadata the dictionary metadata
- * @throws IOException invalid Morfologik dictionary
- */
- public MorfologikPOSTaggerFactory(Dictionary ngramDictionary,
- byte[] morfologikDictionary, byte[] morfologikDictionaryMetadata) throws IOException {
- super(ngramDictionary, null);
- this.dictData = morfologikDictionary;
- this.dictInfo = morfologikDictionaryMetadata;
+
+ public TagDictionary createTagDictionary(File dictionary)
+ throws InvalidFormatException, FileNotFoundException, IOException {
+ // Loads the Morfologik dictionary plus its metadata file and keeps both byte arrays in this factory.
+ if(!dictionary.canRead()) {
+ throw new FileNotFoundException("Could not read dictionary: " + dictionary.getAbsolutePath());
+ }
+
+ Path dictionaryMeta = DictionaryMetadata.getExpectedMetadataLocation(dictionary.toPath());
+ // Concatenating the Path itself is null-safe; calling getFileName() here threw NPE when it was null.
+ if(dictionaryMeta == null || !dictionaryMeta.toFile().canRead()) {
+ throw new FileNotFoundException("Could not read dictionary metadata: " + dictionaryMeta);
+ }
+
+ this.dictData = Files.readAllBytes(dictionary.toPath());
+ this.dictInfo = Files.readAllBytes(dictionaryMeta);
+
+ return createMorfologikDictionary(dictData, dictInfo);
- this.dict = createMorfologikDictionary(dictData, dictInfo);
}
+
@Override
protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) {
@@ -130,8 +136,7 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
@Override
public void setTagDictionary(TagDictionary dictionary) {
- throw new UnsupportedOperationException(
- "Morfologik POS Tagger factory does not support this operation");
+ this.dict = dictionary;
}
@Override
@@ -141,13 +146,6 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
}
@Override
- public TagDictionary createTagDictionary(File dictionary)
- throws InvalidFormatException, FileNotFoundException, IOException {
- throw new UnsupportedOperationException(
- "Morfologik POS Tagger factory does not support this operation");
- }
-
- @Override
public TagDictionary createTagDictionary(InputStream in)
throws InvalidFormatException, IOException {
throw new UnsupportedOperationException(
http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ceb5540/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
index 6c6814b..9233979 100644
--- a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
+++ b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
@@ -1,108 +1,84 @@
-///*
-// * Licensed to the Apache Software Foundation (ASF) under one or more
-// * contributor license agreements. See the NOTICE file distributed with
-// * this work for additional information regarding copyright ownership.
-// * The ASF licenses this file to You under the Apache License, Version 2.0
-// * (the "License"); you may not use this file except in compliance with
-// * the License. You may obtain a copy of the License at
-// *
-// * http://www.apache.org/licenses/LICENSE-2.0
-// *
-// * Unless required by applicable law or agreed to in writing, software
-// * distributed under the License is distributed on an "AS IS" BASIS,
-// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// * See the License for the specific language governing permissions and
-// * limitations under the License.
-// */
-//
-//package opennlp.morfologik.tagdict;
-//
-//import static org.junit.Assert.assertTrue;
-//
-//import java.io.ByteArrayInputStream;
-//import java.io.ByteArrayOutputStream;
-//import java.io.File;
-//import java.io.IOException;
-//import java.io.InputStream;
-//import java.io.InputStreamReader;
-//import java.nio.charset.Charset;
-//import java.nio.file.Files;
-//import java.nio.file.Path;
-//import java.nio.file.Paths;
-//
-//import morfologik.stemming.DictionaryMetadata;
-//import morfologik.stemming.EncoderType;
-//import opennlp.morfologik.builder.MorfologikDictionayBuilder;
-//import opennlp.morfologik.builder.POSDictionayBuilderTest;
-//import opennlp.tools.dictionary.Dictionary;
-//import opennlp.tools.postag.DefaultPOSSequenceValidator;
-//import opennlp.tools.postag.POSContextGenerator;
-//import opennlp.tools.postag.POSDictionary;
-//import opennlp.tools.postag.POSModel;
-//import opennlp.tools.postag.POSSample;
-//import opennlp.tools.postag.POSTaggerFactory;
-//import opennlp.tools.postag.POSTaggerME;
-//import opennlp.tools.postag.WordTagSampleStream;
-//import opennlp.tools.util.BaseToolFactory;
-//import opennlp.tools.util.InvalidFormatException;
-//import opennlp.tools.util.ObjectStream;
-//import opennlp.tools.util.TrainingParameters;
-//import opennlp.tools.util.model.ModelType;
-//
-//import org.junit.Test;
-//
-///**
-// * Tests for the {@link POSTaggerFactory} class.
-// */
-//public class POSTaggerFactoryTest {
-//
-// private static ObjectStream<POSSample> createSampleStream()
-// throws IOException {
-// InputStream in = POSTaggerFactoryTest.class.getClassLoader()
-// .getResourceAsStream("AnnotatedSentences.txt");
-//
-// return new WordTagSampleStream((new InputStreamReader(in)));
-// }
-//
-// static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory)
-// throws IOException {
-// return POSTaggerME.train("en", createSampleStream(),
-// TrainingParameters.defaultParams(), factory);
-// }
-//
-// @Test
-// public void testPOSTaggerWithCustomFactory() throws Exception {
-//
-// MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
-// File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
-// "/dictionaryWithLemma.txt").getFile());
-//
-// File dictOutFile = File.createTempFile(
-// POSDictionayBuilderTest.class.getName(), ".dict");
-//
-// builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+",
-// EncoderType.PREFIX);
-//
-// Path dictPath = dictOutFile.toPath();
-// Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictPath);
-//
-// byte[] dic = Files.readAllBytes(dictPath);
-// byte[] meta = Files.readAllBytes(metaPath);
-//
-// POSModel posModel = trainPOSModel(ModelType.MAXENT,
-// new MorfologikPOSTaggerFactory(null, dic, meta));
-//
-// POSTaggerFactory factory = posModel.getFactory();
-// assertTrue(factory.getTagDictionary() instanceof MorfologikPOSTaggerFactory);
-//
-// ByteArrayOutputStream out = new ByteArrayOutputStream();
-// posModel.serialize(out);
-// ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
-//
-// POSModel fromSerialized = new POSModel(in);
-//
-// factory = fromSerialized.getFactory();
-// assertTrue(factory.getTagDictionary() instanceof MorfologikPOSTaggerFactory);
-// }
-//
-//}
\ No newline at end of file
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.tagdict;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.file.Path;
+
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.postag.POSTaggerFactory;
+import opennlp.tools.postag.POSTaggerME;
+import opennlp.tools.postag.TagDictionary;
+import opennlp.tools.postag.WordTagSampleStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.model.ModelType;
+
+import org.junit.Test;
+
+/**
+ * Tests for the {@link POSTaggerFactory} class.
+ */
+public class POSTaggerFactoryTest {
+
+ private static ObjectStream<POSSample> createSampleStream()
+ throws IOException {
+ InputStream in = POSTaggerFactoryTest.class.getClassLoader()
+ .getResourceAsStream("AnnotatedSentences.txt");
+
+ return new WordTagSampleStream((new InputStreamReader(in)));
+ }
+
+ static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory)
+ throws IOException {
+ return POSTaggerME.train("en", createSampleStream(),
+ TrainingParameters.defaultParams(), factory);
+ }
+
+ @Test
+ public void testPOSTaggerWithCustomFactory() throws Exception {
+
+ Path dictionary = POSDictionayBuilderTest.createMorfologikDictionary();
+ POSTaggerFactory inFactory = new MorfologikPOSTaggerFactory();
+ TagDictionary inDict = inFactory.createTagDictionary(dictionary.toFile());
+ inFactory.setTagDictionary(inDict);
+
+ POSModel posModel = trainPOSModel(ModelType.MAXENT, inFactory);
+
+ POSTaggerFactory factory = posModel.getFactory();
+ assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary);
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ posModel.serialize(out);
+ ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
+
+ POSModel fromSerialized = new POSModel(in);
+
+ factory = fromSerialized.getFactory();
+ assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary);
+ }
+
+}
\ No newline at end of file