You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2013/11/14 22:24:13 UTC
svn commit: r1542089 - in /opennlp/addons: ./ morfologik-addon/
morfologik-addon/src/ morfologik-addon/src/main/
morfologik-addon/src/main/java/ morfologik-addon/src/main/java/opennlp/
morfologik-addon/src/main/java/opennlp/morfologik/ morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/
Author: joern
Date: Thu Nov 14 21:24:13 2013
New Revision: 1542089
URL: http://svn.apache.org/r1542089
Log:
OPENNLP-582 Added morfologik addon. Thanks to Rodrigo Agerri for providing a patch.
Added:
opennlp/addons/
opennlp/addons/morfologik-addon/
opennlp/addons/morfologik-addon/pom.xml
opennlp/addons/morfologik-addon/src/
opennlp/addons/morfologik-addon/src/main/
opennlp/addons/morfologik-addon/src/main/java/
opennlp/addons/morfologik-addon/src/main/java/opennlp/
opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/
opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/
opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
Added: opennlp/addons/morfologik-addon/pom.xml
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/pom.xml?rev=1542089&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/pom.xml (added)
+++ opennlp/addons/morfologik-addon/pom.xml Thu Nov 14 21:24:13 2013
@@ -0,0 +1,50 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <!-- Build descriptor for the Morfologik addon; packaged as a jar, no parent POM is declared. -->
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>morfologik-addon</artifactId>
+ <version>1.0-SNAPSHOT</version>
+ <packaging>jar</packaging>
+ <name>Morfologik Addon</name>
+
+ <url>http://maven.apache.org</url>
+ <build>
+ <plugins>
+ <!-- Compile sources at Java 1.7 language level and target 1.7 bytecode. -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.3.2</version>
+ <configuration>
+ <source>1.7</source>
+ <target>1.7</target>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+ <properties>
+ <!-- Source files are treated as UTF-8 by the build. -->
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <dependencies>
+ <!-- Morfologik stemming library, needed at compile time. -->
+ <dependency>
+ <groupId>org.carrot2</groupId>
+ <artifactId>morfologik-stemming</artifactId>
+ <version>1.6.0</version>
+ <scope>compile</scope>
+ </dependency>
+
+ <!-- OpenNLP tools (snapshot version); no scope given, so the default scope applies. -->
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools</artifactId>
+ <version>1.6.0-SNAPSHOT</version>
+ </dependency>
+
+ <!-- JUnit, available to tests only. -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>3.8.1</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+</project>
Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java?rev=1542089&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java Thu Nov 14 21:24:13 2013
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.lemmatizer;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.IStemmer;
+import morfologik.stemming.WordData;
+
+public class MorfologikLemmatizer implements DictionaryLemmatizer {
+
+ private IStemmer dictLookup;
+ public final Set<String> constantTags = new HashSet<String>(Arrays.asList(
+ "NNP", "NP00000"));
+
+ public MorfologikLemmatizer(URL dictURL) throws IllegalArgumentException,
+ IOException {
+ dictLookup = new DictionaryLookup(Dictionary.read(dictURL));
+ }
+
+ private HashMap<List<String>, String> getLemmaTagsDict(String word) {
+ List<WordData> wdList = dictLookup.lookup(word);
+ HashMap<List<String>, String> dictMap = new HashMap<List<String>, String>();
+ for (WordData wd : wdList) {
+ List<String> wordLemmaTags = new ArrayList<String>();
+ wordLemmaTags.add(word);
+ wordLemmaTags.add(wd.getTag().toString());
+ dictMap.put(wordLemmaTags, wd.getStem().toString());
+ }
+ return dictMap;
+ }
+
+ private List<String> getDictKeys(String word, String postag) {
+ List<String> keys = new ArrayList<String>();
+ if (constantTags.contains(postag)) {
+ keys.addAll(Arrays.asList(word, postag));
+ } else {
+ keys.addAll(Arrays.asList(word.toLowerCase(), postag));
+ }
+ return keys;
+ }
+
+ private HashMap<List<String>, String> getDictMap(String word, String postag) {
+ HashMap<List<String>, String> dictMap = new HashMap<List<String>, String>();
+
+ if (constantTags.contains(postag)) {
+ dictMap = this.getLemmaTagsDict(word);
+ } else {
+ dictMap = this.getLemmaTagsDict(word.toLowerCase());
+ }
+ return dictMap;
+ }
+
+ public String lemmatize(String word, String postag) {
+ String lemma = null;
+ List<String> keys = this.getDictKeys(word, postag);
+ HashMap<List<String>, String> dictMap = this.getDictMap(word, postag);
+ // lookup lemma as value of the map
+ String keyValue = dictMap.get(keys);
+ if (keyValue != null) {
+ lemma = keyValue;
+ } else if (keyValue == null && constantTags.contains(postag)) {
+ lemma = word;
+ } else if (keyValue == null && word.toUpperCase() == word) {
+ lemma = word;
+ } else {
+ lemma = word.toLowerCase();
+ }
+ return lemma;
+ }
+}