You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2013/11/14 22:24:13 UTC

svn commit: r1542089 - in /opennlp/addons: ./ morfologik-addon/ morfologik-addon/src/ morfologik-addon/src/main/ morfologik-addon/src/main/java/ morfologik-addon/src/main/java/opennlp/ morfologik-addon/src/main/java/opennlp/morfologik/ morfologik-addon...

Author: joern
Date: Thu Nov 14 21:24:13 2013
New Revision: 1542089

URL: http://svn.apache.org/r1542089
Log:
OPENNLP-582 Added morfologik addon. Thanks to Rodrigo Agerri for providing a patch.

Added:
    opennlp/addons/
    opennlp/addons/morfologik-addon/
    opennlp/addons/morfologik-addon/pom.xml
    opennlp/addons/morfologik-addon/src/
    opennlp/addons/morfologik-addon/src/main/
    opennlp/addons/morfologik-addon/src/main/java/
    opennlp/addons/morfologik-addon/src/main/java/opennlp/
    opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/
    opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/
    opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java

Added: opennlp/addons/morfologik-addon/pom.xml
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/pom.xml?rev=1542089&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/pom.xml (added)
+++ opennlp/addons/morfologik-addon/pom.xml Thu Nov 14 21:24:13 2013
@@ -0,0 +1,50 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <groupId>org.apache.opennlp</groupId>
+  <artifactId>morfologik-addon</artifactId>
+  <version>1.0-SNAPSHOT</version>
+  <packaging>jar</packaging>
+  <name>Morfologik Addon</name>
+
+  <url>http://maven.apache.org</url>
+  <build> <!-- compiler plugin pinned to the 1.7 source/target level -->
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>2.3.2</version>
+        <configuration>
+          <source>1.7</source>
+          <target>1.7</target>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+  </properties>
+
+  <dependencies> <!-- JUnit is test-scoped; the other two are on the compile classpath -->
+    <dependency>
+      <groupId>org.carrot2</groupId>
+      <artifactId>morfologik-stemming</artifactId>
+      <version>1.6.0</version>
+      <scope>compile</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.opennlp</groupId>
+      <artifactId>opennlp-tools</artifactId>
+      <version>1.6.0-SNAPSHOT</version>
+    </dependency>
+
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>3.8.1</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+</project>

Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java?rev=1542089&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java Thu Nov 14 21:24:13 2013
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.lemmatizer;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.IStemmer;
+import morfologik.stemming.WordData;
+
+/**
+ * Lemmatizer backed by a Morfologik dictionary: returns the stem stored for a
+ * (word, POS tag) pair and falls back to the word itself when none is found.
+ */
+public class MorfologikLemmatizer implements DictionaryLemmatizer {
+
+  private final IStemmer dictLookup;
+  /** Tags whose words are looked up, and returned, with their case preserved. */
+  public final Set<String> constantTags = new HashSet<String>(Arrays.asList(
+      "NNP", "NP00000"));
+
+  /**
+   * @param dictURL location of the Morfologik dictionary to load
+   * @throws IOException if reading the dictionary fails
+   */
+  public MorfologikLemmatizer(URL dictURL) throws IllegalArgumentException,
+      IOException {
+    dictLookup = new DictionaryLookup(Dictionary.read(dictURL));
+  }
+
+  /** Form used for lookup: unchanged for constant tags, lower-cased otherwise. */
+  private String lookupForm(String word, String postag) {
+    return constantTags.contains(postag) ? word : word.toLowerCase();
+  }
+
+  /** Maps every [word, tag] pair the dictionary returns for {@code word} to its stem. */
+  private HashMap<List<String>, String> getLemmaTagsDict(String word) {
+    HashMap<List<String>, String> dictMap = new HashMap<List<String>, String>();
+    for (WordData wd : dictLookup.lookup(word)) {
+      CharSequence tag = wd.getTag();
+      String tagText = (tag == null) ? null : tag.toString();
+      CharSequence stem = wd.getStem();
+      // NOTE(review): presumably null when an entry lacks tag/stem data; skip it.
+      if (tagText == null || stem == null) {
+        continue;
+      }
+      dictMap.put(Arrays.asList(word, tagText), stem.toString());
+    }
+    return dictMap;
+  }
+
+  /**
+   * Returns the lemma of {@code word} for the given POS tag.
+   * @param word the surface form to lemmatize
+   * @param postag the POS tag assigned to the word
+   * @return the dictionary stem when a [word, postag] entry exists; otherwise
+   *         the word unchanged if the tag is constant or the word is entirely
+   *         upper case, else the lower-cased word
+   */
+  public String lemmatize(String word, String postag) {
+    String form = lookupForm(word, postag);
+    String lemma = getLemmaTagsDict(form).get(Arrays.asList(form, postag));
+    if (lemma != null) {
+      return lemma;
+    }
+    // equals(), not ==: whether toUpperCase() returns the same instance is unspecified.
+    if (constantTags.contains(postag) || word.toUpperCase().equals(word)) {
+      return word;
+    }
+    return word.toLowerCase();
+  }
+}