You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/05/03 10:16:09 UTC
opennlp git commit: OPENNLP-1048: Add stemmer for Irish
Repository: opennlp
Updated Branches:
refs/heads/master caeaaeea6 -> 6c2dbf288
OPENNLP-1048: Add stemmer for Irish
Closes #189
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/6c2dbf28
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/6c2dbf28
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/6c2dbf28
Branch: refs/heads/master
Commit: 6c2dbf2885fb4602b8e42bd208ebef66df23329b
Parents: caeaaee
Author: Jim O'Regan <ja...@tcd.ie>
Authored: Sat Apr 29 00:15:29 2017 +0100
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 3 12:15:27 2017 +0200
----------------------------------------------------------------------
.../tools/stemmer/snowball/SnowballStemmer.java | 4 +
.../tools/stemmer/snowball/irishStemmer.java | 616 +++++++++++++++++++
.../tools/stemmer/SnowballStemmerTest.java | 9 +
3 files changed, 629 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6c2dbf28/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/SnowballStemmer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/SnowballStemmer.java b/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/SnowballStemmer.java
index dd75754..86ebe84 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/SnowballStemmer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/SnowballStemmer.java
@@ -29,6 +29,7 @@ public class SnowballStemmer implements Stemmer {
FRENCH,
GERMAN,
HUNGARIAN,
+ IRISH,
ITALIAN,
NORWEGIAN,
PORTER,
@@ -67,6 +68,9 @@ public class SnowballStemmer implements Stemmer {
else if (ALGORITHM.HUNGARIAN.equals(algorithm)) {
stemmer = new hungarianStemmer();
}
+ else if (ALGORITHM.IRISH.equals(algorithm)) {
+ stemmer = new irishStemmer();
+ }
else if (ALGORITHM.ITALIAN.equals(algorithm)) {
stemmer = new italianStemmer();
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6c2dbf28/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/irishStemmer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/irishStemmer.java b/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/irishStemmer.java
new file mode 100644
index 0000000..316288f
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/irishStemmer.java
@@ -0,0 +1,616 @@
+// CHECKSTYLE:OFF
+/*
+
+Copyright (c) 2001, Dr Martin Porter
+Copyright (c) 2002, Richard Boulton
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holders nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ */
+
+// This file was generated automatically by the Snowball to Java compiler
+
+package opennlp.tools.stemmer.snowball;
+
+ /**
+ * This class was automatically generated by a Snowball to Java compiler
+ * It implements the stemming algorithm defined by a snowball script.
+ */
+
+public class irishStemmer extends opennlp.tools.stemmer.snowball.AbstractSnowballStemmer {
+
+private static final long serialVersionUID = 1L;
+
+ private final static irishStemmer methodObject = new irishStemmer ();
+
+ private final static Among a_0[] = {
+ new Among ( "b'", -1, 4, "", methodObject ),
+ new Among ( "bh", -1, 14, "", methodObject ),
+ new Among ( "bhf", 1, 9, "", methodObject ),
+ new Among ( "bp", -1, 11, "", methodObject ),
+ new Among ( "ch", -1, 15, "", methodObject ),
+ new Among ( "d'", -1, 2, "", methodObject ),
+ new Among ( "d'fh", 5, 3, "", methodObject ),
+ new Among ( "dh", -1, 16, "", methodObject ),
+ new Among ( "dt", -1, 13, "", methodObject ),
+ new Among ( "fh", -1, 17, "", methodObject ),
+ new Among ( "gc", -1, 7, "", methodObject ),
+ new Among ( "gh", -1, 18, "", methodObject ),
+ new Among ( "h-", -1, 1, "", methodObject ),
+ new Among ( "m'", -1, 4, "", methodObject ),
+ new Among ( "mb", -1, 6, "", methodObject ),
+ new Among ( "mh", -1, 19, "", methodObject ),
+ new Among ( "n-", -1, 1, "", methodObject ),
+ new Among ( "nd", -1, 8, "", methodObject ),
+ new Among ( "ng", -1, 10, "", methodObject ),
+ new Among ( "ph", -1, 20, "", methodObject ),
+ new Among ( "sh", -1, 5, "", methodObject ),
+ new Among ( "t-", -1, 1, "", methodObject ),
+ new Among ( "th", -1, 21, "", methodObject ),
+ new Among ( "ts", -1, 12, "", methodObject )
+ };
+
+ private final static Among a_1[] = {
+ new Among ( "\u00EDochta", -1, 1, "", methodObject ),
+ new Among ( "a\u00EDochta", 0, 1, "", methodObject ),
+ new Among ( "ire", -1, 2, "", methodObject ),
+ new Among ( "aire", 2, 2, "", methodObject ),
+ new Among ( "abh", -1, 1, "", methodObject ),
+ new Among ( "eabh", 4, 1, "", methodObject ),
+ new Among ( "ibh", -1, 1, "", methodObject ),
+ new Among ( "aibh", 6, 1, "", methodObject ),
+ new Among ( "amh", -1, 1, "", methodObject ),
+ new Among ( "eamh", 8, 1, "", methodObject ),
+ new Among ( "imh", -1, 1, "", methodObject ),
+ new Among ( "aimh", 10, 1, "", methodObject ),
+ new Among ( "\u00EDocht", -1, 1, "", methodObject ),
+ new Among ( "a\u00EDocht", 12, 1, "", methodObject ),
+ new Among ( "ir\u00ED", -1, 2, "", methodObject ),
+ new Among ( "air\u00ED", 14, 2, "", methodObject )
+ };
+
+ private final static Among a_2[] = {
+ new Among ( "\u00F3ideacha", -1, 6, "", methodObject ),
+ new Among ( "patacha", -1, 5, "", methodObject ),
+ new Among ( "achta", -1, 1, "", methodObject ),
+ new Among ( "arcachta", 2, 2, "", methodObject ),
+ new Among ( "eachta", 2, 1, "", methodObject ),
+ new Among ( "grafa\u00EDochta", -1, 4, "", methodObject ),
+ new Among ( "paite", -1, 5, "", methodObject ),
+ new Among ( "ach", -1, 1, "", methodObject ),
+ new Among ( "each", 7, 1, "", methodObject ),
+ new Among ( "\u00F3ideach", 8, 6, "", methodObject ),
+ new Among ( "gineach", 8, 3, "", methodObject ),
+ new Among ( "patach", 7, 5, "", methodObject ),
+ new Among ( "grafa\u00EDoch", -1, 4, "", methodObject ),
+ new Among ( "pataigh", -1, 5, "", methodObject ),
+ new Among ( "\u00F3idigh", -1, 6, "", methodObject ),
+ new Among ( "acht\u00FAil", -1, 1, "", methodObject ),
+ new Among ( "eacht\u00FAil", 15, 1, "", methodObject ),
+ new Among ( "gineas", -1, 3, "", methodObject ),
+ new Among ( "ginis", -1, 3, "", methodObject ),
+ new Among ( "acht", -1, 1, "", methodObject ),
+ new Among ( "arcacht", 19, 2, "", methodObject ),
+ new Among ( "eacht", 19, 1, "", methodObject ),
+ new Among ( "grafa\u00EDocht", -1, 4, "", methodObject ),
+ new Among ( "arcachta\u00ED", -1, 2, "", methodObject ),
+ new Among ( "grafa\u00EDochta\u00ED", -1, 4, "", methodObject )
+ };
+
+ private final static Among a_3[] = {
+ new Among ( "imid", -1, 1, "", methodObject ),
+ new Among ( "aimid", 0, 1, "", methodObject ),
+ new Among ( "\u00EDmid", -1, 1, "", methodObject ),
+ new Among ( "a\u00EDmid", 2, 1, "", methodObject ),
+ new Among ( "adh", -1, 2, "", methodObject ),
+ new Among ( "eadh", 4, 2, "", methodObject ),
+ new Among ( "faidh", -1, 1, "", methodObject ),
+ new Among ( "fidh", -1, 1, "", methodObject ),
+ new Among ( "\u00E1il", -1, 2, "", methodObject ),
+ new Among ( "ain", -1, 2, "", methodObject ),
+ new Among ( "tear", -1, 2, "", methodObject ),
+ new Among ( "tar", -1, 2, "", methodObject )
+ };
+
+ private static final char g_v[] = {17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 17, 4, 2 };
+
+ private int I_p2;
+ private int I_p1;
+ private int I_pV;
+
+ private void copy_from(irishStemmer other) {
+ I_p2 = other.I_p2;
+ I_p1 = other.I_p1;
+ I_pV = other.I_pV;
+ super.copy_from(other);
+ }
+
+ private boolean r_mark_regions() {
+ int v_1;
+ int v_3;
+ // (, line 28
+ I_pV = limit;
+ I_p1 = limit;
+ I_p2 = limit;
+ // do, line 34
+ v_1 = cursor;
+ lab0: do {
+ // (, line 34
+ // gopast, line 35
+ golab1: while(true)
+ {
+ lab2: do {
+ if (!(in_grouping(g_v, 97, 250)))
+ {
+ break lab2;
+ }
+ break golab1;
+ } while (false);
+ if (cursor >= limit)
+ {
+ break lab0;
+ }
+ cursor++;
+ }
+ // setmark pV, line 35
+ I_pV = cursor;
+ } while (false);
+ cursor = v_1;
+ // do, line 37
+ v_3 = cursor;
+ lab3: do {
+ // (, line 37
+ // gopast, line 38
+ golab4: while(true)
+ {
+ lab5: do {
+ if (!(in_grouping(g_v, 97, 250)))
+ {
+ break lab5;
+ }
+ break golab4;
+ } while (false);
+ if (cursor >= limit)
+ {
+ break lab3;
+ }
+ cursor++;
+ }
+ // gopast, line 38
+ golab6: while(true)
+ {
+ lab7: do {
+ if (!(out_grouping(g_v, 97, 250)))
+ {
+ break lab7;
+ }
+ break golab6;
+ } while (false);
+ if (cursor >= limit)
+ {
+ break lab3;
+ }
+ cursor++;
+ }
+ // setmark p1, line 38
+ I_p1 = cursor;
+ // gopast, line 39
+ golab8: while(true)
+ {
+ lab9: do {
+ if (!(in_grouping(g_v, 97, 250)))
+ {
+ break lab9;
+ }
+ break golab8;
+ } while (false);
+ if (cursor >= limit)
+ {
+ break lab3;
+ }
+ cursor++;
+ }
+ // gopast, line 39
+ golab10: while(true)
+ {
+ lab11: do {
+ if (!(out_grouping(g_v, 97, 250)))
+ {
+ break lab11;
+ }
+ break golab10;
+ } while (false);
+ if (cursor >= limit)
+ {
+ break lab3;
+ }
+ cursor++;
+ }
+ // setmark p2, line 39
+ I_p2 = cursor;
+ } while (false);
+ cursor = v_3;
+ return true;
+ }
+
+ private boolean r_initial_morph() {
+ int among_var;
+ // (, line 43
+ // [, line 44
+ bra = cursor;
+ // substring, line 44
+ among_var = find_among(a_0, 24);
+ if (among_var == 0)
+ {
+ return false;
+ }
+ // ], line 44
+ ket = cursor;
+ switch (among_var) {
+ case 0:
+ return false;
+ case 1:
+ // (, line 46
+ // delete, line 46
+ slice_del();
+ break;
+ case 2:
+ // (, line 50
+ // delete, line 50
+ slice_del();
+ break;
+ case 3:
+ // (, line 52
+ // <-, line 52
+ slice_from("f");
+ break;
+ case 4:
+ // (, line 55
+ // delete, line 55
+ slice_del();
+ break;
+ case 5:
+ // (, line 58
+ // <-, line 58
+ slice_from("s");
+ break;
+ case 6:
+ // (, line 61
+ // <-, line 61
+ slice_from("b");
+ break;
+ case 7:
+ // (, line 63
+ // <-, line 63
+ slice_from("c");
+ break;
+ case 8:
+ // (, line 65
+ // <-, line 65
+ slice_from("d");
+ break;
+ case 9:
+ // (, line 67
+ // <-, line 67
+ slice_from("f");
+ break;
+ case 10:
+ // (, line 69
+ // <-, line 69
+ slice_from("g");
+ break;
+ case 11:
+ // (, line 71
+ // <-, line 71
+ slice_from("p");
+ break;
+ case 12:
+ // (, line 73
+ // <-, line 73
+ slice_from("s");
+ break;
+ case 13:
+ // (, line 75
+ // <-, line 75
+ slice_from("t");
+ break;
+ case 14:
+ // (, line 79
+ // <-, line 79
+ slice_from("b");
+ break;
+ case 15:
+ // (, line 81
+ // <-, line 81
+ slice_from("c");
+ break;
+ case 16:
+ // (, line 83
+ // <-, line 83
+ slice_from("d");
+ break;
+ case 17:
+ // (, line 85
+ // <-, line 85
+ slice_from("f");
+ break;
+ case 18:
+ // (, line 87
+ // <-, line 87
+ slice_from("g");
+ break;
+ case 19:
+ // (, line 89
+ // <-, line 89
+ slice_from("m");
+ break;
+ case 20:
+ // (, line 91
+ // <-, line 91
+ slice_from("p");
+ break;
+ case 21:
+ // (, line 93
+ // <-, line 93
+ slice_from("t");
+ break;
+ }
+ return true;
+ }
+
+ private boolean r_RV() {
+ if (!(I_pV <= cursor))
+ {
+ return false;
+ }
+ return true;
+ }
+
+ private boolean r_R1() {
+ if (!(I_p1 <= cursor))
+ {
+ return false;
+ }
+ return true;
+ }
+
+ private boolean r_R2() {
+ if (!(I_p2 <= cursor))
+ {
+ return false;
+ }
+ return true;
+ }
+
+ private boolean r_noun_sfx() {
+ int among_var;
+ // (, line 103
+ // [, line 104
+ ket = cursor;
+ // substring, line 104
+ among_var = find_among_b(a_1, 16);
+ if (among_var == 0)
+ {
+ return false;
+ }
+ // ], line 104
+ bra = cursor;
+ switch (among_var) {
+ case 0:
+ return false;
+ case 1:
+ // (, line 108
+ // call R1, line 108
+ if (!r_R1())
+ {
+ return false;
+ }
+ // delete, line 108
+ slice_del();
+ break;
+ case 2:
+ // (, line 110
+ // call R2, line 110
+ if (!r_R2())
+ {
+ return false;
+ }
+ // delete, line 110
+ slice_del();
+ break;
+ }
+ return true;
+ }
+
+ private boolean r_deriv() {
+ int among_var;
+ // (, line 113
+ // [, line 114
+ ket = cursor;
+ // substring, line 114
+ among_var = find_among_b(a_2, 25);
+ if (among_var == 0)
+ {
+ return false;
+ }
+ // ], line 114
+ bra = cursor;
+ switch (among_var) {
+ case 0:
+ return false;
+ case 1:
+ // (, line 116
+ // call R2, line 116
+ if (!r_R2())
+ {
+ return false;
+ }
+ // delete, line 116
+ slice_del();
+ break;
+ case 2:
+ // (, line 118
+ // <-, line 118
+ slice_from("arc");
+ break;
+ case 3:
+ // (, line 120
+ // <-, line 120
+ slice_from("gin");
+ break;
+ case 4:
+ // (, line 122
+ // <-, line 122
+ slice_from("graf");
+ break;
+ case 5:
+ // (, line 124
+ // <-, line 124
+ slice_from("paite");
+ break;
+ case 6:
+ // (, line 126
+ // <-, line 126
+ slice_from("\u00F3id");
+ break;
+ }
+ return true;
+ }
+
+ private boolean r_verb_sfx() {
+ int among_var;
+ // (, line 129
+ // [, line 130
+ ket = cursor;
+ // substring, line 130
+ among_var = find_among_b(a_3, 12);
+ if (among_var == 0)
+ {
+ return false;
+ }
+ // ], line 130
+ bra = cursor;
+ switch (among_var) {
+ case 0:
+ return false;
+ case 1:
+ // (, line 133
+ // call RV, line 133
+ if (!r_RV())
+ {
+ return false;
+ }
+ // delete, line 133
+ slice_del();
+ break;
+ case 2:
+ // (, line 138
+ // call R1, line 138
+ if (!r_R1())
+ {
+ return false;
+ }
+ // delete, line 138
+ slice_del();
+ break;
+ }
+ return true;
+ }
+
+ public boolean stem() {
+ int v_1;
+ int v_2;
+ int v_3;
+ int v_4;
+ int v_5;
+ // (, line 143
+ // do, line 144
+ v_1 = cursor;
+ lab0: do {
+ // call initial_morph, line 144
+ if (!r_initial_morph())
+ {
+ break lab0;
+ }
+ } while (false);
+ cursor = v_1;
+ // do, line 145
+ v_2 = cursor;
+ lab1: do {
+ // call mark_regions, line 145
+ if (!r_mark_regions())
+ {
+ break lab1;
+ }
+ } while (false);
+ cursor = v_2;
+ // backwards, line 146
+ limit_backward = cursor; cursor = limit;
+ // (, line 146
+ // do, line 147
+ v_3 = limit - cursor;
+ lab2: do {
+ // call noun_sfx, line 147
+ if (!r_noun_sfx())
+ {
+ break lab2;
+ }
+ } while (false);
+ cursor = limit - v_3;
+ // do, line 148
+ v_4 = limit - cursor;
+ lab3: do {
+ // call deriv, line 148
+ if (!r_deriv())
+ {
+ break lab3;
+ }
+ } while (false);
+ cursor = limit - v_4;
+ // do, line 149
+ v_5 = limit - cursor;
+ lab4: do {
+ // call verb_sfx, line 149
+ if (!r_verb_sfx())
+ {
+ break lab4;
+ }
+ } while (false);
+ cursor = limit - v_5;
+ cursor = limit_backward; return true;
+ }
+
+ public boolean equals( Object o ) {
+ return o instanceof irishStemmer;
+ }
+
+ public int hashCode() {
+ return irishStemmer.class.getName().hashCode();
+ }
+
+
+
+}
+
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6c2dbf28/opennlp-tools/src/test/java/opennlp/tools/stemmer/SnowballStemmerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/stemmer/SnowballStemmerTest.java b/opennlp-tools/src/test/java/opennlp/tools/stemmer/SnowballStemmerTest.java
index dad1fa0..6396b2f 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/stemmer/SnowballStemmerTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/stemmer/SnowballStemmerTest.java
@@ -89,6 +89,15 @@ public class SnowballStemmerTest {
}
@Test
+ public void testIrish() {
+ SnowballStemmer stemmer = new SnowballStemmer(ALGORITHM.IRISH);
+ Assert.assertEquals(stemmer.stem("bhfeidhm"), "feidhm");
+ Assert.assertEquals(stemmer.stem("feirmeoireacht"), "feirmeoir");
+ Assert.assertEquals(stemmer.stem("monarcacht"), "monarc");
+
+ }
+
+ @Test
public void testItalian() {
SnowballStemmer stemmer = new SnowballStemmer(ALGORITHM.ITALIAN);
Assert.assertEquals(stemmer.stem("abbattimento"), "abbatt");