You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/05/03 10:16:09 UTC

opennlp git commit: OPENNLP-1048: Add stemmer for Irish

Repository: opennlp
Updated Branches:
  refs/heads/master caeaaeea6 -> 6c2dbf288


OPENNLP-1048: Add stemmer for Irish

Closes #189


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/6c2dbf28
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/6c2dbf28
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/6c2dbf28

Branch: refs/heads/master
Commit: 6c2dbf2885fb4602b8e42bd208ebef66df23329b
Parents: caeaaee
Author: Jim O'Regan <ja...@tcd.ie>
Authored: Sat Apr 29 00:15:29 2017 +0100
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 3 12:15:27 2017 +0200

----------------------------------------------------------------------
 .../tools/stemmer/snowball/SnowballStemmer.java |   4 +
 .../tools/stemmer/snowball/irishStemmer.java    | 616 +++++++++++++++++++
 .../tools/stemmer/SnowballStemmerTest.java      |   9 +
 3 files changed, 629 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/6c2dbf28/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/SnowballStemmer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/SnowballStemmer.java b/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/SnowballStemmer.java
index dd75754..86ebe84 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/SnowballStemmer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/SnowballStemmer.java
@@ -29,6 +29,7 @@ public class SnowballStemmer implements Stemmer {
     FRENCH,
     GERMAN,
     HUNGARIAN,
+    IRISH,
     ITALIAN,
     NORWEGIAN,
     PORTER,
@@ -67,6 +68,9 @@ public class SnowballStemmer implements Stemmer {
     else if (ALGORITHM.HUNGARIAN.equals(algorithm)) {
       stemmer = new hungarianStemmer();
     }
+    else if (ALGORITHM.IRISH.equals(algorithm)) {
+      stemmer = new irishStemmer();
+    }
     else if (ALGORITHM.ITALIAN.equals(algorithm)) {
       stemmer = new italianStemmer();
     }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6c2dbf28/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/irishStemmer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/irishStemmer.java b/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/irishStemmer.java
new file mode 100644
index 0000000..316288f
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/irishStemmer.java
@@ -0,0 +1,616 @@
+// CHECKSTYLE:OFF
+/*
+
+Copyright (c) 2001, Dr Martin Porter
+Copyright (c) 2002, Richard Boulton
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+    * notice, this list of conditions and the following disclaimer in the
+    * documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holders nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ */
+
+// This file was generated automatically by the Snowball to Java compiler
+
+package opennlp.tools.stemmer.snowball;
+
+ /**
+  * This class was automatically generated by a Snowball to Java compiler
+  * It implements the stemming algorithm defined by a snowball script.
+  */
+
+public class irishStemmer extends opennlp.tools.stemmer.snowball.AbstractSnowballStemmer {
+
+private static final long serialVersionUID = 1L;
+
+        private final static irishStemmer methodObject = new irishStemmer (); // shared instance handed to every Among entry in this class as its last argument
+
+                private final static Among a_0[] = { // 24 word-initial strings looked up by r_initial_morph; the third argument is the switch case taken there
+                    new Among ( "b'", -1, 4, "", methodObject ), // case 4: deleted
+                    new Among ( "bh", -1, 14, "", methodObject ), // case 14: replaced by "b"
+                    new Among ( "bhf", 1, 9, "", methodObject ), // case 9: replaced by "f" (second argument 1 is presumably the index of the shorter entry "bh" -- confirm against Among)
+                    new Among ( "bp", -1, 11, "", methodObject ), // case 11: replaced by "p"
+                    new Among ( "ch", -1, 15, "", methodObject ), // case 15: replaced by "c"
+                    new Among ( "d'", -1, 2, "", methodObject ), // case 2: deleted
+                    new Among ( "d'fh", 5, 3, "", methodObject ), // case 3: replaced by "f"
+                    new Among ( "dh", -1, 16, "", methodObject ), // case 16: replaced by "d"
+                    new Among ( "dt", -1, 13, "", methodObject ), // case 13: replaced by "t"
+                    new Among ( "fh", -1, 17, "", methodObject ), // case 17: replaced by "f"
+                    new Among ( "gc", -1, 7, "", methodObject ), // case 7: replaced by "c"
+                    new Among ( "gh", -1, 18, "", methodObject ), // case 18: replaced by "g"
+                    new Among ( "h-", -1, 1, "", methodObject ), // case 1: deleted
+                    new Among ( "m'", -1, 4, "", methodObject ), // case 4: deleted
+                    new Among ( "mb", -1, 6, "", methodObject ), // case 6: replaced by "b"
+                    new Among ( "mh", -1, 19, "", methodObject ), // case 19: replaced by "m"
+                    new Among ( "n-", -1, 1, "", methodObject ), // case 1: deleted
+                    new Among ( "nd", -1, 8, "", methodObject ), // case 8: replaced by "d"
+                    new Among ( "ng", -1, 10, "", methodObject ), // case 10: replaced by "g"
+                    new Among ( "ph", -1, 20, "", methodObject ), // case 20: replaced by "p"
+                    new Among ( "sh", -1, 5, "", methodObject ), // case 5: replaced by "s"
+                    new Among ( "t-", -1, 1, "", methodObject ), // case 1: deleted
+                    new Among ( "th", -1, 21, "", methodObject ), // case 21: replaced by "t"
+                    new Among ( "ts", -1, 12, "", methodObject ) // case 12: replaced by "s"
+                };
+
+                private final static Among a_1[] = { // 16 suffixes looked up backwards by r_noun_sfx; result 1 = delete when r_R1() holds, result 2 = delete when r_R2() holds
+                    new Among ( "\u00EDochta", -1, 1, "", methodObject ), // the escape is U+00ED, i with acute accent
+                    new Among ( "a\u00EDochta", 0, 1, "", methodObject ),
+                    new Among ( "ire", -1, 2, "", methodObject ),
+                    new Among ( "aire", 2, 2, "", methodObject ),
+                    new Among ( "abh", -1, 1, "", methodObject ),
+                    new Among ( "eabh", 4, 1, "", methodObject ),
+                    new Among ( "ibh", -1, 1, "", methodObject ),
+                    new Among ( "aibh", 6, 1, "", methodObject ),
+                    new Among ( "amh", -1, 1, "", methodObject ),
+                    new Among ( "eamh", 8, 1, "", methodObject ),
+                    new Among ( "imh", -1, 1, "", methodObject ),
+                    new Among ( "aimh", 10, 1, "", methodObject ),
+                    new Among ( "\u00EDocht", -1, 1, "", methodObject ),
+                    new Among ( "a\u00EDocht", 12, 1, "", methodObject ),
+                    new Among ( "ir\u00ED", -1, 2, "", methodObject ),
+                    new Among ( "air\u00ED", 14, 2, "", methodObject )
+                };
+
+                private final static Among a_2[] = { // 25 suffixes looked up backwards by r_deriv; the third argument is the switch case taken there
+                    new Among ( "\u00F3ideacha", -1, 6, "", methodObject ), // case 6: replaced by "\u00F3id" (the escape is U+00F3, o with acute accent)
+                    new Among ( "patacha", -1, 5, "", methodObject ), // case 5: replaced by "paite"
+                    new Among ( "achta", -1, 1, "", methodObject ), // case 1: deleted when r_R2() holds
+                    new Among ( "arcachta", 2, 2, "", methodObject ), // case 2: replaced by "arc"
+                    new Among ( "eachta", 2, 1, "", methodObject ),
+                    new Among ( "grafa\u00EDochta", -1, 4, "", methodObject ), // case 4: replaced by "graf"
+                    new Among ( "paite", -1, 5, "", methodObject ),
+                    new Among ( "ach", -1, 1, "", methodObject ),
+                    new Among ( "each", 7, 1, "", methodObject ),
+                    new Among ( "\u00F3ideach", 8, 6, "", methodObject ),
+                    new Among ( "gineach", 8, 3, "", methodObject ), // case 3: replaced by "gin"
+                    new Among ( "patach", 7, 5, "", methodObject ),
+                    new Among ( "grafa\u00EDoch", -1, 4, "", methodObject ),
+                    new Among ( "pataigh", -1, 5, "", methodObject ),
+                    new Among ( "\u00F3idigh", -1, 6, "", methodObject ),
+                    new Among ( "acht\u00FAil", -1, 1, "", methodObject ), // the escape is U+00FA, u with acute accent
+                    new Among ( "eacht\u00FAil", 15, 1, "", methodObject ),
+                    new Among ( "gineas", -1, 3, "", methodObject ),
+                    new Among ( "ginis", -1, 3, "", methodObject ),
+                    new Among ( "acht", -1, 1, "", methodObject ),
+                    new Among ( "arcacht", 19, 2, "", methodObject ),
+                    new Among ( "eacht", 19, 1, "", methodObject ),
+                    new Among ( "grafa\u00EDocht", -1, 4, "", methodObject ),
+                    new Among ( "arcachta\u00ED", -1, 2, "", methodObject ),
+                    new Among ( "grafa\u00EDochta\u00ED", -1, 4, "", methodObject )
+                };
+
+                private final static Among a_3[] = { // 12 suffixes looked up backwards by r_verb_sfx; result 1 = delete when r_RV() holds, result 2 = delete when r_R1() holds
+                    new Among ( "imid", -1, 1, "", methodObject ),
+                    new Among ( "aimid", 0, 1, "", methodObject ),
+                    new Among ( "\u00EDmid", -1, 1, "", methodObject ), // the escape is U+00ED, i with acute accent
+                    new Among ( "a\u00EDmid", 2, 1, "", methodObject ),
+                    new Among ( "adh", -1, 2, "", methodObject ),
+                    new Among ( "eadh", 4, 2, "", methodObject ),
+                    new Among ( "faidh", -1, 1, "", methodObject ),
+                    new Among ( "fidh", -1, 1, "", methodObject ),
+                    new Among ( "\u00E1il", -1, 2, "", methodObject ), // the escape is U+00E1, a with acute accent
+                    new Among ( "ain", -1, 2, "", methodObject ),
+                    new Among ( "tear", -1, 2, "", methodObject ),
+                    new Among ( "tar", -1, 2, "", methodObject )
+                };
+
+                private static final char g_v[] = {17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 17, 4, 2 }; // grouping table passed to in_grouping/out_grouping with bounds 97..250; presumably a bitmap of the vowels incl. acute-accented forms -- confirm against the runtime's grouping encoding
+
+        private int I_p2; // position mark assigned in r_mark_regions and compared with cursor in r_R2
+        private int I_p1; // position mark assigned in r_mark_regions and compared with cursor in r_R1
+        private int I_pV; // position mark assigned in r_mark_regions and compared with cursor in r_RV
+
+                private void copy_from(irishStemmer other) { // copies the three region marks from other, then delegates to super.copy_from
+                    I_p2 = other.I_p2;
+                    I_p1 = other.I_p1;
+                    I_pV = other.I_pV;
+                    super.copy_from(other);
+                }
+
+                private boolean r_mark_regions() { // assigns I_pV, I_p1 and I_p2; leaves cursor where it was; always returns true
+            int v_1;
+            int v_3;
+                    // (, line 28 -- all three marks default to limit
+                    I_pV = limit;
+                    I_p1 = limit;
+                    I_p2 = limit;
+                    // do, line 34 -- cursor is saved here and restored after the attempt
+                    v_1 = cursor;
+                    lab0: do {
+                        // (, line 34
+                        // gopast, line 35 -- step forward until in_grouping(g_v, 97, 250) accepts
+                        golab1: while(true)
+                        {
+                            lab2: do {
+                                if (!(in_grouping(g_v, 97, 250)))
+                                {
+                                    break lab2;
+                                }
+                                break golab1;
+                            } while (false);
+                            if (cursor >= limit)
+                            {
+                                break lab0; // input exhausted: I_pV keeps its default of limit
+                            }
+                            cursor++;
+                        }
+                        // setmark pV, line 35
+                        I_pV = cursor;
+                    } while (false);
+                    cursor = v_1;
+                    // do, line 37 -- second attempt, again with cursor saved and restored
+                    v_3 = cursor;
+                    lab3: do {
+                        // (, line 37
+                        // gopast, line 38 -- step forward until in_grouping accepts
+                        golab4: while(true)
+                        {
+                            lab5: do {
+                                if (!(in_grouping(g_v, 97, 250)))
+                                {
+                                    break lab5;
+                                }
+                                break golab4;
+                            } while (false);
+                            if (cursor >= limit)
+                            {
+                                break lab3; // input exhausted: I_p1 and I_p2 keep their default of limit
+                            }
+                            cursor++;
+                        }
+                        // gopast, line 38 -- then step forward until out_grouping accepts
+                        golab6: while(true)
+                        {
+                            lab7: do {
+                                if (!(out_grouping(g_v, 97, 250)))
+                                {
+                                    break lab7;
+                                }
+                                break golab6;
+                            } while (false);
+                            if (cursor >= limit)
+                            {
+                                break lab3;
+                            }
+                            cursor++;
+                        }
+                        // setmark p1, line 38
+                        I_p1 = cursor;
+                        // gopast, line 39 -- the same in_grouping / out_grouping pair is repeated for p2
+                        golab8: while(true)
+                        {
+                            lab9: do {
+                                if (!(in_grouping(g_v, 97, 250)))
+                                {
+                                    break lab9;
+                                }
+                                break golab8;
+                            } while (false);
+                            if (cursor >= limit)
+                            {
+                                break lab3; // input exhausted: I_p2 keeps its default of limit
+                            }
+                            cursor++;
+                        }
+                        // gopast, line 39
+                        golab10: while(true)
+                        {
+                            lab11: do {
+                                if (!(out_grouping(g_v, 97, 250)))
+                                {
+                                    break lab11;
+                                }
+                                break golab10;
+                            } while (false);
+                            if (cursor >= limit)
+                            {
+                                break lab3;
+                            }
+                            cursor++;
+                        }
+                        // setmark p2, line 39
+                        I_p2 = cursor;
+                    } while (false);
+                    cursor = v_3;
+                    return true;
+                }
+
+                private boolean r_initial_morph() { // deletes or replaces the a_0 entry found at the cursor; returns false when find_among reports no match
+            int among_var;
+                    // (, line 43
+                    // [, line 44 -- bra records where the matched slice starts
+                    bra = cursor;
+                    // substring, line 44 -- 24 is the number of entries in a_0
+                    among_var = find_among(a_0, 24);
+                    if (among_var == 0)
+                    {
+                        return false;
+                    }
+                    // ], line 44 -- ket records where the matched slice ends
+                    ket = cursor;
+                    switch (among_var) {
+                        case 0: // not reachable: among_var == 0 has already returned above
+                            return false;
+                        case 1: // a_0 entries "h-", "n-", "t-"
+                            // (, line 46
+                            // delete, line 46
+                            slice_del();
+                            break;
+                        case 2: // a_0 entry "d'"
+                            // (, line 50
+                            // delete, line 50
+                            slice_del();
+                            break;
+                        case 3: // a_0 entry "d'fh"
+                            // (, line 52
+                            // <-, line 52
+                            slice_from("f");
+                            break;
+                        case 4: // a_0 entries "b'", "m'"
+                            // (, line 55
+                            // delete, line 55
+                            slice_del();
+                            break;
+                        case 5: // a_0 entry "sh"
+                            // (, line 58
+                            // <-, line 58
+                            slice_from("s");
+                            break;
+                        case 6: // a_0 entry "mb"
+                            // (, line 61
+                            // <-, line 61
+                            slice_from("b");
+                            break;
+                        case 7: // a_0 entry "gc"
+                            // (, line 63
+                            // <-, line 63
+                            slice_from("c");
+                            break;
+                        case 8: // a_0 entry "nd"
+                            // (, line 65
+                            // <-, line 65
+                            slice_from("d");
+                            break;
+                        case 9: // a_0 entry "bhf"
+                            // (, line 67
+                            // <-, line 67
+                            slice_from("f");
+                            break;
+                        case 10: // a_0 entry "ng"
+                            // (, line 69
+                            // <-, line 69
+                            slice_from("g");
+                            break;
+                        case 11: // a_0 entry "bp"
+                            // (, line 71
+                            // <-, line 71
+                            slice_from("p");
+                            break;
+                        case 12: // a_0 entry "ts"
+                            // (, line 73
+                            // <-, line 73
+                            slice_from("s");
+                            break;
+                        case 13: // a_0 entry "dt"
+                            // (, line 75
+                            // <-, line 75
+                            slice_from("t");
+                            break;
+                        case 14: // a_0 entry "bh"
+                            // (, line 79
+                            // <-, line 79
+                            slice_from("b");
+                            break;
+                        case 15: // a_0 entry "ch"
+                            // (, line 81
+                            // <-, line 81
+                            slice_from("c");
+                            break;
+                        case 16: // a_0 entry "dh"
+                            // (, line 83
+                            // <-, line 83
+                            slice_from("d");
+                            break;
+                        case 17: // a_0 entry "fh"
+                            // (, line 85
+                            // <-, line 85
+                            slice_from("f");
+                            break;
+                        case 18: // a_0 entry "gh"
+                            // (, line 87
+                            // <-, line 87
+                            slice_from("g");
+                            break;
+                        case 19: // a_0 entry "mh"
+                            // (, line 89
+                            // <-, line 89
+                            slice_from("m");
+                            break;
+                        case 20: // a_0 entry "ph"
+                            // (, line 91
+                            // <-, line 91
+                            slice_from("p");
+                            break;
+                        case 21: // a_0 entry "th"
+                            // (, line 93
+                            // <-, line 93
+                            slice_from("t");
+                            break;
+                    }
+                    return true;
+                }
+
+                private boolean r_RV() { // true exactly when I_pV <= cursor
+                    if (!(I_pV <= cursor))
+                    {
+                        return false;
+                    }
+                    return true;
+                }
+
+                private boolean r_R1() { // true exactly when I_p1 <= cursor
+                    if (!(I_p1 <= cursor))
+                    {
+                        return false;
+                    }
+                    return true;
+                }
+
+                private boolean r_R2() { // true exactly when I_p2 <= cursor
+                    if (!(I_p2 <= cursor))
+                    {
+                        return false;
+                    }
+                    return true;
+                }
+
+                private boolean r_noun_sfx() { // deletes an a_1 suffix if its region test passes; false on no match or failed region test
+            int among_var;
+                    // (, line 103
+                    // [, line 104 -- scanning backwards, so ket is recorded first
+                    ket = cursor;
+                    // substring, line 104 -- 16 is the number of entries in a_1
+                    among_var = find_among_b(a_1, 16);
+                    if (among_var == 0)
+                    {
+                        return false;
+                    }
+                    // ], line 104
+                    bra = cursor;
+                    switch (among_var) {
+                        case 0: // not reachable: among_var == 0 has already returned above
+                            return false;
+                        case 1:
+                            // (, line 108
+                            // call R1, line 108
+                            if (!r_R1())
+                            {
+                                return false;
+                            }
+                            // delete, line 108
+                            slice_del();
+                            break;
+                        case 2: // a_1 entries "ire", "aire", "ir\u00ED", "air\u00ED"
+                            // (, line 110
+                            // call R2, line 110
+                            if (!r_R2())
+                            {
+                                return false;
+                            }
+                            // delete, line 110
+                            slice_del();
+                            break;
+                    }
+                    return true;
+                }
+
+                private boolean r_deriv() { // deletes or replaces an a_2 suffix; false on no match, or when case 1 fails its r_R2() test
+            int among_var;
+                    // (, line 113
+                    // [, line 114 -- scanning backwards, so ket is recorded first
+                    ket = cursor;
+                    // substring, line 114 -- 25 is the number of entries in a_2
+                    among_var = find_among_b(a_2, 25);
+                    if (among_var == 0)
+                    {
+                        return false;
+                    }
+                    // ], line 114
+                    bra = cursor;
+                    switch (among_var) {
+                        case 0: // not reachable: among_var == 0 has already returned above
+                            return false;
+                        case 1: // the only case here guarded by a region test
+                            // (, line 116
+                            // call R2, line 116
+                            if (!r_R2())
+                            {
+                                return false;
+                            }
+                            // delete, line 116
+                            slice_del();
+                            break;
+                        case 2: // a_2 entries "arcachta", "arcacht", "arcachta\u00ED"
+                            // (, line 118
+                            // <-, line 118
+                            slice_from("arc");
+                            break;
+                        case 3: // a_2 entries "gineach", "gineas", "ginis"
+                            // (, line 120
+                            // <-, line 120
+                            slice_from("gin");
+                            break;
+                        case 4: // a_2 entries starting with "grafa"
+                            // (, line 122
+                            // <-, line 122
+                            slice_from("graf");
+                            break;
+                        case 5: // a_2 entries "patacha", "paite", "patach", "pataigh"
+                            // (, line 124
+                            // <-, line 124
+                            slice_from("paite");
+                            break;
+                        case 6: // a_2 entries starting with "\u00F3id"
+                            // (, line 126
+                            // <-, line 126
+                            slice_from("\u00F3id");
+                            break;
+                    }
+                    return true;
+                }
+
+                private boolean r_verb_sfx() { // deletes an a_3 suffix if its region test passes; false on no match or failed region test
+            int among_var;
+                    // (, line 129
+                    // [, line 130 -- scanning backwards, so ket is recorded first
+                    ket = cursor;
+                    // substring, line 130 -- 12 is the number of entries in a_3
+                    among_var = find_among_b(a_3, 12);
+                    if (among_var == 0)
+                    {
+                        return false;
+                    }
+                    // ], line 130
+                    bra = cursor;
+                    switch (among_var) {
+                        case 0: // not reachable: among_var == 0 has already returned above
+                            return false;
+                        case 1: // a_3 entries "imid", "aimid", "\u00EDmid", "a\u00EDmid", "faidh", "fidh"
+                            // (, line 133
+                            // call RV, line 133
+                            if (!r_RV())
+                            {
+                                return false;
+                            }
+                            // delete, line 133
+                            slice_del();
+                            break;
+                        case 2: // a_3 entries "adh", "eadh", "\u00E1il", "ain", "tear", "tar"
+                            // (, line 138
+                            // call R1, line 138
+                            if (!r_R1())
+                            {
+                                return false;
+                            }
+                            // delete, line 138
+                            slice_del();
+                            break;
+                    }
+                    return true;
+                }
+
+                public boolean stem() { // runs r_initial_morph, r_mark_regions, then the three suffix routines in order; always returns true
+            int v_1;
+            int v_2;
+            int v_3;
+            int v_4;
+            int v_5;
+                    // (, line 143
+                    // do, line 144 -- a false result is ignored; cursor is restored either way
+                    v_1 = cursor;
+                    lab0: do {
+                        // call initial_morph, line 144
+                        if (!r_initial_morph())
+                        {
+                            break lab0;
+                        }
+                    } while (false);
+                    cursor = v_1;
+                    // do, line 145 -- marks are computed after the initial rewrite above
+                    v_2 = cursor;
+                    lab1: do {
+                        // call mark_regions, line 145
+                        if (!r_mark_regions())
+                        {
+                            break lab1;
+                        }
+                    } while (false);
+                    cursor = v_2;
+                    // backwards, line 146 -- cursor moves to limit; the suffix routines below use find_among_b
+                    limit_backward = cursor; cursor = limit;
+                    // (, line 146
+                    // do, line 147 -- cursor is saved as an offset from limit and restored after each routine
+                    v_3 = limit - cursor;
+                    lab2: do {
+                        // call noun_sfx, line 147
+                        if (!r_noun_sfx())
+                        {
+                            break lab2;
+                        }
+                    } while (false);
+                    cursor = limit - v_3;
+                    // do, line 148
+                    v_4 = limit - cursor;
+                    lab3: do {
+                        // call deriv, line 148
+                        if (!r_deriv())
+                        {
+                            break lab3;
+                        }
+                    } while (false);
+                    cursor = limit - v_4;
+                    // do, line 149
+                    v_5 = limit - cursor;
+                    lab4: do {
+                        // call verb_sfx, line 149
+                        if (!r_verb_sfx())
+                        {
+                            break lab4;
+                        }
+                    } while (false);
+                    cursor = limit - v_5;
+                    cursor = limit_backward;                    return true;
+                }
+
+        public boolean equals( Object o ) { // type-only comparison: any irishStemmer instance equals any other, instance state is not compared
+            return o instanceof irishStemmer;
+        }
+
+        public int hashCode() { // derived from the class name only, so all instances share one hash, matching the type-only equals
+            return irishStemmer.class.getName().hashCode();
+        }
+
+
+
+}
+

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6c2dbf28/opennlp-tools/src/test/java/opennlp/tools/stemmer/SnowballStemmerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/stemmer/SnowballStemmerTest.java b/opennlp-tools/src/test/java/opennlp/tools/stemmer/SnowballStemmerTest.java
index dad1fa0..6396b2f 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/stemmer/SnowballStemmerTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/stemmer/SnowballStemmerTest.java
@@ -89,6 +89,15 @@ public class SnowballStemmerTest {
   }
 
   @Test
+  public void testIrish() { // assertEquals takes (expected, actual): the literal stem goes first so failure messages read correctly
+    SnowballStemmer stemmer = new SnowballStemmer(ALGORITHM.IRISH);
+    Assert.assertEquals("feidhm", stemmer.stem("bhfeidhm")); // initial "bhf" is expected to become "f"
+    Assert.assertEquals("feirmeoir", stemmer.stem("feirmeoireacht")); // suffix "eacht" is expected to be removed
+    Assert.assertEquals("monarc", stemmer.stem("monarcacht")); // suffix "arcacht" is expected to become "arc"
+
+  }
+
+  @Test
   public void testItalian() {
     SnowballStemmer stemmer = new SnowballStemmer(ALGORITHM.ITALIAN);
     Assert.assertEquals(stemmer.stem("abbattimento"), "abbatt");