You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2021/11/30 20:53:14 UTC

[lucene] branch main updated: LUCENE-10248: Spanish Plural Stemmer (#461)

This is an automated email from the ASF dual-hosted git repository.

rmuir pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new edb936f  LUCENE-10248: Spanish Plural Stemmer (#461)
edb936f is described below

commit edb936f09055cc16a71b5b037bef77177786759b
Author: Xavier Sanchez Loro <xa...@wallapop.com>
AuthorDate: Tue Nov 30 21:51:10 2021 +0100

    LUCENE-10248: Spanish Plural Stemmer (#461)
    
    Adds a new Spanish stemmer just for stemming plural to singular whilst maintaining gender: the SpanishPluralStemmer. The goal is to provide a lightweight algorithmic approach with better precision and recall than current approaches.
    
    See blog post for more details: https://medium.com/inside-wallapop/spanish-plural-stemmer-matching-plural-and-singular-forms-in-spanish-using-lucene-93e005e38373
    
    This approach is based on the rules described in Wikilengua: http://www.wikilengua.org/index.php/Plural_(formaci%C3%B3n)
    
    Some characteristics:
    
    * Designed to stem just plural to singular form
    * Distinguishes between masculine and feminine forms
    * It will increase recall but precision can be reduced depending on the use case/information need
    * Stems plural words of foreign origin, e.g. complots, bits, punks, robots
    * Supports invariant words (identical singular and plural form, or words for which a plural does not make sense), e.g. crisis, jueves, lapsus, abrebotellas, etc.
    * Supports special cases, e.g. yoes, clubes, itemes, faralaes
    * Use it when the distinction between singular and plural is not relevant but gender is relevant
    * Produces meaningful tokens in form of singular
    * Avoids strange stems like “amig”: stemmers are not required to generate grammatically correct tokens, but generating correct stems decreases the possibility of collisions with other words
---
 .../analysis/es/SpanishPluralStemFilter.java       |  54 ++++
 .../es/SpanishPluralStemFilterFactory.java         |  58 +++++
 .../lucene/analysis/es/SpanishPluralStemmer.java   | 285 +++++++++++++++++++++
 .../org.apache.lucene.analysis.TokenFilterFactory  |   1 +
 .../analysis/es/TestSpanishPluralStemFilter.java   |  73 ++++++
 .../es/TestSpanishPluralStemFilterFactory.java     |  46 ++++
 .../apache/lucene/analysis/es/espluraltestdata.zip | Bin 0 -> 133046 bytes
 7 files changed, 517 insertions(+)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemFilter.java
new file mode 100644
index 0000000..78e189e
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemFilter.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.es;
+
+import java.io.IOException;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that reduces Spanish plural terms to their singular form by running each
+ * term through {@link SpanishPluralStemmer}.
+ *
+ * <p>Terms whose {@link KeywordAttribute} is set are passed through unchanged. To protect terms
+ * from stemming, place a {@link SetKeywordMarkerFilter} (or any custom {@link TokenFilter} that
+ * sets the {@link KeywordAttribute}) before this {@link TokenStream}.
+ */
+public final class SpanishPluralStemFilter extends TokenFilter {
+  private final SpanishPluralStemmer stemmer = new SpanishPluralStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  /**
+   * Wraps the given stream.
+   *
+   * @param input the upstream token stream whose terms are stemmed
+   */
+  public SpanishPluralStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Advances the wrapped stream and, unless the token is flagged as a keyword, stems the term in
+   * place by handing the term buffer to the stemmer and adopting the length it returns.
+   *
+   * @return {@code false} once the wrapped stream is exhausted, {@code true} otherwise
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    if (keywordAttr.isKeyword()) {
+      // Keyword-marked terms must reach the consumer untouched.
+      return true;
+    }
+    termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
+    return true;
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemFilterFactory.java
new file mode 100644
index 0000000..6063601
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemFilterFactory.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.es;
+
+import java.util.Map;
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Factory for {@link SpanishPluralStemFilter}.
+ *
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_espluralstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.SpanishPluralStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ *
+ * @lucene.spi {@value #NAME}
+ */
+public class SpanishPluralStemFilterFactory extends TokenFilterFactory {
+
+  /** SPI name */
+  public static final String NAME = "spanishPluralStem";
+
+  /**
+   * Default ctor for compatibility with SPI. It is not usable directly: it always throws the
+   * exception supplied by {@code defaultCtorException()}.
+   */
+  public SpanishPluralStemFilterFactory() {
+    throw defaultCtorException();
+  }
+
+  /**
+   * Creates a new SpanishPluralStemFilterFactory. This factory defines no parameters of its own.
+   *
+   * @param args configuration arguments, passed on to the superclass constructor
+   * @throws IllegalArgumentException if {@code args} still contains entries after the superclass
+   *     constructor has run (presumably the superclass removes the entries it understands)
+   */
+  public SpanishPluralStemFilterFactory(Map<String, String> args) {
+    super(args);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /** Wraps {@code input} in a new {@link SpanishPluralStemFilter}. */
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new SpanishPluralStemFilter(input);
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemmer.java
new file mode 100644
index 0000000..cc76a24
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemmer.java
@@ -0,0 +1,285 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.es;
+
+import java.util.Arrays;
+import java.util.List;
+import org.apache.lucene.analysis.CharArraySet;
+
+/**
+ * Plural Stemmer for Spanish
+ *
+ * <p>This stemmer implements the rules described in:
+ * <i>http://www.wikilengua.org/index.php/Plural_(formación)</i>
+ *
+ * <p>It only reduces plural forms to their singular form; gender is left untouched. The suffix
+ * rules compare against lowercase letters, so terms are expected to be lowercased beforehand. The
+ * class holds no instance state.
+ */
+public class SpanishPluralStemmer {
+
+  /** Words that are returned unchanged (after accent removal). */
+  private static final CharArraySet invariants;
+
+  /** Irregular plurals whose singular is obtained by dropping the last two characters. */
+  private static final CharArraySet specialCases;
+
+  // All entries are written without accents because lookups happen after removeAccents().
+  private static final List<String> invariantsList =
+      Arrays.asList(
+          "abrebotellas",
+          "abrecartas",
+          "abrelatas",
+          "afueras",
+          "albatros",
+          "albricias",
+          "aledaños",
+          "alexis",
+          "alicates",
+          "analisis",
+          "andurriales",
+          "antitesis",
+          "añicos",
+          "apendicitis",
+          "apocalipsis",
+          "arcoiris",
+          "aries",
+          "bilis",
+          "boletus",
+          "boris",
+          "brindis",
+          "cactus",
+          "canutas",
+          "caries",
+          "cascanueces",
+          "cascarrabias",
+          "ciempies",
+          "cifosis",
+          "cortaplumas",
+          "corpus",
+          "cosmos",
+          "cosquillas",
+          "creces",
+          "crisis",
+          "cuatrocientas",
+          "cuatrocientos",
+          "cuelgacapas",
+          "cuentacuentos",
+          "cuentapasos",
+          "cumpleaños",
+          "doscientas",
+          "doscientos",
+          "dosis",
+          "enseres",
+          "entonces",
+          "esponsales",
+          "estatus",
+          "exequias",
+          "fauces",
+          "forceps",
+          "fotosintesis",
+          "gafas",
+          "gafotas",
+          "gargaras",
+          "gris",
+          "honorarios",
+          "ictus",
+          "jueves",
+          "lapsus",
+          "lavacoches",
+          "lavaplatos",
+          "limpiabotas",
+          "lunes",
+          "maitines",
+          "martes",
+          "mondadientes",
+          "novecientas",
+          "novecientos",
+          "nupcias",
+          "ochocientas",
+          "ochocientos",
+          "pais",
+          "paris",
+          "parabrisas",
+          "paracaidas",
+          "parachoques",
+          "paraguas",
+          "pararrayos",
+          "pisapapeles",
+          "piscis",
+          "portaaviones",
+          "portamaletas",
+          "portamantas",
+          "quinientas",
+          "quinientos",
+          "quitamanchas",
+          "recogepelotas",
+          "rictus",
+          "rompeolas",
+          "sacacorchos",
+          "sacapuntas",
+          "saltamontes",
+          "salvavidas",
+          "seis",
+          "seiscientas",
+          "seiscientos",
+          "setecientas",
+          "setecientos",
+          "sintesis",
+          "tenis",
+          "tifus",
+          "trabalenguas",
+          "vacaciones",
+          "venus",
+          "versus",
+          "viacrucis",
+          "virus",
+          "viveres",
+          "volandas");
+
+  static {
+    final CharArraySet invariantSet = new CharArraySet(invariantsList, true);
+    invariants = CharArraySet.unmodifiableSet(invariantSet);
+
+    // Every entry must be a plural ending in "es": stem() strips exactly two characters.
+    final List<String> specialCasesList =
+        Arrays.asList(
+            "yoes",
+            "noes",
+            "sies",
+            "clubes",
+            "faralaes",
+            "albalaes",
+            "itemes",
+            "albumes",
+            "sandwiches",
+            "relojes",
+            "bojes",
+            "contrarrelojes",
+            "carcajes");
+    final CharArraySet specialSet = new CharArraySet(specialCasesList, true);
+    specialCases = CharArraySet.unmodifiableSet(specialSet);
+  }
+
+  /**
+   * Stems a Spanish plural to its singular form, in place.
+   *
+   * <p>Terms shorter than four characters are returned completely untouched. For longer terms the
+   * accented vowels in the first {@code len} characters are replaced by their plain counterparts
+   * before any rule is applied, and that replacement stays in the buffer even when no suffix is
+   * removed. Some rules additionally rewrite a character of the kept prefix (e.g. {@code voces}
+   * becomes {@code voz}).
+   *
+   * @param s term buffer; must hold at least {@code len} characters and is modified in place
+   * @param len number of valid characters in {@code s}
+   * @return the length of the stemmed term, always {@code <= len}
+   */
+  public int stem(char[] s, int len) {
+    if (len < 4) {
+      return len; // a plural has at least 4 letters (ases, eses, etc.)
+    }
+    removeAccents(s, len);
+    if (invariant(s, len)) {
+      return len;
+    }
+    if (special(s, len)) {
+      return len - 2;
+    }
+    if (s[len - 1] != 's') {
+      return len; // only words ending in 's' are plural candidates
+    }
+
+    // The three letters preceding the final 's'; len >= 4 guarantees they exist.
+    final char fourthLast = s[len - 4];
+    final char thirdLast = s[len - 3];
+    final char secondLast = s[len - 2];
+
+    if (!isVowel(secondLast)) {
+      // consonant + 's': complots, bits, robots
+      return len - 1;
+    }
+    if ((fourthLast == 'q' || fourthLast == 'g')
+        && thirdLast == 'u'
+        && (secondLast == 'i' || secondLast == 'e')) {
+      // maniquis, caquis, parques, albergues: the 'u' is silent, so only the 's' is dropped
+      return len - 1;
+    }
+    if (isVowel(fourthLast) && thirdLast == 'r' && secondLast == 'e') {
+      // escaneres, alfileres, amores
+      return len - 2;
+    }
+    if (isVowel(fourthLast)
+        && (thirdLast == 'd' || thirdLast == 'l' || thirdLast == 'n' || thirdLast == 'x')
+        && secondLast == 'e') {
+      // abades, comerciales, faxes
+      return len - 2;
+    }
+    if ((thirdLast == 'y' || thirdLast == 'u') && secondLast == 'e') {
+      // bambues, leyes
+      return len - 2;
+    }
+    if ((fourthLast == 'u'
+            || fourthLast == 'l'
+            || fourthLast == 'r'
+            || fourthLast == 't'
+            || fourthLast == 'n')
+        && thirdLast == 'i'
+        && secondLast == 'e') {
+      // jabalies, israelies, maniquies
+      return len - 2;
+    }
+    if (thirdLast == 's' && secondLast == 'e') {
+      // reses
+      return len - 2;
+    }
+    if ((isVowel(thirdLast) || thirdLast == 'd') && secondLast == 'i') {
+      // jerseis --> jersey, brandis --> brandy
+      s[len - 2] = 'y';
+      return len - 1;
+    }
+    if (thirdLast == 'c' && secondLast == 'e') {
+      // voces --> voz
+      s[len - 3] = 'z';
+      return len - 2;
+    }
+    // vowel + 's': jabalis, casas, coches, etc.
+    return len - 1;
+  }
+
+  /** Returns true if {@code c} is one of the unaccented lowercase vowels a, e, i, o, u. */
+  private static boolean isVowel(char c) {
+    return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
+  }
+
+  /** Returns true if the first {@code len} characters of {@code s} form an invariant word. */
+  private boolean invariant(char[] s, int len) {
+    return invariants.contains(s, 0, len);
+  }
+
+  /** Returns true if the first {@code len} characters of {@code s} form a special-case plural. */
+  private boolean special(char[] s, int len) {
+    return specialCases.contains(s, 0, len);
+  }
+
+  /**
+   * Replaces lowercase vowels carrying a grave, acute, circumflex or diaeresis mark with the plain
+   * vowel, in place, within the first {@code len} characters. Other characters (including 'ñ') are
+   * left as they are.
+   */
+  private void removeAccents(char[] s, int len) {
+    for (int i = 0; i < len; i++) {
+      switch (s[i]) {
+        case 'à':
+        case 'á':
+        case 'â':
+        case 'ä':
+          s[i] = 'a';
+          break;
+        case 'ò':
+        case 'ó':
+        case 'ô':
+        case 'ö':
+          s[i] = 'o';
+          break;
+        case 'è':
+        case 'é':
+        case 'ê':
+        case 'ë':
+          s[i] = 'e';
+          break;
+        case 'ù':
+        case 'ú':
+        case 'û':
+        case 'ü':
+          s[i] = 'u';
+          break;
+        case 'ì':
+        case 'í':
+        case 'î':
+        case 'ï':
+          s[i] = 'i';
+          break;
+        default:
+          break;
+      }
+    }
+  }
+}
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
index 2328b87..27fac27 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
@@ -48,6 +48,7 @@ org.apache.lucene.analysis.en.KStemFilterFactory
 org.apache.lucene.analysis.en.PorterStemFilterFactory
 org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
 org.apache.lucene.analysis.es.SpanishMinimalStemFilterFactory
+org.apache.lucene.analysis.es.SpanishPluralStemFilterFactory
 org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
 org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
 org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishPluralStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishPluralStemFilter.java
new file mode 100644
index 0000000..9381b90
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishPluralStemFilter.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.es;
+
+import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;
+
+import java.io.IOException;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+
+/** Simple tests for {@link SpanishPluralStemFilter} */
+public class TestSpanishPluralStemFilter extends BaseTokenStreamTestCase {
+  /** Whitespace tokenizer (no lowercasing) followed by the plural stem filter. */
+  private Analyzer analyzer;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    analyzer =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            return new TokenStreamComponents(source, new SpanishPluralStemFilter(source));
+          }
+        };
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    analyzer.close();
+    super.tearDown();
+  }
+
+  /** Test against a vocabulary from the reference impl */
+  public void testVocabulary() throws IOException {
+    assertVocabulary(analyzer, getDataPath("espluraltestdata.zip"), "esplural.txt");
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
+  }
+
+  /** An empty term must pass through the filter unchanged. */
+  public void testEmptyTerm() throws IOException {
+    // try-with-resources so the analyzer is closed even when the assertion fails
+    try (Analyzer a =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer tokenizer = new KeywordTokenizer();
+            return new TokenStreamComponents(tokenizer, new SpanishPluralStemFilter(tokenizer));
+          }
+        }) {
+      checkOneTerm(a, "", "");
+    }
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishPluralStemFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishPluralStemFilterFactory.java
new file mode 100644
index 0000000..8abe712
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishPluralStemFilterFactory.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.es;
+
+import java.io.Reader;
+import java.io.StringReader;
+import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+/** Checks that the Spanish plural stem factory can be looked up and validates its arguments. */
+public class TestSpanishPluralStemFilterFactory extends BaseTokenStreamFactoryTestCase {
+  /** A plural fed through the factory-created filter comes out in singular form. */
+  public void testStemming() throws Exception {
+    Reader text = new StringReader("sociedades");
+    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    tokenizer.setReader(text);
+    TokenStream stemmed = tokenFilterFactory("SpanishPluralStem").create(tokenizer);
+    assertTokenStreamContents(stemmed, new String[] {"sociedad"});
+  }
+
+  /** Unknown arguments must be rejected with an {@link IllegalArgumentException}. */
+  public void testBogusArguments() throws Exception {
+    IllegalArgumentException thrown =
+        expectThrows(
+            IllegalArgumentException.class,
+            () -> {
+              tokenFilterFactory("SpanishPluralStem", "bogusArg", "bogusValue");
+            });
+    assertTrue(thrown.getMessage().contains("Unknown parameters"));
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/espluraltestdata.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/espluraltestdata.zip
new file mode 100644
index 0000000..05ffd65
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/espluraltestdata.zip differ