You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/07/15 13:02:32 UTC

svn commit: r1610646 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/analysis/ lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/

Author: rmuir
Date: Tue Jul 15 11:02:31 2014
New Revision: 1610646

URL: http://svn.apache.org/r1610646
Log:
LUCENE-5823: recognize hunspell FULLSTRIP option in the affix file

Added:
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFullStrip.java
      - copied unchanged from r1610644, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFullStrip.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.aff
      - copied unchanged from r1610644, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.aff
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.dic
      - copied unchanged from r1610644, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.dic
Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1610646&r1=1610645&r2=1610646&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Tue Jul 15 11:02:31 2014
@@ -72,8 +72,9 @@ Bug Fixes
 * LUCENE-5817: Fix hunspell zero-affix handling: previously only zero-strips worked
   correctly.  (Robert Muir)
 
-* LUCENE-5818: Fix hunspell overgeneration for short strings that also match affixes.
-  (Robert Muir)
+* LUCENE-5818, LUCENE-5823: Fix hunspell overgeneration for short strings that also
+  match affixes; words are only stripped to a zero-length string if the FULLSTRIP
+  option is specified in the dictionary.  (Robert Muir)
 
 Test Framework
 

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java?rev=1610646&r1=1610645&r2=1610646&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java Tue Jul 15 11:02:31 2014
@@ -84,6 +84,7 @@ public class Dictionary {
   private static final String IGNORE_KEY = "IGNORE";
   private static final String ICONV_KEY = "ICONV";
   private static final String OCONV_KEY = "OCONV";
+  private static final String FULLSTRIP_KEY = "FULLSTRIP";
 
   private static final String NUM_FLAG_TYPE = "num";
   private static final String UTF8_FLAG_TYPE = "UTF-8";
@@ -150,6 +151,9 @@ public class Dictionary {
   boolean needsInputCleaning;
   boolean needsOutputCleaning;
   
+  // true if we can strip suffixes "down to nothing"
+  boolean fullStrip;
+  
   /**
    * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
    * and dictionary files.
@@ -334,6 +338,8 @@ public class Dictionary {
           oconv = res;
           needsOutputCleaning |= oconv != null;
         }
+      } else if (line.startsWith(FULLSTRIP_KEY)) {
+        fullStrip = true;
       }
     }
     

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java?rev=1610646&r1=1610645&r2=1610646&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java Tue Jul 15 11:02:31 2014
@@ -218,7 +218,8 @@ final class Stemmer {
       fst.getFirstArc(arc);
       IntsRef NO_OUTPUT = outputs.getNoOutput();
       IntsRef output = NO_OUTPUT;
-      for (int i = 0; i < length-1; i++) {
+      int limit = dictionary.fullStrip ? length : length-1;
+      for (int i = 0; i < limit; i++) {
         if (i > 0) {
           int ch = word[i-1];
           if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
@@ -292,7 +293,8 @@ final class Stemmer {
       fst.getFirstArc(arc);
       IntsRef NO_OUTPUT = outputs.getNoOutput();
       IntsRef output = NO_OUTPUT;
-      for (int i = length; i > 0; i--) {
+      int limit = dictionary.fullStrip ? 0 : 1;
+      for (int i = length; i >= limit; i--) {
         if (i < length) {
           int ch = word[i];
           if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {