You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/07/21 19:07:49 UTC

svn commit: r1612349 - in /lucene/dev/trunk/lucene: CHANGES.txt analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java analysis/common/src/test/org/apache/lucene/analysis/hunspell/Test64kAffixes.java

Author: rmuir
Date: Mon Jul 21 17:07:48 2014
New Revision: 1612349

URL: http://svn.apache.org/r1612349
Log:
LUCENE-5838: fix hunspell when .aff file has over 64k affixes

Added:
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/Test64kAffixes.java   (with props)
Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1612349&r1=1612348&r2=1612349&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Mon Jul 21 17:07:48 2014
@@ -195,6 +195,8 @@ Bug Fixes
 * LUCENE-5827: Make all Directory implementations correctly fail with
   IllegalArgumentException if slices are out of bounds.  (Uwe SChindler)
 
+* LUCENE-5838: Fix hunspell when the .aff file has over 64k affixes. (Robert Muir)
+
 Test Framework
 
 * LUCENE-5786: Unflushed/ truncated events file (hung testing subprocess).

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java?rev=1612349&r1=1612348&r2=1612349&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java Mon Jul 21 17:07:48 2014
@@ -289,8 +289,8 @@ public class Dictionary {
    * @throws IOException Can be thrown while reading from the InputStream
    */
   private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
-    TreeMap<String, List<Character>> prefixes = new TreeMap<>();
-    TreeMap<String, List<Character>> suffixes = new TreeMap<>();
+    TreeMap<String, List<Integer>> prefixes = new TreeMap<>();
+    TreeMap<String, List<Integer>> suffixes = new TreeMap<>();
     Map<String,Integer> seenPatterns = new HashMap<>();
     
     // zero condition -> 0 ord
@@ -397,16 +397,15 @@ public class Dictionary {
     stripOffsets[currentIndex] = currentOffset;
   }
   
-  private FST<IntsRef> affixFST(TreeMap<String,List<Character>> affixes) throws IOException {
+  private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException {
     IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
     Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
-    
     IntsRef scratch = new IntsRef();
-    for (Map.Entry<String,List<Character>> entry : affixes.entrySet()) {
+    for (Map.Entry<String,List<Integer>> entry : affixes.entrySet()) {
       Util.toUTF32(entry.getKey(), scratch);
-      List<Character> entries = entry.getValue();
+      List<Integer> entries = entry.getValue();
       IntsRef output = new IntsRef(entries.size());
-      for (Character c : entries) {
+      for (Integer c : entries) {
         output.ints[output.length++] = c;
       }
       builder.add(scratch, output);
@@ -444,7 +443,7 @@ public class Dictionary {
    * @param seenPatterns map from condition -> index of patterns, for deduplication.
    * @throws IOException Can be thrown while reading the rule
    */
-  private void parseAffix(TreeMap<String,List<Character>> affixes,
+  private void parseAffix(TreeMap<String,List<Integer>> affixes,
                           String header,
                           LineNumberReader reader,
                           String conditionPattern,
@@ -564,13 +563,12 @@ public class Dictionary {
         affixArg = new StringBuilder(affixArg).reverse().toString();
       }
       
-      List<Character> list = affixes.get(affixArg);
+      List<Integer> list = affixes.get(affixArg);
       if (list == null) {
         list = new ArrayList<>();
         affixes.put(affixArg, list);
       }
-      
-      list.add((char)currentAffix);
+      list.add(currentAffix);
       currentAffix++;
     }
   }

Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/Test64kAffixes.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/Test64kAffixes.java?rev=1612349&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/Test64kAffixes.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/Test64kAffixes.java Mon Jul 21 17:07:48 2014
@@ -0,0 +1,69 @@
+package org.apache.lucene.analysis.hunspell;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.LuceneTestCase;
+
+/** Tests that more than 64k affixes work and do not overflow an internal affix number. */
+public class Test64kAffixes extends LuceneTestCase {
+  
+  /** Writes 65536 flag-1 suffixes plus one flag-2 suffix, then stems "drinks" via flag 2. */
+  public void test() throws Exception {
+    File tempDir = createTempDir("64kaffixes");
+    File affix = new File(tempDir, "64kaffixes.aff");
+    File dict = new File(tempDir, "64kaffixes.dic");
+    
+    // try-with-resources: the writer is flushed and closed even if a write fails
+    try (BufferedWriter affixWriter = new BufferedWriter(
+                                      new OutputStreamWriter(
+                                      new FileOutputStream(affix), StandardCharsets.UTF_8))) {
+      // 65536 affixes with flag 1, then a single affix with flag 2
+      affixWriter.write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n");
+      for (int i = 0; i < 65536; i++) {
+        affixWriter.write("SFX 1 0 " + Integer.toHexString(i) + " .\n");
+      }
+      affixWriter.write("SFX 2 Y 1\nSFX 2 0 s\n");
+    }
+    
+    try (BufferedWriter dictWriter = new BufferedWriter(
+                                     new OutputStreamWriter(
+                                     new FileOutputStream(dict), StandardCharsets.UTF_8))) {
+      // drink is tagged with flag 2 (takes -s)
+      dictWriter.write("1\ndrink/2\n");
+    }
+    
+    try (InputStream affStream = new FileInputStream(affix); InputStream dictStream = new FileInputStream(dict)) {
+      Dictionary dictionary = new Dictionary(affStream, dictStream);
+      Stemmer stemmer = new Stemmer(dictionary);
+      // drinks should still stem to drink
+      List<CharsRef> stems = stemmer.stem("drinks");
+      assertEquals(1, stems.size());
+      assertEquals("drink", stems.get(0).toString());
+    }
+  }
+}