You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/07/21 19:07:49 UTC
svn commit: r1612349 - in /lucene/dev/trunk/lucene: CHANGES.txt
analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
analysis/common/src/test/org/apache/lucene/analysis/hunspell/Test64kAffixes.java
Author: rmuir
Date: Mon Jul 21 17:07:48 2014
New Revision: 1612349
URL: http://svn.apache.org/r1612349
Log:
LUCENE-5838: fix hunspell when .aff file has over 64k affixes
Added:
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/Test64kAffixes.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1612349&r1=1612348&r2=1612349&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Mon Jul 21 17:07:48 2014
@@ -195,6 +195,8 @@ Bug Fixes
* LUCENE-5827: Make all Directory implementations correctly fail with
IllegalArgumentException if slices are out of bounds. (Uwe SChindler)
+* LUCENE-5838: Fix hunspell when the .aff file has over 64k affixes. (Robert Muir)
+
Test Framework
* LUCENE-5786: Unflushed/ truncated events file (hung testing subprocess).
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java?rev=1612349&r1=1612348&r2=1612349&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java Mon Jul 21 17:07:48 2014
@@ -289,8 +289,8 @@ public class Dictionary {
* @throws IOException Can be thrown while reading from the InputStream
*/
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
- TreeMap<String, List<Character>> prefixes = new TreeMap<>();
- TreeMap<String, List<Character>> suffixes = new TreeMap<>();
+ TreeMap<String, List<Integer>> prefixes = new TreeMap<>();
+ TreeMap<String, List<Integer>> suffixes = new TreeMap<>();
Map<String,Integer> seenPatterns = new HashMap<>();
// zero condition -> 0 ord
@@ -397,16 +397,15 @@ public class Dictionary {
stripOffsets[currentIndex] = currentOffset;
}
- private FST<IntsRef> affixFST(TreeMap<String,List<Character>> affixes) throws IOException {
+ private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException {
IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
-
IntsRef scratch = new IntsRef();
- for (Map.Entry<String,List<Character>> entry : affixes.entrySet()) {
+ for (Map.Entry<String,List<Integer>> entry : affixes.entrySet()) {
Util.toUTF32(entry.getKey(), scratch);
- List<Character> entries = entry.getValue();
+ List<Integer> entries = entry.getValue();
IntsRef output = new IntsRef(entries.size());
- for (Character c : entries) {
+ for (Integer c : entries) {
output.ints[output.length++] = c;
}
builder.add(scratch, output);
@@ -444,7 +443,7 @@ public class Dictionary {
* @param seenPatterns map from condition -> index of patterns, for deduplication.
* @throws IOException Can be thrown while reading the rule
*/
- private void parseAffix(TreeMap<String,List<Character>> affixes,
+ private void parseAffix(TreeMap<String,List<Integer>> affixes,
String header,
LineNumberReader reader,
String conditionPattern,
@@ -564,13 +563,12 @@ public class Dictionary {
affixArg = new StringBuilder(affixArg).reverse().toString();
}
- List<Character> list = affixes.get(affixArg);
+ List<Integer> list = affixes.get(affixArg);
if (list == null) {
list = new ArrayList<>();
affixes.put(affixArg, list);
}
-
- list.add((char)currentAffix);
+ list.add(currentAffix);
currentAffix++;
}
}
Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/Test64kAffixes.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/Test64kAffixes.java?rev=1612349&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/Test64kAffixes.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/Test64kAffixes.java Mon Jul 21 17:07:48 2014
@@ -0,0 +1,69 @@
+package org.apache.lucene.analysis.hunspell;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.LuceneTestCase;
+
+/** Tests that more than 64k affixes actually work and don't overflow the internal affix numbering. */
+public class Test64kAffixes extends LuceneTestCase {
+
+  /** Writes a .aff file with 65536 + 1 suffix rules and a one-word .dic file, then stems "drinks". */
+  public void test() throws Exception {
+    File tempDir = createTempDir("64kaffixes");
+    File affix = new File(tempDir, "64kaffixes.aff");
+    File dict = new File(tempDir, "64kaffixes.dic");
+
+    // try-with-resources: the writer is flushed and closed even if a write throws
+    try (BufferedWriter affixWriter = new BufferedWriter(
+                                        new OutputStreamWriter(
+                                          new FileOutputStream(affix), StandardCharsets.UTF_8))) {
+      // 65536 affixes with flag 1, then an affix with flag 2
+      affixWriter.write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n");
+      for (int i = 0; i < 65536; i++) {
+        affixWriter.write("SFX 1 0 " + Integer.toHexString(i) + " .\n");
+      }
+      affixWriter.write("SFX 2 Y 1\nSFX 2 0 s\n");
+    }
+
+    try (BufferedWriter dictWriter = new BufferedWriter(
+                                       new OutputStreamWriter(
+                                         new FileOutputStream(dict), StandardCharsets.UTF_8))) {
+      // drink flagged with affix 2 (takes -s)
+      dictWriter.write("1\ndrink/2\n");
+    }
+
+    try (InputStream affStream = new FileInputStream(affix); InputStream dictStream = new FileInputStream(dict)) {
+      Dictionary dictionary = new Dictionary(affStream, dictStream);
+      Stemmer stemmer = new Stemmer(dictionary);
+      // drinks should still stem to drink
+      List<CharsRef> stems = stemmer.stem("drinks");
+      assertEquals(1, stems.size());
+      assertEquals("drink", stems.get(0).toString());
+    }
+  }
+}