You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/03/08 15:45:23 UTC
svn commit: r1575549 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/analysis/
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/
lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/
Author: rmuir
Date: Sat Mar 8 14:45:22 2014
New Revision: 1575549
URL: http://svn.apache.org/r1575549
Log:
LUCENE-5497: fix hunspell escaping/optional conditions
Added:
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java
- copied unchanged from r1575548, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestOptionalCondition.java
- copied unchanged from r1575548, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestOptionalCondition.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.aff
- copied unchanged from r1575548, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.aff
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.dic
- copied unchanged from r1575548, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.dic
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/optional-condition.aff
- copied unchanged from r1575548, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/optional-condition.aff
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/StemmerTestBase.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/broken.aff
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1575549&r1=1575548&r2=1575549&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Sat Mar 8 14:45:22 2014
@@ -84,6 +84,9 @@ Bug fixes
recursive affix application are driven correctly by continuation classes in the affix file.
(Robert Muir)
+* LUCENE-5497: HunspellStemFilter properly handles escaped terms and affixes without conditions.
+ (Robert Muir)
+
Test Framework
* LUCENE-5449: Rename _TestUtil and _TestHelper to remove the leading _.
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java?rev=1575549&r1=1575548&r2=1575549&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java Sat Mar 8 14:45:22 2014
@@ -348,8 +348,10 @@ public class Dictionary {
String line = reader.readLine();
String ruleArgs[] = line.split("\\s+");
- if (ruleArgs.length < 5) {
- throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber());
+ // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
+ // condition is optional
+ if (ruleArgs.length < 4) {
+ throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.getLineNumber());
}
char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
@@ -370,7 +372,7 @@ public class Dictionary {
Arrays.sort(appendFlags);
}
- String condition = ruleArgs[4];
+ String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
// at least the gascon affix file has this issue
if (condition.startsWith("[") && !condition.endsWith("]")) {
condition = condition + "]";
@@ -550,6 +552,24 @@ public class Dictionary {
throw new IllegalArgumentException("Unknown flag type: " + flagType);
}
+ final char FLAG_SEPARATOR = 0x1f; // flag separator after escaping
+
+ String unescapeEntry(String entry) {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < entry.length(); i++) {
+ char ch = entry.charAt(i);
+ if (ch == '\\' && i+1 < entry.length()) {
+ sb.append(entry.charAt(i+1));
+ i++;
+ } else if (ch == '/') {
+ sb.append(FLAG_SEPARATOR);
+ } else {
+ sb.append(ch);
+ }
+ }
+ return sb.toString();
+ }
+
/**
* Reads the dictionary file through the provided InputStreams, building up the words map
*
@@ -572,8 +592,9 @@ public class Dictionary {
String line = lines.readLine(); // first line is number of entries (approximately, sometimes)
while ((line = lines.readLine()) != null) {
+ line = unescapeEntry(line);
if (needsInputCleaning) {
- int flagSep = line.lastIndexOf('/');
+ int flagSep = line.lastIndexOf(FLAG_SEPARATOR);
if (flagSep == -1) {
CharSequence cleansed = cleanInput(line, sb);
writer.write(cleansed.toString().getBytes(IOUtils.CHARSET_UTF_8));
@@ -613,7 +634,7 @@ public class Dictionary {
scratch1.length = o1.length;
for (int i = scratch1.length - 1; i >= 0; i--) {
- if (scratch1.bytes[scratch1.offset + i] == '/') {
+ if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR) {
scratch1.length = i;
break;
}
@@ -624,7 +645,7 @@ public class Dictionary {
scratch2.length = o2.length;
for (int i = scratch2.length - 1; i >= 0; i--) {
- if (scratch2.bytes[scratch2.offset + i] == '/') {
+ if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR) {
scratch2.length = i;
break;
}
@@ -657,7 +678,7 @@ public class Dictionary {
String entry;
char wordForm[];
- int flagSep = line.lastIndexOf('/');
+ int flagSep = line.lastIndexOf(FLAG_SEPARATOR);
if (flagSep == -1) {
wordForm = NOFLAGS;
entry = line;
@@ -837,6 +858,9 @@ public class Dictionary {
}
StringBuilder builder = new StringBuilder();
+ if (rawFlags.length() % 2 == 1) {
+ throw new IllegalArgumentException("Invalid flags (should be even number of characters): " + rawFlags);
+ }
for (int i = 0; i < rawFlags.length(); i+=2) {
char cookedFlag = (char) ((int) rawFlags.charAt(i) + (int) rawFlags.charAt(i + 1));
builder.append(cookedFlag);
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/StemmerTestBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/StemmerTestBase.java?rev=1575549&r1=1575548&r2=1575549&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/StemmerTestBase.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/StemmerTestBase.java Sat Mar 8 14:45:22 2014
@@ -55,7 +55,7 @@ abstract class StemmerTestBase extends L
}
try {
- Dictionary dictionary = new Dictionary(affixStream, Arrays.asList(dictStreams), true);
+ Dictionary dictionary = new Dictionary(affixStream, Arrays.asList(dictStreams), ignoreCase);
stemmer = new Stemmer(dictionary);
} finally {
IOUtils.closeWhileHandlingException(affixStream);
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java?rev=1575549&r1=1575548&r2=1575549&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java Sat Mar 8 14:45:22 2014
@@ -101,10 +101,10 @@ public class TestAllDictionaries2 extend
"hausa_spelling_dictionary-0.2-tb+fx.xpi", "dictionaries/ha-GH.dic", "dictionaries/ha-GH.aff",
"hebrew_spell_checking_dictionary_from_hspell-1.2.0.1-fx+sm+tb.xpi", "dictionaries/he.dic", "dictionaries/he.aff",
"hindi_spell_checker-0.4-fx+tb+sm+sb+fn.xpi", "dictionaries/hi_IN.dic", "dictionaries/hi_IN.aff",
-//BUG! "hungarian_dictionary-1.6.1.1-fx+tb+sm+fn.xpi", "dictionaries/hu_HU.dic", "dictionaries/hu_HU.aff",
+//BUG! "hungarian_dictionary-1.6.1.1-fx+tb+sm+fn.xpi", "dictionaries/hu.dic", "dictionaries/hu.aff",
//BUG! "icelandic_dictionary-1.3-fx+tb+sm.xpi", "dictionaries/is.dic", "dictionaries/is.aff",
"kamus_pengecek_ejaan_bahasa_indonesia-1.1-fx+tb.xpi", "dictionaries/id.dic", "dictionaries/id.aff",
-//BUG! "kannada_spell_checker-2.0.1-tb+sm+fn+an+fx.xpi", "dictionaries/kn.dic", "dictionaries/kn.aff",
+ "kannada_spell_checker-2.0.1-tb+sm+fn+an+fx.xpi", "dictionaries/kn.dic", "dictionaries/kn.aff",
"kashubian_spell_checker_poland-0.9-sm+tb+fx.xpi", "dictionaries/Kaszebsczi.dic", "dictionaries/Kaszebsczi.aff",
"kiswahili_spell_checker-0.3-sb+tb+fn+fx+sm.xpi", "dictionaries/sw_TZ.dic", "dictionaries/sw_TZ.aff",
"kurdish_spell_checker-0.96-fx+tb+sm.xpi", "dictionaries/ku-TR.dic", "dictionaries/ku-TR.aff",
@@ -125,7 +125,7 @@ public class TestAllDictionaries2 extend
"oriya_spell_checker-0.3-fn+tb+fx+sm+sb.xpi", "dictionaries/or-IN.dic", "dictionaries/or-IN.aff",
"polski_slownik_poprawnej_pisowni-1.0.20110621-fx+tb+sm.xpi", "dictionaries/pl.dic", "dictionaries/pl.aff",
"punjabi_spell_checker-0.3-fx+tb+sm+sb+fn.xpi", "dictionaries/pa-IN.dic", "dictionaries/pa-IN.aff",
-//BUG! "romanian_spellchecking_dictionary-1.14-sm+tb+fx.xpi", "dictionaries/ro_RO-ante1993.dic", "dictionaries/ro_RO-ante1993.aff",
+ "romanian_spellchecking_dictionary-1.14-sm+tb+fx.xpi", "dictionaries/ro_RO-ante1993.dic", "dictionaries/ro_RO-ante1993.aff",
//BUG! "russian_hunspell_dictionary-1.0.20131101-tb+sm+fn+fx.xpi", "dictionaries/ru_RU.dic", "dictionaries/ru_RU.aff",
"sanskrit_spell_checker-1.1-fx+tb+sm+sb+fn.xpi", "dictionaries/sa_IN.dic", "dictionaries/sa_IN.aff",
"scottish_gaelic_spell_checker-2.7-tb+fx+sm.xpi", "dictionaries/gd-GB.dic", "dictionaries/gd-GB.aff",
@@ -161,7 +161,7 @@ public class TestAllDictionaries2 extend
"verificador_ortografico_para_portugues_do_brasil-2.3-3.2b1-tb+sm+fn+fx.xpi", "dictionaries/pt_BR.dic", "dictionaries/pt_BR.aff",
"vietnamese_dictionary-2.1.0.159-an+sm+tb+fx+fn.xpi", "dictionaries/vi-DauCu.dic", "dictionaries/vi-DauCu.aff",
"vietnamese_dictionary-2.1.0.159-an+sm+tb+fx+fn.xpi", "dictionaries/vi-DauMoi.dic", "dictionaries/vi-DauMoi.aff",
-//BUG! "woordenboek_nederlands-3.1.1-sm+tb+fx+fn.xpi", "dictionaries/nl.dic", "dictionaries/nl.aff",
+ "woordenboek_nederlands-3.1.1-sm+tb+fx+fn.xpi", "dictionaries/nl.dic", "dictionaries/nl.aff",
"xhosa_spell_checker-20110323-tb+fn+fx+sm.xpi", "dictionaries/xh-ZA.dic", "dictionaries/xh-ZA.aff",
"xuxen-4.0.1-fx+tb+sm.xpi", "dictionaries/eu.dic", "dictionaries/eu.aff",
"yiddish_spell_checker_yivo-0.0.3-sm+fn+fx+tb.xpi", "dictionaries/yi.dic", "dictionaries/yi.aff",
@@ -202,7 +202,7 @@ public class TestAllDictionaries2 extend
}
public void testOneDictionary() throws Exception {
- String toTest = "hausa_spelling_dictionary-0.2-tb+fx.xpi";
+ String toTest = "woordenboek_nederlands-3.1.1-sm+tb+fx+fn.xpi";
for (int i = 0; i < tests.length; i++) {
if (tests[i].equals(toTest)) {
File f = new File(DICTIONARY_HOME, tests[i]);
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java?rev=1575549&r1=1575548&r2=1575549&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java Sat Mar 8 14:45:22 2014
@@ -87,7 +87,7 @@ public class TestDictionary extends Luce
new Dictionary(affixStream, dictStream);
fail("didn't get expected exception");
} catch (ParseException expected) {
- assertEquals("The affix file contains a rule with less than five elements", expected.getMessage());
+ assertTrue(expected.getMessage().startsWith("The affix file contains a rule with less than four elements"));
assertEquals(24, expected.getErrorOffset());
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/broken.aff
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/broken.aff?rev=1575549&r1=1575548&r2=1575549&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/broken.aff (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/broken.aff Sat Mar 8 14:45:22 2014
@@ -19,6 +19,6 @@ SFX E 0 d o
PFX B Y 1
PFX B 0 s o
-#wrong rule (only 4 elements)
+#wrong rule (only 3 elements)
PFX A0 Y 1
-PFX A0 0 a
\ No newline at end of file
+PFX A0 0
\ No newline at end of file