You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/01/22 11:02:14 UTC
[lucene-solr] branch master updated: LUCENE-9684: Hunspell: support COMPOUNDRULE (#2228)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new d796813 LUCENE-9684: Hunspell: support COMPOUNDRULE (#2228)
d796813 is described below
commit d7968130c3f5d7166c10756c37b5ed644414cd1d
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Fri Jan 22 12:01:53 2021 +0100
LUCENE-9684: Hunspell: support COMPOUNDRULE (#2228)
---
lucene/CHANGES.txt | 4 +-
.../lucene/analysis/hunspell/CompoundRule.java | 105 +++++++++++++++++++++
.../lucene/analysis/hunspell/Dictionary.java | 81 +++++++++++++---
.../lucene/analysis/hunspell/SpellChecker.java | 87 ++++++++++++++++-
.../apache/lucene/analysis/hunspell/Stemmer.java | 12 ++-
.../apache/lucene/analysis/hunspell/WordCase.java | 12 +--
.../lucene/analysis/hunspell/SpellCheckerTest.java | 32 +++++++
.../lucene/analysis/hunspell/TestDictionary.java | 23 +++++
.../lucene/analysis/hunspell/compoundrule.aff | 3 +
.../lucene/analysis/hunspell/compoundrule.dic | 5 +
.../lucene/analysis/hunspell/compoundrule.good | 2 +
.../lucene/analysis/hunspell/compoundrule.wrong | 39 ++++++++
.../lucene/analysis/hunspell/compoundrule2.aff | 3 +
.../lucene/analysis/hunspell/compoundrule2.dic | 5 +
.../lucene/analysis/hunspell/compoundrule2.good | 37 ++++++++
.../lucene/analysis/hunspell/compoundrule2.wrong | 8 ++
.../lucene/analysis/hunspell/compoundrule3.aff | 3 +
.../lucene/analysis/hunspell/compoundrule3.dic | 5 +
.../lucene/analysis/hunspell/compoundrule3.good | 7 ++
.../lucene/analysis/hunspell/compoundrule3.wrong | 41 ++++++++
.../lucene/analysis/hunspell/compoundrule4.aff | 7 ++
.../lucene/analysis/hunspell/compoundrule4.dic | 24 +++++
.../lucene/analysis/hunspell/compoundrule4.good | 31 ++++++
.../lucene/analysis/hunspell/compoundrule4.wrong | 5 +
.../lucene/analysis/hunspell/compoundrule5.aff | 7 ++
.../lucene/analysis/hunspell/compoundrule5.dic | 14 +++
.../lucene/analysis/hunspell/compoundrule5.good | 7 ++
.../lucene/analysis/hunspell/compoundrule5.wrong | 1 +
.../lucene/analysis/hunspell/compoundrule6.aff | 4 +
.../lucene/analysis/hunspell/compoundrule6.dic | 5 +
.../lucene/analysis/hunspell/compoundrule6.good | 4 +
.../lucene/analysis/hunspell/compoundrule6.wrong | 4 +
.../lucene/analysis/hunspell/compoundrule7.aff | 8 ++
.../lucene/analysis/hunspell/compoundrule7.dic | 24 +++++
.../lucene/analysis/hunspell/compoundrule7.good | 29 ++++++
.../lucene/analysis/hunspell/compoundrule7.wrong | 5 +
.../lucene/analysis/hunspell/compoundrule8.aff | 8 ++
.../lucene/analysis/hunspell/compoundrule8.dic | 24 +++++
.../lucene/analysis/hunspell/compoundrule8.good | 29 ++++++
.../lucene/analysis/hunspell/compoundrule8.wrong | 5 +
40 files changed, 730 insertions(+), 29 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index fce70e9..f99553b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -86,8 +86,8 @@ API Changes
Improvements
-* LUCENE-9665 LUCENE-9676 LUCENE-9667 : Hunspell improvements: add SpellChecker API, support default encoding and
- BREAK/FORBIDDENWORD affix rules, improve stemming of all-caps words (Peter Gromov)
+* LUCENE-9687: Hunspell support improvements: add SpellChecker API, support default encoding and
+ BREAK/FORBIDDENWORD/COMPOUNDRULE affix rules, improve stemming of all-caps words (Peter Gromov)
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
(Dawid Weiss)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CompoundRule.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CompoundRule.java
new file mode 100644
index 0000000..0f89de8
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CompoundRule.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.util.List;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+/** One parsed COMPOUNDRULE pattern, matched against the dictionary forms of consecutive word parts. */
+class CompoundRule {
+ private final char[] data; // pattern chars: flags, each optionally followed by '*' or '?'
+ private final Dictionary dictionary;
+ /** Parses {@code rule}, running "(...)" groups and the text around them through the flag parser. */
+ CompoundRule(String rule, Dictionary dictionary) {
+ this.dictionary = dictionary;
+ StringBuilder parsedFlags = new StringBuilder();
+ int pos = 0;
+ while (pos < rule.length()) {
+ int lParen = rule.indexOf("(", pos);
+ if (lParen < 0) { // no further group: parse the whole remainder at once
+ parsedFlags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(pos)));
+ break;
+ }
+ // Parse the text in front of the group, then locate the group's closing parenthesis.
+ parsedFlags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(pos, lParen)));
+ int rParen = rule.indexOf(')', lParen + 1);
+ if (rParen < 0) {
+ throw new IllegalArgumentException("Unmatched parentheses: " + rule);
+ }
+ // The content between the parentheses is parsed as flags on its own.
+ parsedFlags.append(
+ dictionary.flagParsingStrategy.parseFlags(rule.substring(lParen + 1, rParen)));
+ pos = rParen + 1;
+ if (pos < rule.length() && (rule.charAt(pos) == '?' || rule.charAt(pos) == '*')) {
+ parsedFlags.append(rule.charAt(pos++)); // copy the quantifier that follows the group
+ }
+ }
+ data = parsedFlags.toString().toCharArray();
+ }
+ /** Returns true if {@code words} can be the beginning of a word sequence accepted by this rule. */
+ boolean mayMatch(List<IntsRef> words, BytesRef scratch) {
+ return match(words, 0, 0, scratch, false);
+ }
+ /** Returns true if {@code words}, taken as a whole, is accepted by this rule. */
+ boolean fullyMatches(List<IntsRef> words, BytesRef scratch) {
+ return match(words, 0, 0, scratch, true);
+ }
+ /** Backtracking matcher aligning the pattern from patternIndex with the words from wordIndex. */
+ private boolean match(
+ List<IntsRef> words, int patternIndex, int wordIndex, BytesRef scratch, boolean fully) {
+ if (patternIndex >= data.length) {
+ return wordIndex >= words.size(); // pattern used up: all words must be used up as well
+ }
+ if (wordIndex >= words.size() && !fully) {
+ return true; // prefix mode: every given word was matched, the rest of the pattern is ignored
+ }
+ // The flag at this position; a '*' or '?' right after it in the pattern quantifies it.
+ char flag = data[patternIndex];
+ if (patternIndex < data.length - 1 && data[patternIndex + 1] == '*') {
+ int startWI = wordIndex; // greedy: first consume every consecutive word that has the flag
+ while (wordIndex < words.size() && dictionary.hasFlag(words.get(wordIndex), flag, scratch)) {
+ wordIndex++;
+ }
+ // Try the rest of the pattern after each repetition count, from the longest down to zero.
+ while (wordIndex >= startWI) {
+ if (match(words, patternIndex + 2, wordIndex, scratch, fully)) {
+ return true;
+ }
+
+ wordIndex--;
+ }
+ return false;
+ }
+
+ boolean currentWordMatches =
+ wordIndex < words.size() && dictionary.hasFlag(words.get(wordIndex), flag, scratch);
+ // '?': prefer consuming the current word, otherwise continue with the flag skipped.
+ if (patternIndex < data.length - 1 && data[patternIndex + 1] == '?') {
+ if (currentWordMatches && match(words, patternIndex + 2, wordIndex + 1, scratch, fully)) {
+ return true;
+ }
+ return match(words, patternIndex + 2, wordIndex, scratch, fully);
+ }
+ // No quantifier: the current word must have the flag and the remainder must match.
+ return currentWordMatches && match(words, patternIndex + 1, wordIndex + 1, scratch, fully);
+ }
+ /** Returns the parsed pattern characters as a string. */
+ @Override
+ public String toString() {
+ return new String(data);
+ }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 19cfaa3..2c620a2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -92,6 +92,8 @@ public class Dictionary {
private static final String LANG_KEY = "LANG";
private static final String BREAK_KEY = "BREAK";
private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD";
+ private static final String COMPOUNDMIN_KEY = "COMPOUNDMIN";
+ private static final String COMPOUNDRULE_KEY = "COMPOUNDRULE";
private static final String KEEPCASE_KEY = "KEEPCASE";
private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
@@ -136,7 +138,7 @@ public class Dictionary {
static final int AFFIX_APPEND = 3;
// Default flag parsing strategy
- private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();
+ FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();
// AF entries
private String[] aliases;
@@ -163,6 +165,8 @@ public class Dictionary {
int needaffix = -1; // needaffix flag, or -1 if one is not defined
int forbiddenword = -1; // forbiddenword flag, or -1 if one is not defined
int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined
+ int compoundMin = 3;
+ List<CompoundRule> compoundRules; // nullable
// ignored characters (dictionary, affix, inputs)
private char[] ignore;
@@ -419,6 +423,18 @@ public class Dictionary {
throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber());
}
forbiddenword = flagParsingStrategy.parseFlag(parts[1]);
+ } else if (line.startsWith(COMPOUNDMIN_KEY)) {
+ String[] parts = line.split("\\s+");
+ if (parts.length != 2) {
+ throw new ParseException("Illegal COMPOUNDMIN declaration", reader.getLineNumber());
+ }
+ compoundMin = Math.max(1, Integer.parseInt(parts[1]));
+ } else if (line.startsWith(COMPOUNDRULE_KEY)) {
+ String[] parts = line.split("\\s+");
+ if (parts.length != 2) {
+ throw new ParseException("Illegal COMPOUNDRULE header", reader.getLineNumber());
+ }
+ this.compoundRules = parseCompoundRules(reader, Integer.parseInt(parts[1]));
}
}
@@ -442,6 +458,21 @@ public class Dictionary {
stripOffsets[currentIndex] = currentOffset;
}
+ /** Reads {@code num} "COMPOUNDRULE rule" lines; malformed or missing lines raise ParseException. */
+ private List<CompoundRule> parseCompoundRules(LineNumberReader reader, int num)
+ throws IOException, ParseException {
+ List<CompoundRule> compoundRules = new ArrayList<>();
+ for (int i = 0; i < num; i++) {
+ String line = reader.readLine(); // null if the input ends before all announced rules
+ String[] parts = line == null ? new String[0] : line.split("\\s+");
+ if (parts.length != 2 || !line.startsWith(COMPOUNDRULE_KEY)) {
+ throw new ParseException("COMPOUNDRULE rule expected", reader.getLineNumber());
+ }
+ compoundRules.add(new CompoundRule(parts[1], this));
+ }
+ return compoundRules;
+ }
+
private Breaks parseBreaks(LineNumberReader reader, String line)
throws IOException, ParseException {
Set<String> starting = new LinkedHashSet<>();
@@ -910,7 +941,7 @@ public class Dictionary {
reuse.append(caseFold(word.charAt(i)));
}
reuse.append(FLAG_SEPARATOR);
- reuse.append(HIDDEN_FLAG);
+ flagParsingStrategy.appendFlag(HIDDEN_FLAG, reuse);
reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
}
@@ -1188,16 +1219,19 @@ public class Dictionary {
return null;
}
- boolean isForbiddenWord(char[] word, BytesRef scratch) {
+ boolean isForbiddenWord(char[] word, int length, BytesRef scratch) {
if (forbiddenword != -1) {
- IntsRef forms = lookupWord(word, 0, word.length);
- if (forms != null) {
- int formStep = formStep();
- for (int i = 0; i < forms.length; i += formStep) {
- if (hasFlag(forms.ints[forms.offset + i], (char) forbiddenword, scratch)) {
- return true;
- }
- }
+ IntsRef forms = lookupWord(word, 0, length);
+ return forms != null && hasFlag(forms, (char) forbiddenword, scratch);
+ }
+ return false;
+ }
+
+ boolean hasFlag(IntsRef forms, char flag, BytesRef scratch) {
+ int formStep = formStep();
+ for (int i = 0; i < forms.length; i += formStep) {
+ if (hasFlag(forms.ints[forms.offset + i], flag, scratch)) {
+ return true;
}
}
return false;
@@ -1227,6 +1261,8 @@ public class Dictionary {
* @return Parsed flags
*/
abstract char[] parseFlags(String rawFlags);
+
+ abstract void appendFlag(char flag, StringBuilder to);
}
/**
@@ -1238,6 +1274,11 @@ public class Dictionary {
public char[] parseFlags(String rawFlags) {
return rawFlags.toCharArray();
}
+
+ @Override
+ void appendFlag(char flag, StringBuilder to) {
+ to.append(flag); // in this format one flag is exactly one character
+ }
}
/**
@@ -1266,6 +1307,14 @@ public class Dictionary {
}
return flags;
}
+
+ @Override
+ void appendFlag(char flag, StringBuilder to) {
+ // Flags are written as decimal numbers; a non-empty buffer gets a comma in front of the number.
+ boolean needsDelimiter = to.length() > 0;
+ if (needsDelimiter) to.append(",");
+ to.append((int) flag);
+ }
}
/**
@@ -1300,6 +1349,16 @@ public class Dictionary {
builder.getChars(0, builder.length(), flags, 0);
return flags;
}
+
+ @Override
+ void appendFlag(char flag, StringBuilder to) {
+ // A flag is written as two characters: its high byte followed by its low byte.
+ to.append((char) (flag >> 8)).append((char) (flag & 0xff));
+ }
+ }
+
+ boolean hasCompounding() {
+ return compoundRules != null; // non-null once a COMPOUNDRULE header has been parsed
}
boolean hasFlag(int entryId, char flag, BytesRef scratch) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
index a3e765b..66e21a1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -16,7 +16,10 @@
*/
package org.apache.lucene.analysis.hunspell;
+import java.util.ArrayList;
+import java.util.List;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/**
* A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe
@@ -37,26 +40,100 @@ public class SpellChecker {
public boolean spell(String word) {
if (word.isEmpty()) return true;
- char[] wordChars = word.toCharArray();
- if (dictionary.isForbiddenWord(wordChars, scratch)) {
- return false;
+ if (dictionary.needsInputCleaning) {
+ word = dictionary.cleanInput(word, new StringBuilder()).toString();
}
if (isNumber(word)) {
return true;
}
- if (!stemmer.stem(wordChars, word.length()).isEmpty()) {
+ char[] wordChars = word.toCharArray();
+ if (checkWord(wordChars, wordChars.length, false)) {
return true;
}
- if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
+ WordCase wc = stemmer.caseOf(wordChars, wordChars.length);
+ if ((wc == WordCase.UPPER || wc == WordCase.TITLE) && checkCaseVariants(wordChars, wc)) {
+ return true;
+ }
+
+ if (dictionary.breaks.isNotEmpty()
+ && !hasTooManyBreakOccurrences(word)
+ && !dictionary.isForbiddenWord(wordChars, word.length(), scratch)) {
return tryBreaks(word);
}
return false;
}
+ private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {
+ // All-caps input is tried in title case first; folding the first letter too is the last resort.
+ int len = wordChars.length;
+ boolean allCaps = wordCase == WordCase.UPPER;
+ char[] variant = allCaps ? stemmer.caseFoldTitle(wordChars, len) : wordChars;
+ if (allCaps && checkWord(variant, len, true)) {
+ return true;
+ }
+ return checkWord(stemmer.caseFoldLower(variant, len), len, true);
+ }
+
+ private boolean checkWord(char[] wordChars, int length, boolean caseVariant) {
+ // Explicitly forbidden words are rejected before any stemming or compounding is attempted.
+ boolean forbidden = dictionary.isForbiddenWord(wordChars, length, scratch);
+ if (forbidden) {
+ return false;
+ }
+
+ // Accept anything the stemmer can analyze; otherwise try to explain the word as a compound.
+ boolean hasStems = !stemmer.doStem(wordChars, length, caseVariant).isEmpty();
+ if (hasStems) {
+ return true;
+ }
+
+ return dictionary.hasCompounding() && checkCompounds(wordChars, 0, length, new ArrayList<>());
+ }
+ /** Recursively splits wordChars[offset, offset+length) into parts that some compound rule accepts. */
+ private boolean checkCompounds(char[] wordChars, int offset, int length, List<IntsRef> words) {
+ if (words.size() >= 100) return false; // hard cap on the number of parts collected so far
+ // Try every break that leaves at least compoundMin chars on both sides of it.
+ int limit = length - dictionary.compoundMin + 1;
+ for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
+ IntsRef forms = dictionary.lookupWord(wordChars, offset, breakPos);
+ if (forms != null) {
+ words.add(forms);
+ // Go on only if the parts so far can still be the start of a match of some rule.
+ if (dictionary.compoundRules != null
+ && dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words, scratch))) {
+ if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) {
+ return true;
+ }
+ // The remainder is not a valid last part as a whole: try splitting it further.
+ if (checkCompounds(wordChars, offset + breakPos, length - breakPos, words)) {
+ return true;
+ }
+ }
+ // Backtrack: drop this part before trying the next break position.
+ words.remove(words.size() - 1);
+ }
+ }
+
+ return false;
+ }
+
+ private boolean checkLastCompoundPart(
+ char[] wordChars, int start, int length, List<IntsRef> words) {
+ IntsRef lastPart = dictionary.lookupWord(wordChars, start, length);
+ if (lastPart == null) {
+ return false;
+ }
+ List<CompoundRule> rules = dictionary.compoundRules;
+ words.add(lastPart); // part of the shared list only for the duration of the check below
+ boolean matched = rules != null && rules.stream().anyMatch(r -> r.fullyMatches(words, scratch));
+ words.remove(words.size() - 1);
+ return matched;
+ }
+
private static boolean isNumber(String s) {
int i = 0;
while (i < s.length()) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index 1355627..3bb46a7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -112,8 +112,8 @@ final class Stemmer {
private char[] titleBuffer = new char[8];
/** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
- private WordCase caseOf(char[] word, int length) {
- if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
+ WordCase caseOf(char[] word, int length) {
+ if (dictionary.ignoreCase || length == 0 || Character.isLowerCase(word[0])) {
return WordCase.MIXED;
}
@@ -121,22 +121,24 @@ final class Stemmer {
}
/** folds titlecase variant of word to titleBuffer */
- private void caseFoldTitle(char[] word, int length) {
+ char[] caseFoldTitle(char[] word, int length) {
titleBuffer = ArrayUtil.grow(titleBuffer, length);
System.arraycopy(word, 0, titleBuffer, 0, length);
for (int i = 1; i < length; i++) {
titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
}
+ return titleBuffer;
}
/** folds lowercase variant of word (title cased) to lowerBuffer */
- private void caseFoldLower(char[] word, int length) {
+ char[] caseFoldLower(char[] word, int length) {
lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
System.arraycopy(word, 0, lowerBuffer, 0, length);
lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
+ return lowerBuffer;
}
- private List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
+ List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
List<CharsRef> stems = new ArrayList<>();
IntsRef forms = dictionary.lookupWord(word, 0, length);
if (forms != null) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
index 7d9e2e7..04adf7a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
@@ -23,7 +23,7 @@ enum WordCase {
MIXED;
static WordCase caseOf(char[] word, int length) {
- boolean capitalized = Character.isUpperCase(word[0]);
+ boolean startsWithLower = Character.isLowerCase(word[0]);
boolean seenUpper = false;
boolean seenLower = false;
@@ -34,11 +34,11 @@ enum WordCase {
if (seenUpper && seenLower) break;
}
- return get(capitalized, seenUpper, seenLower);
+ return get(startsWithLower, seenUpper, seenLower);
}
static WordCase caseOf(CharSequence word, int length) {
- boolean capitalized = Character.isUpperCase(word.charAt(0));
+ boolean startsWithLower = Character.isLowerCase(word.charAt(0));
boolean seenUpper = false;
boolean seenLower = false;
@@ -49,11 +49,11 @@ enum WordCase {
if (seenUpper && seenLower) break;
}
- return get(capitalized, seenUpper, seenLower);
+ return get(startsWithLower, seenUpper, seenLower);
}
- private static WordCase get(boolean capitalized, boolean seenUpper, boolean seenLower) {
- if (capitalized) {
+ private static WordCase get(boolean startsWithLower, boolean seenUpper, boolean seenLower) {
+ if (!startsWithLower) {
return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED;
}
return seenUpper ? MIXED : LOWER;
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index a478dda..cfa1719 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -43,6 +43,38 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("breakoff");
}
+ public void testCompoundrule() throws Exception {
+ doTest("compoundrule"); // fixture declares COMPOUNDRULE ABC
+ }
+
+ public void testCompoundrule2() throws Exception {
+ doTest("compoundrule2"); // fixture declares COMPOUNDRULE A*B*C*
+ }
+
+ public void testCompoundrule3() throws Exception {
+ doTest("compoundrule3"); // fixture declares COMPOUNDRULE A?B?C?
+ }
+
+ public void testCompoundrule4() throws Exception {
+ doTest("compoundrule4"); // English ordinal numbers: rules n*1t and n*mp
+ }
+
+ public void testCompoundrule5() throws Exception {
+ doTest("compoundrule5"); // number + percent: rules N*%? and NN*.NN*%?
+ }
+
+ public void testCompoundrule6() throws Exception {
+ doTest("compoundrule6"); // fixture declares rules A*A and A*AAB*BBBC*C
+ }
+
+ public void testCompoundrule7() throws Exception {
+ doTest("compoundrule7"); // English ordinal numbers with parenthesized long flags
+ }
+
+ public void testCompoundrule8() throws Exception {
+ doTest("compoundrule8");
+ }
+
protected void doTest(String name) throws Exception {
InputStream affixStream =
Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
index 5e8fdff..a0ece78 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -22,6 +22,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
+import java.util.Random;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
@@ -33,6 +34,7 @@ import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;
+import org.junit.Test;
public class TestDictionary extends LuceneTestCase {
@@ -268,6 +270,27 @@ public class TestDictionary extends LuceneTestCase {
assertNotNull(Dictionary.getFlagParsingStrategy("FLAG UTF-8"));
}
+ @Test
+ public void testFlagSerialization() { // appendFlag followed by parseFlags must be lossless
+ Random r = random();
+ char[] flags = new char[r.nextInt(10)]; // between 0 and 9 random flags
+ for (int i = 0; i < flags.length; i++) {
+ flags[i] = (char) r.nextInt(Character.MAX_VALUE);
+ }
+ // The same flags are pushed through each of these flag formats in turn.
+ String[] flagLines = {"FLAG long", "FLAG UTF-8", "FLAG num"};
+ for (String flagLine : flagLines) {
+ Dictionary.FlagParsingStrategy strategy = Dictionary.getFlagParsingStrategy(flagLine);
+ StringBuilder serialized = new StringBuilder();
+ for (char flag : flags) {
+ strategy.appendFlag(flag, serialized);
+ }
+ // Round trip: parsing the serialized text must give back the original flags.
+ char[] deserialized = strategy.parseFlags(serialized.toString());
+ assertEquals(new String(flags), new String(deserialized));
+ }
+ }
+
private Directory getDirectory() {
return newDirectory();
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.aff
new file mode 100644
index 0000000..09309e0
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.aff
@@ -0,0 +1,3 @@
+COMPOUNDMIN 1
+COMPOUNDRULE 1
+COMPOUNDRULE ABC
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.dic
new file mode 100644
index 0000000..b11e829
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.dic
@@ -0,0 +1,5 @@
+3
+a/A
+b/B
+c/BC
+
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.good
new file mode 100644
index 0000000..c7a0763
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.good
@@ -0,0 +1,2 @@
+abc
+acc
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.wrong
new file mode 100644
index 0000000..bc151ea
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.wrong
@@ -0,0 +1,39 @@
+ba
+aaabaaa
+bbaaa
+aaaaba
+bbbbbaa
+aa
+aaa
+aaaa
+ab
+aab
+aaab
+aaaab
+abb
+aabb
+aaabbb
+bb
+bbb
+bbbb
+aaab
+abcc
+abbc
+abbcc
+aabc
+aabcc
+aabbc
+aabbcc
+aaabbbccc
+ac
+aac
+aacc
+aaaccc
+bc
+bcc
+bbc
+bbcc
+bbbccc
+cc
+ccc
+cccccc
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.aff
new file mode 100644
index 0000000..e4b86a5
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.aff
@@ -0,0 +1,3 @@
+COMPOUNDMIN 1
+COMPOUNDRULE 1
+COMPOUNDRULE A*B*C*
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.dic
new file mode 100644
index 0000000..7d07bbc
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.dic
@@ -0,0 +1,5 @@
+3
+a/A
+b/B
+c/C
+
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.good
new file mode 100644
index 0000000..de743bb
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.good
@@ -0,0 +1,37 @@
+aa
+aaa
+aaaa
+ab
+aab
+aaab
+aaaab
+abb
+aabb
+aaabbb
+bb
+bbb
+bbbb
+aaab
+abc
+abcc
+abbc
+abbcc
+aabc
+aabcc
+aabbc
+aabbcc
+aaabbbccc
+ac
+acc
+aac
+aacc
+aaaccc
+bc
+bcc
+bbc
+bbcc
+bbbccc
+cc
+ccc
+cccccc
+abcc
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.wrong
new file mode 100644
index 0000000..9e5d38d
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.wrong
@@ -0,0 +1,8 @@
+ba
+aaabaaa
+bbaaa
+aaaaba
+bbbbbaa
+cba
+cab
+acb
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.aff
new file mode 100644
index 0000000..0053145
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.aff
@@ -0,0 +1,3 @@
+COMPOUNDMIN 1
+COMPOUNDRULE 1
+COMPOUNDRULE A?B?C?
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.dic
new file mode 100644
index 0000000..7d07bbc
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.dic
@@ -0,0 +1,5 @@
+3
+a/A
+b/B
+c/C
+
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.good
new file mode 100644
index 0000000..7f51889
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.good
@@ -0,0 +1,7 @@
+a
+b
+c
+ab
+abc
+ac
+bc
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.wrong
new file mode 100644
index 0000000..6bd1d80
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.wrong
@@ -0,0 +1,41 @@
+aa
+aaa
+aaaa
+aab
+aaab
+aaaab
+abb
+aabb
+aaabbb
+bb
+bbb
+bbbb
+aaab
+abcc
+abbc
+abbcc
+aabc
+aabcc
+aabbc
+aabbcc
+aaabbbccc
+acc
+aac
+aacc
+aaaccc
+bcc
+bbc
+bbcc
+bbbccc
+cc
+ccc
+cccccc
+abcc
+ba
+aaabaaa
+bbaaa
+aaaaba
+bbbbbaa
+cba
+cab
+acb
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff
new file mode 100644
index 0000000..8a9996c
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff
@@ -0,0 +1,7 @@
+# English ordinal numbers
+WORDCHARS 0123456789
+COMPOUNDMIN 1
+ONLYINCOMPOUND c
+COMPOUNDRULE 2
+COMPOUNDRULE n*1t
+COMPOUNDRULE n*mp
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic
new file mode 100644
index 0000000..ced0735
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic
@@ -0,0 +1,24 @@
+22
+0/nm
+1/n1
+2/nm
+3/nm
+4/nm
+5/nm
+6/nm
+7/nm
+8/nm
+9/nm
+0th/pt
+1st/p
+1th/tc
+2nd/p
+2th/tc
+3rd/p
+3th/tc
+4th/pt
+5th/pt
+6th/pt
+7th/pt
+8th/pt
+9th/pt
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good
new file mode 100644
index 0000000..8694943
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good
@@ -0,0 +1,31 @@
+1st
+2nd
+3rd
+4th
+5th
+6th
+7th
+8th
+9th
+10th
+11th
+12th
+13th
+14th
+15th
+16th
+17th
+18th
+19th
+20th
+21st
+22nd
+23rd
+24th
+25th
+100th
+1000th
+10001st
+10011th
+1ST
+42ND
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.wrong
new file mode 100644
index 0000000..99f28e7
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.wrong
@@ -0,0 +1,5 @@
+1th
+2th
+3th
+10001th
+10011st
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.aff
new file mode 100644
index 0000000..4650246
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.aff
@@ -0,0 +1,7 @@
+# number + percent
+SET UTF-8
+COMPOUNDMIN 1
+COMPOUNDRULE 2
+COMPOUNDRULE N*%?
+COMPOUNDRULE NN*.NN*%?
+WORDCHARS 0123456789‰.
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.dic
new file mode 100644
index 0000000..eeeffda
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.dic
@@ -0,0 +1,14 @@
+13
+0/N po:num
+1/N po:num
+2/N po:num
+3/N po:num
+4/N po:num
+5/N po:num
+6/N po:num
+7/N po:num
+8/N po:num
+9/N po:num
+./. po:sign_dot
+%/% po:sign_percent
+‰/% po:sign_per_mille
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.good
new file mode 100644
index 0000000..691fca1
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.good
@@ -0,0 +1,7 @@
+10%
+0.2%
+0.20%
+123.4561‰
+10
+0000
+10.25
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.wrong
new file mode 100644
index 0000000..ba1fe32
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.wrong
@@ -0,0 +1 @@
+.25
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.aff
new file mode 100644
index 0000000..e8a088d
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.aff
@@ -0,0 +1,4 @@
+COMPOUNDMIN 1
+COMPOUNDRULE 2
+COMPOUNDRULE A*A
+COMPOUNDRULE A*AAB*BBBC*C
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.dic
new file mode 100644
index 0000000..7d07bbc
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.dic
@@ -0,0 +1,5 @@
+3
+a/A
+b/B
+c/C
+
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.good
new file mode 100644
index 0000000..55a8f8b
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.good
@@ -0,0 +1,4 @@
+aa
+aaaaaa
+aabbbc
+aaaaabbbbbbcccccc
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.wrong
new file mode 100644
index 0000000..48b376d
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.wrong
@@ -0,0 +1,4 @@
+abc
+abbbbbccccccc
+aabbccccccc
+aabbbbbbb
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.aff
new file mode 100644
index 0000000..3ae1fc7
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.aff
@@ -0,0 +1,8 @@
+# English ordinal numbers (parenthesized long flags)
+FLAG long
+WORDCHARS 0123456789
+COMPOUNDMIN 1
+ONLYINCOMPOUND cc
+COMPOUNDRULE 2
+COMPOUNDRULE (nn)*(11)(tt)
+COMPOUNDRULE (nn)*(mm)(pp)
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.dic
new file mode 100644
index 0000000..ad4bb4d
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.dic
@@ -0,0 +1,24 @@
+22
+0/nnmm
+1/nn11
+2/nnmm
+3/nnmm
+4/nnmm
+5/nnmm
+6/nnmm
+7/nnmm
+8/nnmm
+9/nnmm
+0th/pptt
+1st/pp
+1th/ttcc
+2nd/pp
+2th/ttcc
+3rd/pp
+3th/ttcc
+4th/pptt
+5th/pptt
+6th/pptt
+7th/pptt
+8th/pptt
+9th/pptt
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.good
new file mode 100644
index 0000000..fafe64a5
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.good
@@ -0,0 +1,29 @@
+1st
+2nd
+3rd
+4th
+5th
+6th
+7th
+8th
+9th
+10th
+11th
+12th
+13th
+14th
+15th
+16th
+17th
+18th
+19th
+20th
+21st
+22nd
+23rd
+24th
+25th
+100th
+1000th
+10001st
+10011th
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.wrong
new file mode 100644
index 0000000..99f28e7
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.wrong
@@ -0,0 +1,5 @@
+1th
+2th
+3th
+10001th
+10011st
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.aff
new file mode 100644
index 0000000..03a423d
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.aff
@@ -0,0 +1,8 @@
+# English ordinal numbers (parenthesized numerical flags)
+FLAG num
+WORDCHARS 0123456789
+COMPOUNDMIN 1
+ONLYINCOMPOUND 1000
+COMPOUNDRULE 2
+COMPOUNDRULE (1001)*(1002)(2001)
+COMPOUNDRULE (1001)*(2002)(2000)
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.dic
new file mode 100644
index 0000000..e156e95
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.dic
@@ -0,0 +1,24 @@
+22
+0/1001,2002
+1/1001,1002
+2/1001,2002
+3/1001,2002
+4/1001,2002
+5/1001,2002
+6/1001,2002
+7/1001,2002
+8/1001,2002
+9/1001,2002
+0th/2000,2001
+1st/2000
+1th/2001,1000
+2nd/2000
+2th/2001,1000
+3rd/2000
+3th/2001,1000
+4th/2000,2001
+5th/2000,2001
+6th/2000,2001
+7th/2000,2001
+8th/2000,2001
+9th/2000,2001
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.good
new file mode 100644
index 0000000..fafe64a5
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.good
@@ -0,0 +1,29 @@
+1st
+2nd
+3rd
+4th
+5th
+6th
+7th
+8th
+9th
+10th
+11th
+12th
+13th
+14th
+15th
+16th
+17th
+18th
+19th
+20th
+21st
+22nd
+23rd
+24th
+25th
+100th
+1000th
+10001st
+10011th
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.wrong
new file mode 100644
index 0000000..99f28e7
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.wrong
@@ -0,0 +1,5 @@
+1th
+2th
+3th
+10001th
+10011st