You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2021/02/22 10:35:58 UTC
[lucene-solr] branch master updated: LUCENE-9799: Hunspell: don't
check second-level affixes when the first level isn't a continuation
(#2413)
This is an automated email from the ASF dual-hosted git repository.
rmuir pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new f783848 LUCENE-9799: Hunspell: don't check second-level affixes when the first level isn't a continuation (#2413)
f783848 is described below
commit f783848e710f56f410906c235138842b5ac04b69
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Mon Feb 22 11:35:36 2021 +0100
LUCENE-9799: Hunspell: don't check second-level affixes when the first level isn't a continuation (#2413)
* LUCENE-9799: Hunspell: don't check second-level affixes when the first level isn't a continuation
* check more words in TestPerformance
---
.../lucene/analysis/hunspell/Dictionary.java | 46 +++++++++++++++-------
.../apache/lucene/analysis/hunspell/Stemmer.java | 6 +--
.../lucene/analysis/hunspell/TestPerformance.java | 4 +-
3 files changed, 37 insertions(+), 19 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 5a23d76..a3cbe39 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -37,10 +37,10 @@ import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
-import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
@@ -78,9 +78,6 @@ public class Dictionary {
private static final int DEFAULT_FLAGS = 65510;
static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell
- // TODO: really for suffixes we should reverse the automaton and run them backwards
- private static final String PREFIX_CONDITION_REGEX = "%s.*";
- private static final String SUFFIX_CONDITION_REGEX = ".*%s";
static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET);
@@ -147,8 +144,12 @@ public class Dictionary {
boolean ignoreCase;
boolean checkSharpS;
boolean complexPrefixes;
- // if no affixes have continuation classes, no need to do 2-level affix stripping
- boolean twoStageAffix;
+
+ /**
+ * All flags used in affix continuation classes. If an outer affix's flag isn't here, there's no
+ * need to do 2-level affix stripping with it.
+ */
+ private char[] secondStageAffixFlags;
char circumfix;
char keepcase, forceUCase;
@@ -332,6 +333,7 @@ public class Dictionary {
throws IOException, ParseException {
TreeMap<String, List<Integer>> prefixes = new TreeMap<>();
TreeMap<String, List<Integer>> suffixes = new TreeMap<>();
+ Set<Character> stage2Flags = new HashSet<>();
Map<String, Integer> seenPatterns = new HashMap<>();
// zero condition -> 0 ord
@@ -359,9 +361,9 @@ public class Dictionary {
} else if ("AM".equals(firstWord)) {
parseMorphAlias(line);
} else if ("PFX".equals(firstWord)) {
- parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX, seenPatterns, seenStrips, flags);
+ parseAffix(prefixes, stage2Flags, line, reader, false, seenPatterns, seenStrips, flags);
} else if ("SFX".equals(firstWord)) {
- parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX, seenPatterns, seenStrips, flags);
+ parseAffix(suffixes, stage2Flags, line, reader, true, seenPatterns, seenStrips, flags);
} else if (line.equals("COMPLEXPREFIXES")) {
complexPrefixes =
true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
@@ -476,6 +478,7 @@ public class Dictionary {
this.prefixes = affixFST(prefixes);
this.suffixes = affixFST(suffixes);
+ secondStageAffixFlags = toSortedCharArray(stage2Flags);
int totalChars = 0;
for (String strip : seenStrips.keySet()) {
@@ -675,16 +678,15 @@ public class Dictionary {
* @param affixes Map where the result of the parsing will be put
* @param header Header line of the affix rule
* @param reader BufferedReader to read the content of the rule from
- * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate
- * the condition regex pattern
* @param seenPatterns map from condition -> index of patterns, for deduplication.
* @throws IOException Can be thrown while reading the rule
*/
private void parseAffix(
TreeMap<String, List<Integer>> affixes,
+ Set<Character> secondStageFlags,
String header,
LineNumberReader reader,
- String conditionPattern,
+ boolean isSuffix,
Map<String, Integer> seenPatterns,
Map<String, Integer> seenStrips,
FlagEnumerator flags)
@@ -694,7 +696,6 @@ public class Dictionary {
String[] args = header.split("\\s+");
boolean crossProduct = args[2].equals("Y");
- boolean isSuffix = conditionPattern.equals(SUFFIX_CONDITION_REGEX);
int numLines;
try {
@@ -725,7 +726,9 @@ public class Dictionary {
}
appendFlags = flagParsingStrategy.parseFlags(flagPart);
- twoStageAffix = true;
+ for (char appendFlag : appendFlags) {
+ secondStageFlags.add(appendFlag);
+ }
}
// zero affix -> empty string
if ("0".equals(affixArg)) {
@@ -750,7 +753,8 @@ public class Dictionary {
// if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
// but this is complicated...
} else {
- regex = String.format(Locale.ROOT, conditionPattern, condition);
+ // TODO: really for suffixes we should reverse the automaton and run them backwards
+ regex = isSuffix ? ".*" + condition : condition + ".*";
}
// deduplicate patterns
@@ -1610,6 +1614,20 @@ public class Dictionary {
return reuse;
}
+ private static char[] toSortedCharArray(Set<Character> set) {
+ char[] chars = new char[set.size()];
+ int i = 0;
+ for (Character c : set) {
+ chars[i++] = c;
+ }
+ Arrays.sort(chars);
+ return chars;
+ }
+
+ boolean isSecondStageAffix(char flag) {
+ return Arrays.binarySearch(secondStageAffixFlags, flag) >= 0;
+ }
+
/** folds single character (according to LANG if present) */
char caseFold(char c) {
if (alternateCasing) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index 4164892..2a75f45 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -648,11 +648,11 @@ final class Stemmer {
if (recursionDepth == 0) {
if (prefix) {
prefixId = affix;
- doPrefix = dictionary.complexPrefixes && dictionary.twoStageAffix;
+ doPrefix = dictionary.complexPrefixes && dictionary.isSecondStageAffix(flag);
// we took away the first prefix.
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
// COMPLEXPREFIXES = false: combine with a suffix
- } else if (!dictionary.complexPrefixes && dictionary.twoStageAffix) {
+ } else if (!dictionary.complexPrefixes && dictionary.isSecondStageAffix(flag)) {
doPrefix = false;
// we took away a suffix.
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
@@ -665,7 +665,7 @@ final class Stemmer {
if (prefix && dictionary.complexPrefixes) {
prefixId = affix;
// we took away the second prefix: go look for another suffix
- } else if (prefix || dictionary.complexPrefixes || !dictionary.twoStageAffix) {
+ } else if (prefix || dictionary.complexPrefixes || !dictionary.isSecondStageAffix(flag)) {
return true;
}
// we took away a prefix, then a suffix: go look for another suffix
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
index 4cd54ba..bc9f71c 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -66,7 +66,7 @@ public class TestPerformance extends LuceneTestCase {
@Test
public void de() throws Exception {
- checkAnalysisPerformance("de", 200_000);
+ checkAnalysisPerformance("de", 300_000);
}
@Test
@@ -76,7 +76,7 @@ public class TestPerformance extends LuceneTestCase {
@Test
public void fr() throws Exception {
- checkAnalysisPerformance("fr", 40_000);
+ checkAnalysisPerformance("fr", 80_000);
}
@Test