You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/01/29 09:03:57 UTC
[lucene-solr] branch master updated: LUCENE-9704: Hunspell: support capitalization for German ß (#2260)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 6635d7a LUCENE-9704: Hunspell: support capitalization for German ß (#2260)
6635d7a is described below
commit 6635d7a5e7a6aee8f2347d756c20732acf7ca62a
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Fri Jan 29 10:03:37 2021 +0100
LUCENE-9704: Hunspell: support capitalization for German ß (#2260)
---
.../lucene/analysis/hunspell/Dictionary.java | 3 +
.../lucene/analysis/hunspell/SpellChecker.java | 42 +++++---
.../apache/lucene/analysis/hunspell/Stemmer.java | 114 +++++++++++++++++----
.../apache/lucene/analysis/hunspell/WordCase.java | 28 +++--
.../lucene/analysis/hunspell/SpellCheckerTest.java | 5 +
.../lucene/analysis/hunspell/TestCheckSharpS.java | 34 ++++++
.../lucene/analysis/hunspell/checksharps.aff | 4 +
.../lucene/analysis/hunspell/checksharps.dic | 7 ++
.../lucene/analysis/hunspell/checksharps.good | 13 +++
.../lucene/analysis/hunspell/checksharps.wrong | 2 +
10 files changed, 211 insertions(+), 41 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 6d7638b..f38ab59 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -133,6 +133,7 @@ public class Dictionary {
boolean hasStemExceptions;
boolean ignoreCase;
+ boolean checkSharpS;
boolean complexPrefixes;
// if no affixes have continuation classes, no need to do 2-level affix stripping
boolean twoStageAffix;
@@ -353,6 +354,8 @@ public class Dictionary {
needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("ONLYINCOMPOUND".equals(firstWord)) {
onlyincompound = flagParsingStrategy.parseFlag(singleArgument(reader, line));
+ } else if ("CHECKSHARPS".equals(firstWord)) {
+ checkSharpS = true;
} else if ("IGNORE".equals(firstWord)) {
ignore = singleArgument(reader, line).toCharArray();
Arrays.sort(ignore);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
index 8570e3d..32c1ab2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -61,7 +61,7 @@ public class SpellChecker {
return false;
}
- if (checkWord(wordChars, wordChars.length, false)) {
+ if (checkWord(wordChars, wordChars.length, null)) {
return true;
}
@@ -89,23 +89,39 @@ public class SpellChecker {
char[] caseVariant = wordChars;
if (wordCase == WordCase.UPPER) {
caseVariant = stemmer.caseFoldTitle(caseVariant, wordChars.length);
- if (checkWord(caseVariant, wordChars.length, true)) {
+ if (checkWord(caseVariant, wordChars.length, wordCase)) {
return true;
}
char[] aposCase = Stemmer.capitalizeAfterApostrophe(caseVariant, wordChars.length);
- if (aposCase != null && checkWord(aposCase, aposCase.length, true)) {
+ if (aposCase != null && checkWord(aposCase, aposCase.length, wordCase)) {
return true;
}
+ for (char[] variation : stemmer.sharpSVariations(caseVariant, wordChars.length)) {
+ if (checkWord(variation, variation.length, null)) {
+ return true;
+ }
+ }
}
- return checkWord(stemmer.caseFoldLower(caseVariant, wordChars.length), wordChars.length, true);
+ char[] lower = stemmer.caseFoldLower(caseVariant, wordChars.length);
+ if (checkWord(lower, wordChars.length, wordCase)) {
+ return true;
+ }
+ if (wordCase == WordCase.UPPER) {
+ for (char[] variation : stemmer.sharpSVariations(lower, wordChars.length)) {
+ if (checkWord(variation, variation.length, null)) {
+ return true;
+ }
+ }
+ }
+ return false;
}
- private boolean checkWord(char[] wordChars, int length, boolean caseVariant) {
+ private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
return false;
}
- if (hasStems(wordChars, 0, length, caseVariant, WordContext.SIMPLE_WORD)) {
+ if (hasStems(wordChars, 0, length, originalCase, WordContext.SIMPLE_WORD)) {
return true;
}
@@ -114,16 +130,16 @@ public class SpellChecker {
return true;
}
- return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, caseVariant, 0);
+ return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, originalCase, 0);
}
private boolean hasStems(
- char[] chars, int offset, int length, boolean caseVariant, WordContext context) {
- return !stemmer.doStem(chars, offset, length, caseVariant, context).isEmpty();
+ char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
+ return !stemmer.doStem(chars, offset, length, originalCase, context).isEmpty();
}
private boolean checkCompounds(
- char[] chars, int offset, int length, boolean caseVariant, int depth) {
+ char[] chars, int offset, int length, WordCase originalCase, int depth) {
if (depth > dictionary.compoundMax - 2) return false;
int limit = length - dictionary.compoundMin + 1;
@@ -131,13 +147,13 @@ public class SpellChecker {
WordContext context = depth == 0 ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_MIDDLE;
int breakOffset = offset + breakPos;
if (checkCompoundCase(chars, breakOffset)
- && hasStems(chars, offset, breakPos, caseVariant, context)) {
+ && hasStems(chars, offset, breakPos, originalCase, context)) {
int remainingLength = length - breakPos;
- if (hasStems(chars, breakOffset, remainingLength, caseVariant, WordContext.COMPOUND_END)) {
+ if (hasStems(chars, breakOffset, remainingLength, originalCase, WordContext.COMPOUND_END)) {
return true;
}
- if (checkCompounds(chars, breakOffset, remainingLength, caseVariant, depth + 1)) {
+ if (checkCompounds(chars, breakOffset, remainingLength, originalCase, depth + 1)) {
return true;
}
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index d88ee40..bcda073 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -20,6 +20,8 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
@@ -99,20 +101,32 @@ final class Stemmer {
}
WordCase wordCase = caseOf(word, length);
- List<CharsRef> list = doStem(word, 0, length, false, WordContext.SIMPLE_WORD);
+ List<CharsRef> list = doStem(word, 0, length, null, WordContext.SIMPLE_WORD);
+ if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
+ addCaseVariations(word, length, wordCase, list);
+ }
+ return list;
+ }
+
+ private void addCaseVariations(char[] word, int length, WordCase wordCase, List<CharsRef> list) {
if (wordCase == WordCase.UPPER) {
caseFoldTitle(word, length);
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
if (aposCase != null) {
- list.addAll(doStem(aposCase, 0, length, true, WordContext.SIMPLE_WORD));
+ list.addAll(doStem(aposCase, 0, length, wordCase, WordContext.SIMPLE_WORD));
+ }
+ list.addAll(doStem(titleBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD));
+ for (char[] variation : sharpSVariations(titleBuffer, length)) {
+ list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD));
}
- list.addAll(doStem(titleBuffer, 0, length, true, WordContext.SIMPLE_WORD));
}
- if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
- caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
- list.addAll(doStem(lowerBuffer, 0, length, true, WordContext.SIMPLE_WORD));
+ caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
+ list.addAll(doStem(lowerBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD));
+ if (wordCase == WordCase.UPPER) {
+ for (char[] variation : sharpSVariations(lowerBuffer, length)) {
+ list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD));
+ }
}
- return list;
}
// temporary buffers for case variants
@@ -163,14 +177,52 @@ final class Stemmer {
return null;
}
+ List<char[]> sharpSVariations(char[] word, int length) {
+ if (!dictionary.checkSharpS) return Collections.emptyList();
+
+ Stream<String> result =
+ new Object() {
+ int findSS(int start) {
+ for (int i = start; i < length - 1; i++) {
+ if (word[i] == 's' && word[i + 1] == 's') {
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ Stream<String> replaceSS(int start, int depth) {
+ if (depth > 5) { // cut off too large enumeration
+ return Stream.of(new String(word, start, length - start));
+ }
+
+ int ss = findSS(start);
+ if (ss < 0) {
+ return null;
+ } else {
+ String prefix = new String(word, start, ss - start);
+ Stream<String> tails = replaceSS(ss + 2, depth + 1);
+ if (tails == null) {
+ tails = Stream.of(new String(word, ss + 2, length - ss - 2));
+ }
+ return tails.flatMap(s -> Stream.of(prefix + "ss" + s, prefix + "ß" + s));
+ }
+ }
+ }.replaceSS(0, 0);
+ if (result == null) return Collections.emptyList();
+
+ String src = new String(word, 0, length);
+ return result.filter(s -> !s.equals(src)).map(String::toCharArray).collect(Collectors.toList());
+ }
+
List<CharsRef> doStem(
- char[] word, int offset, int length, boolean caseVariant, WordContext context) {
+ char[] word, int offset, int length, WordCase originalCase, WordContext context) {
List<CharsRef> stems = new ArrayList<>();
IntsRef forms = dictionary.lookupWord(word, offset, length);
if (forms != null) {
for (int i = 0; i < forms.length; i += formStep) {
char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch);
- if (!acceptCase(caseVariant, wordFlags)) {
+ if (!acceptCase(originalCase, wordFlags, word, offset, length)) {
continue;
}
// we can't add this form, it's a pseudostem requiring an affix
@@ -203,17 +255,35 @@ final class Stemmer {
true,
false,
false,
- caseVariant));
+ originalCase));
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
return stems;
}
- private boolean acceptCase(boolean caseVariant, char[] wordFlags) {
- return caseVariant
- ? !Dictionary.hasFlag(wordFlags, dictionary.keepcase)
- : !Dictionary.hasHiddenFlag(wordFlags);
+ private boolean acceptCase(
+ WordCase originalCase, char[] wordFlags, char[] word, int offset, int length) {
+ boolean keepCase = Dictionary.hasFlag(wordFlags, dictionary.keepcase);
+ if (originalCase != null) {
+ if (keepCase
+ && dictionary.checkSharpS
+ && originalCase == WordCase.TITLE
+ && containsSharpS(word, offset, length)) {
+ return true;
+ }
+ return !keepCase;
+ }
+ return !Dictionary.hasHiddenFlag(wordFlags);
+ }
+
+ private boolean containsSharpS(char[] word, int offset, int length) {
+ for (int i = 0; i < length; i++) {
+ if (word[i + offset] == 'ß') {
+ return true;
+ }
+ }
+ return false;
}
/**
@@ -302,8 +372,8 @@ final class Stemmer {
* (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
* @param circumfix true if the previous prefix removal was signed as a circumfix this means inner
* most suffix must also contain circumfix flag.
- * @param caseVariant true if we are searching for a case variant. if the word has KEEPCASE flag
- * it cannot succeed.
+ * @param originalCase if non-null, represents original word case to disallow case variations of
+ * word with KEEPCASE flags
* @return List of stems, or empty list if no stems are found
*/
private List<CharsRef> stem(
@@ -319,7 +389,7 @@ final class Stemmer {
boolean doSuffix,
boolean previousWasPrefix,
boolean circumfix,
- boolean caseVariant)
+ WordCase originalCase)
throws IOException {
// TODO: allow this stuff to be reused by tokenfilter
@@ -371,7 +441,7 @@ final class Stemmer {
recursionDepth,
true,
circumfix,
- caseVariant));
+ originalCase));
}
}
}
@@ -424,7 +494,7 @@ final class Stemmer {
recursionDepth,
false,
circumfix,
- caseVariant));
+ originalCase));
}
}
}
@@ -555,7 +625,7 @@ final class Stemmer {
int recursionDepth,
boolean prefix,
boolean circumfix,
- boolean caseVariant)
+ WordCase originalCase)
throws IOException {
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
@@ -589,7 +659,7 @@ final class Stemmer {
}
// we are looking for a case variant, but this word does not allow it
- if (!acceptCase(caseVariant, wordFlags)) {
+ if (!acceptCase(originalCase, wordFlags, strippedWord, offset, length)) {
continue;
}
if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
@@ -654,7 +724,7 @@ final class Stemmer {
true,
prefix,
circumfix,
- caseVariant));
+ originalCase));
}
return stems;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
index 04adf7a..01fffd9 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
@@ -28,9 +28,9 @@ enum WordCase {
boolean seenUpper = false;
boolean seenLower = false;
for (int i = 1; i < length; i++) {
- char ch = word[i];
- seenUpper = seenUpper || Character.isUpperCase(ch);
- seenLower = seenLower || Character.isLowerCase(ch);
+ CharCase cc = charCase(word[i]);
+ seenUpper = seenUpper || cc == CharCase.UPPER;
+ seenLower = seenLower || cc == CharCase.LOWER;
if (seenUpper && seenLower) break;
}
@@ -43,9 +43,9 @@ enum WordCase {
boolean seenUpper = false;
boolean seenLower = false;
for (int i = 1; i < length; i++) {
- char ch = word.charAt(i);
- seenUpper = seenUpper || Character.isUpperCase(ch);
- seenLower = seenLower || Character.isLowerCase(ch);
+ CharCase cc = charCase(word.charAt(i));
+ seenUpper = seenUpper || cc == CharCase.UPPER;
+ seenLower = seenLower || cc == CharCase.LOWER;
if (seenUpper && seenLower) break;
}
@@ -58,4 +58,20 @@ enum WordCase {
}
return seenUpper ? MIXED : LOWER;
}
+
+ private static CharCase charCase(char c) {
+ if (Character.isUpperCase(c)) {
+ return CharCase.UPPER;
+ }
+ if (Character.isLowerCase(c) && Character.toUpperCase(c) != c) {
+ return CharCase.LOWER;
+ }
+ return CharCase.NEUTRAL;
+ }
+
+ private enum CharCase {
+ UPPER,
+ LOWER,
+ NEUTRAL
+ }
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index 0baf32f..a51a43b 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -42,6 +42,11 @@ public class SpellCheckerTest extends StemmerTestBase {
}
@Test
+ public void checkSharpS() throws Exception {
+ doTest("checksharps");
+ }
+
+ @Test
public void IJ() throws Exception {
doTest("IJ");
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCheckSharpS.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCheckSharpS.java
new file mode 100644
index 0000000..8d20aa9
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCheckSharpS.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import org.junit.BeforeClass;
+
+public class TestCheckSharpS extends StemmerTestBase {
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ init("checksharps.aff", "checksharps.dic");
+ }
+
+ public void testSharpS() {
+ assertStemsTo("Müßig", "müßig");
+ assertStemsTo("MÜSSIG", "müßig");
+ assertStemsTo("Müssig");
+ assertStemsTo("PROZESSIONSSTRASSE", "Prozessionsstraße");
+ }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.aff
new file mode 100644
index 0000000..c1ff41f
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.aff
@@ -0,0 +1,4 @@
+# test ß - SS special capitalizing
+CHECKSHARPS
+WORDCHARS ß.
+KEEPCASE k
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.dic
new file mode 100644
index 0000000..91d14ab
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.dic
@@ -0,0 +1,7 @@
+6
+müßig/k
+Ausstoß
+Abstoß.
+Außenabmessung
+Prozessionsstraße
+Außenmaße
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.good
new file mode 100644
index 0000000..a61c243
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.good
@@ -0,0 +1,13 @@
+müßig
+Müßig
+MÜSSIG
+Ausstoß
+Abstoß.
+Außenabmessung
+Prozessionsstraße
+Außenmaße
+AUSSTOSS
+ABSTOSS.
+AUSSENABMESSUNG
+PROZESSIONSSTRASSE
+AUSSENMASSE
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.wrong
new file mode 100644
index 0000000..72bdd9d
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.wrong
@@ -0,0 +1,2 @@
+MÜßIG
+Müssig
\ No newline at end of file