You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/02/08 20:37:00 UTC
[lucene-solr] branch master updated: LUCENE-9745: Hunspell: tolerate more aff/dic file typos (#2321)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new d0b4ef6 LUCENE-9745: Hunspell: tolerate more aff/dic file typos (#2321)
d0b4ef6 is described below
commit d0b4ef66d7fb4c9097e613949305c642a6090c90
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Mon Feb 8 21:36:44 2021 +0100
LUCENE-9745: Hunspell: tolerate more aff/dic file typos (#2321)
---
.../lucene/analysis/hunspell/Dictionary.java | 22 ++++++++++++++++------
.../lucene/analysis/hunspell/TestDictionary.java | 2 +-
.../lucene/analysis/hunspell/forgivable-errors.aff | 8 +++++++-
.../lucene/analysis/hunspell/forgivable-errors.dic | 2 ++
4 files changed, 26 insertions(+), 8 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 11cb1e0..5eb7457 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -391,9 +391,9 @@ public class Dictionary {
} else if ("BREAK".equals(firstWord)) {
breaks = parseBreaks(reader, line);
} else if ("WORDCHARS".equals(firstWord)) {
- wordChars = singleArgument(reader, line);
+ wordChars = firstArgument(reader, line);
} else if ("TRY".equals(firstWord)) {
- tryChars = singleArgument(reader, line);
+ tryChars = firstArgument(reader, line);
} else if ("REP".equals(firstWord)) {
int count = parseNum(reader, line);
for (int i = 0; i < count; i++) {
@@ -469,7 +469,7 @@ public class Dictionary {
}
private List<String> parseMapEntry(LineNumberReader reader, String line) throws ParseException {
- String unparsed = singleArgument(reader, line);
+ String unparsed = firstArgument(reader, line);
List<String> mapEntry = new ArrayList<>();
for (int j = 0; j < unparsed.length(); j++) {
if (unparsed.charAt(j) == '(') {
@@ -511,6 +511,10 @@ public class Dictionary {
return splitBySpace(reader, line, 2)[1];
}
+ private String firstArgument(LineNumberReader reader, String line) throws ParseException {
+ return splitBySpace(reader, line, 2, Integer.MAX_VALUE)[1];
+ }
+
private String[] splitBySpace(LineNumberReader reader, String line, int expectedParts)
throws ParseException {
return splitBySpace(reader, line, expectedParts, expectedParts);
@@ -615,7 +619,12 @@ public class Dictionary {
boolean crossProduct = args[2].equals("Y");
boolean isSuffix = conditionPattern.equals(SUFFIX_CONDITION_REGEX);
- int numLines = Integer.parseInt(args[3]);
+ int numLines;
+ try {
+ numLines = Integer.parseInt(args[3]);
+ } catch (NumberFormatException e) {
+ return;
+ }
affixData = ArrayUtil.grow(affixData, currentAffix * 4 + numLines * 4);
for (int i = 0; i < numLines; i++) {
@@ -905,14 +914,15 @@ public class Dictionary {
|| ch == MORPH_SEPARATOR; // BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!!
}
- static int morphBoundary(String line) {
+ private static int morphBoundary(String line) {
int end = indexOfSpaceOrTab(line, 0);
if (end == -1) {
return line.length();
}
while (end >= 0 && end < line.length()) {
if (line.charAt(end) == '\t'
- || end + 3 < line.length()
+ || end > 0
+ && end + 3 < line.length()
&& Character.isLetter(line.charAt(end + 1))
&& Character.isLetter(line.charAt(end + 2))
&& line.charAt(end + 3) == ':') {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
index f2dcbe0..6ef783c 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -108,7 +108,7 @@ public class TestDictionary extends LuceneTestCase {
}
public void testForgivableErrors() throws Exception {
- Dictionary dictionary = loadDictionary("forgivable-errors.aff", "simple.dic");
+ Dictionary dictionary = loadDictionary("forgivable-errors.aff", "forgivable-errors.dic");
assertEquals(1, dictionary.repTable.size());
assertEquals(2, dictionary.compoundMax);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.aff
index 8d17b4e..b9b56cc 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.aff
@@ -2,8 +2,14 @@ REP 1
REP foo bar goo doo zoo
COMPOUNDWORDMAX 2 y
+WORDCHARS 0123456789'.-’ ̃
+TRY a b c
KEEPCASE Aa
+MAP 1
+MAP a b
+
SFX A Y 1
-SFX A nout l [aeiouyáéíóúýůěr][^aeiouyáéíóúýůěrl][^aeiouy
\ No newline at end of file
+SFX A nout l [aeiouyáéíóúýůěr][^aeiouyáéíóúýůěrl][^aeiouy
+SFX A b c d
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.dic
new file mode 100644
index 0000000..51a4bfb
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.dic
@@ -0,0 +1,2 @@
+1
+ st:abc
\ No newline at end of file