You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/02/08 20:37:00 UTC
[lucene-solr] branch master updated: LUCENE-9745: Hunspell: tolerate more aff/dic file typos (#2321)

This is an automated email from the ASF dual-hosted git repository.

dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new d0b4ef6  LUCENE-9745: Hunspell: tolerate more aff/dic file typos (#2321)
d0b4ef6 is described below

commit d0b4ef66d7fb4c9097e613949305c642a6090c90
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Mon Feb 8 21:36:44 2021 +0100

    LUCENE-9745: Hunspell: tolerate more aff/dic file typos (#2321)
---
 .../lucene/analysis/hunspell/Dictionary.java       | 22 ++++++++++++++++------
 .../lucene/analysis/hunspell/TestDictionary.java   |  2 +-
 .../lucene/analysis/hunspell/forgivable-errors.aff |  8 +++++++-
 .../lucene/analysis/hunspell/forgivable-errors.dic |  2 ++
 4 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 11cb1e0..5eb7457 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -391,9 +391,9 @@ public class Dictionary {
       } else if ("BREAK".equals(firstWord)) {
         breaks = parseBreaks(reader, line);
       } else if ("WORDCHARS".equals(firstWord)) {
-        wordChars = singleArgument(reader, line);
+        wordChars = firstArgument(reader, line);
       } else if ("TRY".equals(firstWord)) {
-        tryChars = singleArgument(reader, line);
+        tryChars = firstArgument(reader, line);
       } else if ("REP".equals(firstWord)) {
         int count = parseNum(reader, line);
         for (int i = 0; i < count; i++) {
@@ -469,7 +469,7 @@ public class Dictionary {
   }
 
   private List<String> parseMapEntry(LineNumberReader reader, String line) throws ParseException {
-    String unparsed = singleArgument(reader, line);
+    String unparsed = firstArgument(reader, line);
     List<String> mapEntry = new ArrayList<>();
     for (int j = 0; j < unparsed.length(); j++) {
       if (unparsed.charAt(j) == '(') {
@@ -511,6 +511,10 @@ public class Dictionary {
     return splitBySpace(reader, line, 2)[1];
   }
 
+  private String firstArgument(LineNumberReader reader, String line) throws ParseException {
+    return splitBySpace(reader, line, 2, Integer.MAX_VALUE)[1];
+  }
+
   private String[] splitBySpace(LineNumberReader reader, String line, int expectedParts)
       throws ParseException {
     return splitBySpace(reader, line, expectedParts, expectedParts);
@@ -615,7 +619,12 @@ public class Dictionary {
     boolean crossProduct = args[2].equals("Y");
     boolean isSuffix = conditionPattern.equals(SUFFIX_CONDITION_REGEX);
 
-    int numLines = Integer.parseInt(args[3]);
+    int numLines;
+    try {
+      numLines = Integer.parseInt(args[3]);
+    } catch (NumberFormatException e) {
+      return;
+    }
     affixData = ArrayUtil.grow(affixData, currentAffix * 4 + numLines * 4);
 
     for (int i = 0; i < numLines; i++) {
@@ -905,14 +914,15 @@ public class Dictionary {
         || ch == MORPH_SEPARATOR; // BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!!
   }
 
-  static int morphBoundary(String line) {
+  private static int morphBoundary(String line) {
     int end = indexOfSpaceOrTab(line, 0);
     if (end == -1) {
       return line.length();
     }
     while (end >= 0 && end < line.length()) {
       if (line.charAt(end) == '\t'
-          || end + 3 < line.length()
+          || end > 0
+              && end + 3 < line.length()
               && Character.isLetter(line.charAt(end + 1))
               && Character.isLetter(line.charAt(end + 2))
               && line.charAt(end + 3) == ':') {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
index f2dcbe0..6ef783c 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -108,7 +108,7 @@ public class TestDictionary extends LuceneTestCase {
   }
 
   public void testForgivableErrors() throws Exception {
-    Dictionary dictionary = loadDictionary("forgivable-errors.aff", "simple.dic");
+    Dictionary dictionary = loadDictionary("forgivable-errors.aff", "forgivable-errors.dic");
     assertEquals(1, dictionary.repTable.size());
     assertEquals(2, dictionary.compoundMax);
 
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.aff
index 8d17b4e..b9b56cc 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.aff
@@ -2,8 +2,14 @@ REP 1
 REP foo bar goo doo zoo
 
 COMPOUNDWORDMAX 2 y
+WORDCHARS 0123456789'.-’ ̃
+TRY a b c
 
 KEEPCASE Aa
 
+MAP 1
+MAP a b
+
 SFX A Y 1
-SFX A   nout        l          [aeiouyáéíóúýůěr][^aeiouyáéíóúýůěrl][^aeiouy
\ No newline at end of file
+SFX A   nout        l          [aeiouyáéíóúýůěr][^aeiouyáéíóúýůěrl][^aeiouy
+SFX A b c d
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.dic
new file mode 100644
index 0000000..51a4bfb
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.dic
@@ -0,0 +1,2 @@
+1
+ st:abc
\ No newline at end of file