You are viewing a plain text version of this content. The canonical link to the original message was not preserved in this plain-text copy.
Posted to commits@lucene.apache.org by ct...@apache.org on 2021/01/15 21:45:42 UTC

[lucene-solr] 31/38: LUCENE-9665: Hunspell: support default encoding (#2203, Peter Gromov via Dawid Weiss)

This is an automated email from the ASF dual-hosted git repository.

ctargett pushed a commit to branch jira/solr-13105-toMerge
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 467b6772d10c673d6bb05ed97c39700fe1c74866
Author: Peter Gromov <gr...@gmail.com>
AuthorDate: Fri Jan 15 09:35:25 2021 +0100

    LUCENE-9665: Hunspell: support default encoding (#2203, Peter Gromov via Dawid Weiss)
---
 lucene/CHANGES.txt                                                | 2 ++
 .../src/java/org/apache/lucene/analysis/hunspell/Dictionary.java  | 8 ++++----
 .../test/org/apache/lucene/analysis/hunspell/TestDictionary.java  | 3 +++
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 84240fb..8702d5f 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -84,6 +84,8 @@ API Changes
 
 Improvements
 
+* LUCENE-9665: Hunspell: support default encoding (Peter Gromov)
+
 * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
   (Dawid Weiss)
 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 2a6017f..a4b2f6c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -98,6 +98,7 @@ public class Dictionary {
   // TODO: really for suffixes we should reverse the automaton and run them backwards
   private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
   private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
+  static final String DEFAULT_ENCODING = StandardCharsets.ISO_8859_1.name();
 
   FST<IntsRef> prefixes;
   FST<IntsRef> suffixes;
@@ -642,10 +643,8 @@ public class Dictionary {
    * @param affix InputStream for reading the affix file
    * @return Encoding specified in the affix file
    * @throws IOException Can be thrown while reading from the InputStream
-   * @throws ParseException Thrown if the first non-empty non-comment line read from the file does
-   *     not adhere to the format {@code SET <encoding>}
    */
-  static String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
+  static String getDictionaryEncoding(InputStream affix) throws IOException {
     final StringBuilder encoding = new StringBuilder();
     for (; ; ) {
       encoding.setLength(0);
@@ -664,7 +663,7 @@ public class Dictionary {
           // this test only at the end as ineffective but would allow lines only containing spaces:
           encoding.toString().trim().length() == 0) {
         if (ch < 0) {
-          throw new ParseException("Unexpected end of affix file.", 0);
+          return DEFAULT_ENCODING;
         }
         continue;
       }
@@ -673,6 +672,7 @@ public class Dictionary {
         int last = matcher.end();
         return encoding.substring(last).trim();
       }
+      return DEFAULT_ENCODING;
     }
   }
 
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
index 89d607f..34852cf 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -267,6 +267,9 @@ public class TestDictionary extends LuceneTestCase {
         "UTF-8",
         Dictionary.getDictionaryEncoding(
             new ByteArrayInputStream("\uFEFFSET\tUTF-8\r\n".getBytes(StandardCharsets.UTF_8))));
+    assertEquals(
+        Dictionary.DEFAULT_ENCODING,
+        Dictionary.getDictionaryEncoding(new ByteArrayInputStream(new byte[0])));
   }
 
   public void testFlagWithCrazyWhitespace() throws Exception {