You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/03/10 09:58:56 UTC
[lucene] 41/49: LUCENE-9665: Hunspell: support default encoding
(#2203, Peter Gromov via Dawid Weiss)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch jira/solr-13105-toMerge
in repository https://gitbox.apache.org/repos/asf/lucene.git
commit 467b6772d10c673d6bb05ed97c39700fe1c74866
Author: Peter Gromov <gr...@gmail.com>
AuthorDate: Fri Jan 15 09:35:25 2021 +0100
LUCENE-9665: Hunspell: support default encoding (#2203, Peter Gromov via Dawid Weiss)
---
lucene/CHANGES.txt | 2 ++
.../src/java/org/apache/lucene/analysis/hunspell/Dictionary.java | 8 ++++----
.../test/org/apache/lucene/analysis/hunspell/TestDictionary.java | 3 +++
3 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 84240fb..8702d5f 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -84,6 +84,8 @@ API Changes
Improvements
+* LUCENE-9665: Hunspell: support default encoding (Peter Gromov)
+
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
(Dawid Weiss)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 2a6017f..a4b2f6c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -98,6 +98,7 @@ public class Dictionary {
// TODO: really for suffixes we should reverse the automaton and run them backwards
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
+ static final String DEFAULT_ENCODING = StandardCharsets.ISO_8859_1.name();
FST<IntsRef> prefixes;
FST<IntsRef> suffixes;
@@ -642,10 +643,8 @@ public class Dictionary {
* @param affix InputStream for reading the affix file
* @return Encoding specified in the affix file, or {@link #DEFAULT_ENCODING} if none is declared
* @throws IOException Can be thrown while reading from the InputStream
- * @throws ParseException Thrown if the first non-empty non-comment line read from the file does
- * not adhere to the format {@code SET <encoding>}
*/
- static String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
+ static String getDictionaryEncoding(InputStream affix) throws IOException {
final StringBuilder encoding = new StringBuilder();
for (; ; ) {
encoding.setLength(0);
@@ -664,7 +663,7 @@ public class Dictionary {
// this test only at the end as ineffective but would allow lines only containing spaces:
encoding.toString().trim().length() == 0) {
if (ch < 0) {
- throw new ParseException("Unexpected end of affix file.", 0);
+ return DEFAULT_ENCODING;
}
continue;
}
@@ -673,6 +672,7 @@ public class Dictionary {
int last = matcher.end();
return encoding.substring(last).trim();
}
+ return DEFAULT_ENCODING;
}
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
index 89d607f..34852cf 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -267,6 +267,9 @@ public class TestDictionary extends LuceneTestCase {
"UTF-8",
Dictionary.getDictionaryEncoding(
new ByteArrayInputStream("\uFEFFSET\tUTF-8\r\n".getBytes(StandardCharsets.UTF_8))));
+ assertEquals(
+ Dictionary.DEFAULT_ENCODING,
+ Dictionary.getDictionaryEncoding(new ByteArrayInputStream(new byte[0])));
}
public void testFlagWithCrazyWhitespace() throws Exception {