You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2022/01/03 14:09:21 UTC
[lucene] branch branch_9x updated: LUCENE-10349: Cleanup WordListLoader to use try-with-resources and make the default stop words unmodifiable (#577)
This is an automated email from the ASF dual-hosted git repository.
uschindler pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new 837e163 LUCENE-10349: Cleanup WordListLoader to use try-with-resources and make the default stop words unmodifiable (#577)
837e163 is described below
commit 837e163eeeb1bd538fc196ceb7f10b0e72a2af56
Author: Uwe Schindler <us...@apache.org>
AuthorDate: Mon Jan 3 15:07:44 2022 +0100
LUCENE-10349: Cleanup WordListLoader to use try-with-resources and make the default stop words unmodifiable (#577)
---
lucene/CHANGES.txt | 5 ++
.../lucene/analysis/ja/JapaneseAnalyzer.java | 28 +++++-----
.../org/apache/lucene/analysis/WordlistLoader.java | 64 +++++++---------------
3 files changed, 39 insertions(+), 58 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 634eb97..2c87919 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -23,6 +23,8 @@ API Changes
are caller sensitive in Java 11). Instead add utility method IOUtils#requireResourceNonNull(T)
to test existence of resource based on null return value. (Uwe Schindler, Dawid Weiss)
+* LUCENE-10349: WordlistLoader methods now return unmodifiable CharArraySets. (Uwe Schindler)
+
New Features
---------------------
@@ -111,6 +113,9 @@ Bug Fixes
* LUCENE-10279: Fix equals in MultiRangeQuery. (Ignacio Vera)
+* LUCENE-10349: Fix all analyzers to behave according to their documentation:
+ getDefaultStopSet() methods now return unmodifiable CharArraySets. (Uwe Schindler)
+
Other
---------------------
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
index 8c79a2d..9411902 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
@@ -20,8 +20,8 @@ import java.io.IOException;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
-import java.util.HashSet;
import java.util.Set;
+import java.util.stream.Collectors;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
@@ -80,24 +80,24 @@ public class JapaneseAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET =
- WordlistLoader.getWordSet(
- IOUtils.getDecodingReader(
- IOUtils.requireResourceNonNull(
- JapaneseAnalyzer.class.getResourceAsStream("stopwords.txt"),
- "stopwords.txt"),
- StandardCharsets.UTF_8),
- "#",
- new CharArraySet(16, true)); // ignore case
+ CharArraySet.unmodifiableSet(
+ WordlistLoader.getWordSet(
+ IOUtils.getDecodingReader(
+ IOUtils.requireResourceNonNull(
+ JapaneseAnalyzer.class.getResourceAsStream("stopwords.txt"),
+ "stopwords.txt"),
+ StandardCharsets.UTF_8),
+ "#",
+ new CharArraySet(16, true))); // ignore case
final CharArraySet tagset =
WordlistLoader.getWordSet(
IOUtils.requireResourceNonNull(
JapaneseAnalyzer.class.getResourceAsStream("stoptags.txt"), "stoptags.txt"),
"#");
- DEFAULT_STOP_TAGS = new HashSet<>();
- for (Object element : tagset) {
- char[] chars = (char[]) element;
- DEFAULT_STOP_TAGS.add(new String(chars));
- }
+ DEFAULT_STOP_TAGS =
+ tagset.stream()
+ .map(ca -> new String((char[]) ca))
+ .collect(Collectors.toUnmodifiableSet());
} catch (IOException ex) {
// default set should always be present as it is part of the distribution (JAR)
throw new UncheckedIOException("Unable to load default stopword or stoptag set", ex);
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java b/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
index 7437fc1..30ada92 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
@@ -50,15 +50,11 @@ public class WordlistLoader {
* @return the given {@link CharArraySet} with the reader's words
*/
public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException {
- BufferedReader br = null;
- try {
- br = getBufferedReader(reader);
+ try (BufferedReader br = getBufferedReader(reader)) {
String word = null;
while ((word = br.readLine()) != null) {
result.add(word.trim());
}
- } finally {
- IOUtils.close(br);
}
return result;
}
@@ -70,10 +66,11 @@ public class WordlistLoader {
* StandardAnalyzer).
*
* @param reader Reader containing the wordlist
- * @return A {@link CharArraySet} with the reader's words
+ * @return An unmodifiable {@link CharArraySet} with the reader's words
*/
public static CharArraySet getWordSet(Reader reader) throws IOException {
- return getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
+ return CharArraySet.unmodifiableSet(
+ getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false)));
}
/**
@@ -83,7 +80,7 @@ public class WordlistLoader {
* uses LowerCaseFilter (like StandardAnalyzer).
*
* @param stream InputStream containing the wordlist
- * @return A {@link CharArraySet} with the reader's words
+ * @return An unmodifiable {@link CharArraySet} with the reader's words
*/
public static CharArraySet getWordSet(InputStream stream) throws IOException {
return getWordSet(stream, StandardCharsets.UTF_8);
@@ -97,7 +94,7 @@ public class WordlistLoader {
*
* @param stream InputStream containing the wordlist
* @param charset Charset of the wordlist
- * @return A {@link CharArraySet} with the reader's words
+ * @return An unmodifiable {@link CharArraySet} with the reader's words
*/
public static CharArraySet getWordSet(InputStream stream, Charset charset) throws IOException {
return getWordSet(IOUtils.getDecodingReader(stream, charset));
@@ -116,17 +113,13 @@ public class WordlistLoader {
*/
public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result)
throws IOException {
- BufferedReader br = null;
- try {
- br = getBufferedReader(reader);
+ try (BufferedReader br = getBufferedReader(reader)) {
String word = null;
while ((word = br.readLine()) != null) {
if (word.startsWith(comment) == false) {
result.add(word.trim());
}
}
- } finally {
- IOUtils.close(br);
}
return result;
}
@@ -139,10 +132,11 @@ public class WordlistLoader {
*
* @param reader Reader containing the wordlist
* @param comment The string representing a comment.
- * @return A CharArraySet with the reader's words
+ * @return An unmodifiable CharArraySet with the reader's words
*/
public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
- return getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false));
+ return CharArraySet.unmodifiableSet(
+ getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false)));
}
/**
@@ -153,7 +147,7 @@ public class WordlistLoader {
*
* @param stream InputStream in UTF-8 encoding containing the wordlist
* @param comment The string representing a comment.
- * @return A CharArraySet with the reader's words
+ * @return An unmodifiable CharArraySet with the reader's words
*/
public static CharArraySet getWordSet(InputStream stream, String comment) throws IOException {
return getWordSet(stream, StandardCharsets.UTF_8, comment);
@@ -168,7 +162,7 @@ public class WordlistLoader {
* @param stream InputStream containing the wordlist
* @param charset Charset of the wordlist
* @param comment The string representing a comment.
- * @return A CharArraySet with the reader's words
+ * @return An unmodifiable CharArraySet with the reader's words
*/
public static CharArraySet getWordSet(InputStream stream, Charset charset, String comment)
throws IOException {
@@ -192,9 +186,7 @@ public class WordlistLoader {
*/
public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result)
throws IOException {
- BufferedReader br = null;
- try {
- br = getBufferedReader(reader);
+ try (BufferedReader br = getBufferedReader(reader)) {
String line = null;
while ((line = br.readLine()) != null) {
int comment = line.indexOf('|');
@@ -204,8 +196,6 @@ public class WordlistLoader {
if (words[i].length() > 0) result.add(words[i]);
}
}
- } finally {
- IOUtils.close(br);
}
return result;
}
@@ -222,10 +212,11 @@ public class WordlistLoader {
* </ul>
*
* @param reader Reader containing a Snowball stopword list
- * @return A {@link CharArraySet} with the reader's words
+ * @return An unmodifiable {@link CharArraySet} with the reader's words
*/
public static CharArraySet getSnowballWordSet(Reader reader) throws IOException {
- return getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
+ return CharArraySet.unmodifiableSet(
+ getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false)));
}
/**
@@ -240,7 +231,7 @@ public class WordlistLoader {
* </ul>
*
* @param stream InputStream in UTF-8 encoding containing a Snowball stopword list
- * @return A {@link CharArraySet} with the reader's words
+ * @return An unmodifiable {@link CharArraySet} with the reader's words
*/
public static CharArraySet getSnowballWordSet(InputStream stream) throws IOException {
return getSnowballWordSet(stream, StandardCharsets.UTF_8);
@@ -259,7 +250,7 @@ public class WordlistLoader {
*
* @param stream InputStream containing a Snowball stopword list
* @param charset Charset of the stopword list
- * @return A {@link CharArraySet} with the reader's words
+ * @return An unmodifiable {@link CharArraySet} with the reader's words
*/
public static CharArraySet getSnowballWordSet(InputStream stream, Charset charset)
throws IOException {
@@ -278,16 +269,12 @@ public class WordlistLoader {
*/
public static CharArrayMap<String> getStemDict(Reader reader, CharArrayMap<String> result)
throws IOException {
- BufferedReader br = null;
- try {
- br = getBufferedReader(reader);
+ try (BufferedReader br = getBufferedReader(reader)) {
String line;
while ((line = br.readLine()) != null) {
String[] wordstem = line.split("\t", 2);
result.put(wordstem[0], wordstem[1]);
}
- } finally {
- IOUtils.close(br);
}
return result;
}
@@ -302,12 +289,8 @@ public class WordlistLoader {
* @throws IOException If there is a low-level I/O error.
*/
public static List<String> getLines(InputStream stream, Charset charset) throws IOException {
- BufferedReader input = null;
ArrayList<String> lines;
- boolean success = false;
- try {
- input = getBufferedReader(IOUtils.getDecodingReader(stream, charset));
-
+ try (BufferedReader input = getBufferedReader(IOUtils.getDecodingReader(stream, charset))) {
lines = new ArrayList<>();
for (String word = null; (word = input.readLine()) != null; ) {
// skip initial bom marker
@@ -320,14 +303,7 @@ public class WordlistLoader {
if (word.length() == 0) continue;
lines.add(word);
}
- success = true;
return lines;
- } finally {
- if (success) {
- IOUtils.close(input);
- } else {
- IOUtils.closeWhileHandlingException(input);
- }
}
}