You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2022/01/03 14:09:21 UTC

[lucene] branch branch_9x updated: LUCENE-10349: Cleanup WordListLoader to use try-with-resources and make the default stop words unmodifiable (#577)

This is an automated email from the ASF dual-hosted git repository.

uschindler pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new 837e163  LUCENE-10349: Cleanup WordListLoader to use try-with-resources and make the default stop words unmodifiable (#577)
837e163 is described below

commit 837e163eeeb1bd538fc196ceb7f10b0e72a2af56
Author: Uwe Schindler <us...@apache.org>
AuthorDate: Mon Jan 3 15:07:44 2022 +0100

    LUCENE-10349: Cleanup WordListLoader to use try-with-resources and make the default stop words unmodifiable (#577)
---
 lucene/CHANGES.txt                                 |  5 ++
 .../lucene/analysis/ja/JapaneseAnalyzer.java       | 28 +++++-----
 .../org/apache/lucene/analysis/WordlistLoader.java | 64 +++++++---------------
 3 files changed, 39 insertions(+), 58 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 634eb97..2c87919 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -23,6 +23,8 @@ API Changes
   are caller sensitive in Java 11). Instead add utility method IOUtils#requireResourceNonNull(T)
   to test existence of resource based on null return value.  (Uwe Schindler, Dawid Weiss)
 
+* LUCENE-10349: WordlistLoader methods now return unmodifiable CharArraySets.  (Uwe Schindler)
+
 New Features
 ---------------------
 
@@ -111,6 +113,9 @@ Bug Fixes
 
 * LUCENE-10279: Fix equals in MultiRangeQuery. (Ignacio Vera)
 
+* LUCENE-10349: Fix all analyzers to behave according to their documentation:
+  getDefaultStopSet() methods now return unmodifiable CharArraySets.  (Uwe Schindler)
+
 Other
 ---------------------
 
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
index 8c79a2d..9411902 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
@@ -20,8 +20,8 @@ import java.io.IOException;
 import java.io.Reader;
 import java.io.UncheckedIOException;
 import java.nio.charset.StandardCharsets;
-import java.util.HashSet;
 import java.util.Set;
+import java.util.stream.Collectors;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
@@ -80,24 +80,24 @@ public class JapaneseAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_STOP_SET =
-            WordlistLoader.getWordSet(
-                IOUtils.getDecodingReader(
-                    IOUtils.requireResourceNonNull(
-                        JapaneseAnalyzer.class.getResourceAsStream("stopwords.txt"),
-                        "stopwords.txt"),
-                    StandardCharsets.UTF_8),
-                "#",
-                new CharArraySet(16, true)); // ignore case
+            CharArraySet.unmodifiableSet(
+                WordlistLoader.getWordSet(
+                    IOUtils.getDecodingReader(
+                        IOUtils.requireResourceNonNull(
+                            JapaneseAnalyzer.class.getResourceAsStream("stopwords.txt"),
+                            "stopwords.txt"),
+                        StandardCharsets.UTF_8),
+                    "#",
+                    new CharArraySet(16, true))); // ignore case
         final CharArraySet tagset =
             WordlistLoader.getWordSet(
                 IOUtils.requireResourceNonNull(
                     JapaneseAnalyzer.class.getResourceAsStream("stoptags.txt"), "stoptags.txt"),
                 "#");
-        DEFAULT_STOP_TAGS = new HashSet<>();
-        for (Object element : tagset) {
-          char[] chars = (char[]) element;
-          DEFAULT_STOP_TAGS.add(new String(chars));
-        }
+        DEFAULT_STOP_TAGS =
+            tagset.stream()
+                .map(ca -> new String((char[]) ca))
+                .collect(Collectors.toUnmodifiableSet());
       } catch (IOException ex) {
         // default set should always be present as it is part of the distribution (JAR)
         throw new UncheckedIOException("Unable to load default stopword or stoptag set", ex);
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java b/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
index 7437fc1..30ada92 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
@@ -50,15 +50,11 @@ public class WordlistLoader {
    * @return the given {@link CharArraySet} with the reader's words
    */
   public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException {
-    BufferedReader br = null;
-    try {
-      br = getBufferedReader(reader);
+    try (BufferedReader br = getBufferedReader(reader)) {
       String word = null;
       while ((word = br.readLine()) != null) {
         result.add(word.trim());
       }
-    } finally {
-      IOUtils.close(br);
     }
     return result;
   }
@@ -70,10 +66,11 @@ public class WordlistLoader {
    * StandardAnalyzer).
    *
    * @param reader Reader containing the wordlist
-   * @return A {@link CharArraySet} with the reader's words
+   * @return An unmodifiable {@link CharArraySet} with the reader's words
    */
   public static CharArraySet getWordSet(Reader reader) throws IOException {
-    return getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
+    return CharArraySet.unmodifiableSet(
+        getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false)));
   }
 
   /**
@@ -83,7 +80,7 @@ public class WordlistLoader {
    * uses LowerCaseFilter (like StandardAnalyzer).
    *
    * @param stream InputStream containing the wordlist
-   * @return A {@link CharArraySet} with the reader's words
+   * @return An unmodifiable {@link CharArraySet} with the reader's words
    */
   public static CharArraySet getWordSet(InputStream stream) throws IOException {
     return getWordSet(stream, StandardCharsets.UTF_8);
@@ -97,7 +94,7 @@ public class WordlistLoader {
    *
    * @param stream InputStream containing the wordlist
    * @param charset Charset of the wordlist
-   * @return A {@link CharArraySet} with the reader's words
+   * @return An unmodifiable {@link CharArraySet} with the reader's words
    */
   public static CharArraySet getWordSet(InputStream stream, Charset charset) throws IOException {
     return getWordSet(IOUtils.getDecodingReader(stream, charset));
@@ -116,17 +113,13 @@ public class WordlistLoader {
    */
   public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result)
       throws IOException {
-    BufferedReader br = null;
-    try {
-      br = getBufferedReader(reader);
+    try (BufferedReader br = getBufferedReader(reader)) {
       String word = null;
       while ((word = br.readLine()) != null) {
         if (word.startsWith(comment) == false) {
           result.add(word.trim());
         }
       }
-    } finally {
-      IOUtils.close(br);
     }
     return result;
   }
@@ -139,10 +132,11 @@ public class WordlistLoader {
    *
    * @param reader Reader containing the wordlist
    * @param comment The string representing a comment.
-   * @return A CharArraySet with the reader's words
+   * @return An unmodifiable CharArraySet with the reader's words
    */
   public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
-    return getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false));
+    return CharArraySet.unmodifiableSet(
+        getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false)));
   }
 
   /**
@@ -153,7 +147,7 @@ public class WordlistLoader {
    *
    * @param stream InputStream in UTF-8 encoding containing the wordlist
    * @param comment The string representing a comment.
-   * @return A CharArraySet with the reader's words
+   * @return An unmodifiable CharArraySet with the reader's words
    */
   public static CharArraySet getWordSet(InputStream stream, String comment) throws IOException {
     return getWordSet(stream, StandardCharsets.UTF_8, comment);
@@ -168,7 +162,7 @@ public class WordlistLoader {
    * @param stream InputStream containing the wordlist
    * @param charset Charset of the wordlist
    * @param comment The string representing a comment.
-   * @return A CharArraySet with the reader's words
+   * @return An unmodifiable CharArraySet with the reader's words
    */
   public static CharArraySet getWordSet(InputStream stream, Charset charset, String comment)
       throws IOException {
@@ -192,9 +186,7 @@ public class WordlistLoader {
    */
   public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result)
       throws IOException {
-    BufferedReader br = null;
-    try {
-      br = getBufferedReader(reader);
+    try (BufferedReader br = getBufferedReader(reader)) {
       String line = null;
       while ((line = br.readLine()) != null) {
         int comment = line.indexOf('|');
@@ -204,8 +196,6 @@ public class WordlistLoader {
           if (words[i].length() > 0) result.add(words[i]);
         }
       }
-    } finally {
-      IOUtils.close(br);
     }
     return result;
   }
@@ -222,10 +212,11 @@ public class WordlistLoader {
    * </ul>
    *
    * @param reader Reader containing a Snowball stopword list
-   * @return A {@link CharArraySet} with the reader's words
+   * @return An unmodifiable {@link CharArraySet} with the reader's words
    */
   public static CharArraySet getSnowballWordSet(Reader reader) throws IOException {
-    return getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
+    return CharArraySet.unmodifiableSet(
+        getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false)));
   }
 
   /**
@@ -240,7 +231,7 @@ public class WordlistLoader {
    * </ul>
    *
    * @param stream InputStream in UTF-8 encoding containing a Snowball stopword list
-   * @return A {@link CharArraySet} with the reader's words
+   * @return An unmodifiable {@link CharArraySet} with the reader's words
    */
   public static CharArraySet getSnowballWordSet(InputStream stream) throws IOException {
     return getSnowballWordSet(stream, StandardCharsets.UTF_8);
@@ -259,7 +250,7 @@ public class WordlistLoader {
    *
    * @param stream InputStream containing a Snowball stopword list
    * @param charset Charset of the stopword list
-   * @return A {@link CharArraySet} with the reader's words
+   * @return An unmodifiable {@link CharArraySet} with the reader's words
    */
   public static CharArraySet getSnowballWordSet(InputStream stream, Charset charset)
       throws IOException {
@@ -278,16 +269,12 @@ public class WordlistLoader {
    */
   public static CharArrayMap<String> getStemDict(Reader reader, CharArrayMap<String> result)
       throws IOException {
-    BufferedReader br = null;
-    try {
-      br = getBufferedReader(reader);
+    try (BufferedReader br = getBufferedReader(reader)) {
       String line;
       while ((line = br.readLine()) != null) {
         String[] wordstem = line.split("\t", 2);
         result.put(wordstem[0], wordstem[1]);
       }
-    } finally {
-      IOUtils.close(br);
     }
     return result;
   }
@@ -302,12 +289,8 @@ public class WordlistLoader {
    * @throws IOException If there is a low-level I/O error.
    */
   public static List<String> getLines(InputStream stream, Charset charset) throws IOException {
-    BufferedReader input = null;
     ArrayList<String> lines;
-    boolean success = false;
-    try {
-      input = getBufferedReader(IOUtils.getDecodingReader(stream, charset));
-
+    try (BufferedReader input = getBufferedReader(IOUtils.getDecodingReader(stream, charset))) {
       lines = new ArrayList<>();
       for (String word = null; (word = input.readLine()) != null; ) {
         // skip initial bom marker
@@ -320,14 +303,7 @@ public class WordlistLoader {
         if (word.length() == 0) continue;
         lines.add(word);
       }
-      success = true;
       return lines;
-    } finally {
-      if (success) {
-        IOUtils.close(input);
-      } else {
-        IOUtils.closeWhileHandlingException(input);
-      }
     }
   }