You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2023/05/18 18:10:06 UTC
[lucene] branch branch_9x updated: GITHUB-12291: Skip blank lines from stopwords list. (#12299)
This is an automated email from the ASF dual-hosted git repository.
uschindler pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new d1db5583c8d GITHUB-12291: Skip blank lines from stopwords list. (#12299)
d1db5583c8d is described below
commit d1db5583c8dfc45843d69f9485ace6777e1a08a8
Author: Jerry Chin <me...@gmail.com>
AuthorDate: Thu May 18 22:58:32 2023 +0800
GITHUB-12291: Skip blank lines from stopwords list. (#12299)
---
lucene/CHANGES.txt | 3 ++-
.../apache/lucene/analysis/cn/smart/stopwords.txt | 2 --
.../org/apache/lucene/analysis/WordlistLoader.java | 24 ++++++++++++++--------
.../apache/lucene/analysis/TestWordlistLoader.java | 2 +-
4 files changed, 18 insertions(+), 13 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index c97d0ade648..4d86f3e7f1c 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -35,7 +35,8 @@ Optimizations
Bug Fixes
---------------------
-(No changes)
+
+* GITHUB#12291: Skip blank lines from stopwords list. (Jerry Chin)
Other
---------------------
diff --git a/lucene/analysis/smartcn/src/resources/org/apache/lucene/analysis/cn/smart/stopwords.txt b/lucene/analysis/smartcn/src/resources/org/apache/lucene/analysis/cn/smart/stopwords.txt
index fb0d71ad7d2..65bcfd4e1b6 100644
--- a/lucene/analysis/smartcn/src/resources/org/apache/lucene/analysis/cn/smart/stopwords.txt
+++ b/lucene/analysis/smartcn/src/resources/org/apache/lucene/analysis/cn/smart/stopwords.txt
@@ -53,7 +53,5 @@ $
●
// the line below contains an IDEOGRAPHIC SPACE character (Used as a space in Chinese)
-
//////////////// English Stop Words ////////////////
-
//////////////// Chinese Stop Words ////////////////
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java b/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
index 30ada92eb39..8e18f4ad76d 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
@@ -40,9 +40,9 @@ public class WordlistLoader {
private WordlistLoader() {}
/**
- * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting leading
- * and trailing whitespace). Every line of the Reader should contain only one word. The words need
- * to be in lowercase if you make use of an Analyzer which uses LowerCaseFilter (like
+ * Reads lines from a Reader and adds every non-blank line as an entry to a CharArraySet (omitting
+ * leading and trailing whitespace). Every line of the Reader should contain only one word. The
+ * words need to be in lowercase if you make use of an Analyzer which uses LowerCaseFilter (like
* StandardAnalyzer).
*
* @param reader Reader containing the wordlist
@@ -53,7 +53,10 @@ public class WordlistLoader {
try (BufferedReader br = getBufferedReader(reader)) {
String word = null;
while ((word = br.readLine()) != null) {
- result.add(word.trim());
+ word = word.trim();
+ // skip blank lines
+ if (word.isEmpty()) continue;
+ result.add(word);
}
}
return result;
@@ -101,10 +104,10 @@ public class WordlistLoader {
}
/**
- * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet
- * (omitting leading and trailing whitespace). Every line of the Reader should contain only one
- * word. The words need to be in lowercase if you make use of an Analyzer which uses
- * LowerCaseFilter (like StandardAnalyzer).
+ * Reads lines from a Reader and adds every non-blank, non-comment line as an entry to a
+ * CharArraySet (omitting leading and trailing whitespace). Every line of the Reader should
+ * contain only one word. The words need to be in lowercase if you make use of an Analyzer which
+ * uses LowerCaseFilter (like StandardAnalyzer).
*
* @param reader Reader containing the wordlist
* @param comment The string representing a comment.
@@ -117,7 +120,10 @@ public class WordlistLoader {
String word = null;
while ((word = br.readLine()) != null) {
if (word.startsWith(comment) == false) {
- result.add(word.trim());
+ word = word.trim();
+ // skip blank lines
+ if (word.isEmpty()) continue;
+ result.add(word);
}
}
}
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java b/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java
index 7af64c0011e..4747c86834e 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java
@@ -24,7 +24,7 @@ import org.apache.lucene.tests.util.LuceneTestCase;
public class TestWordlistLoader extends LuceneTestCase {
public void testWordlistLoading() throws IOException {
- String s = "ONE\n two \nthree";
+ String s = "ONE\n two \nthree\n\n";
CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s));
checkSet(wordSet1);
CharArraySet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)));