You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2021/10/21 14:34:24 UTC
[lucene-solr] branch branch_8x updated: LUCENE-10008: Respect
ignoreCase flag in CommonGramsFilterFactory (#2573)
This is an automated email from the ASF dual-hosted git repository.
mikemccand pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new 641ac0b LUCENE-10008: Respect ignoreCase flag in CommonGramsFilterFactory (#2573)
641ac0b is described below
commit 641ac0b36a9257db3a6d2f9d12f422cfe5fddbc3
Author: Vigya Sharma <vi...@gmail.com>
AuthorDate: Thu Oct 21 07:34:04 2021 -0700
LUCENE-10008: Respect ignoreCase flag in CommonGramsFilterFactory (#2573)
---
lucene/CHANGES.txt | 2 +
.../commongrams/CommonGramsFilterFactory.java | 48 ++-------
.../lucene/analysis/core/StopFilterFactory.java | 52 ++-------
.../en/AbstractWordsFileFilterFactory.java | 118 +++++++++++++++++++++
.../miscellaneous/KeepWordFilterFactory.java | 36 ++-----
.../commongrams/TestCommonGramsFilterFactory.java | 72 +++++++++----
.../lucene/analysis/commongrams/common-1.txt | 17 +++
.../lucene/analysis/commongrams/common-2.txt | 17 +++
.../analysis/commongrams/common-snowball.txt | 10 ++
.../miscellaneous/TestKeepFilterFactory.java | 27 +++++
.../analysis/miscellaneous/keep-snowball.txt | 10 ++
11 files changed, 280 insertions(+), 129 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 291d080..be50560 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -39,6 +39,8 @@ Bug Fixes
* LUCENE-10134: ConcurrentSortedSetDocValuesFacetCounts shouldn't share liveDocs Bits across threads.
(Ankur Goel)
+* LUCENE-10008: Respect ignoreCase in CommonGramsFilterFactory (Vigya Sharma)
+
Build
---------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
index 523601b..c42d88f 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
@@ -16,17 +16,13 @@
*/
package org.apache.lucene.analysis.commongrams;
-
-import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
-import org.apache.lucene.analysis.util.ResourceLoader;
-import org.apache.lucene.analysis.util.ResourceLoaderAware;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Constructs a {@link CommonGramsFilter}.
@@ -41,55 +37,29 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* @since 3.1
* @lucene.spi {@value #NAME}
*/
-public class CommonGramsFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+public class CommonGramsFilterFactory extends AbstractWordsFileFilterFactory {
/** SPI name */
public static final String NAME = "commonGrams";
- // TODO: shared base class for Stop/Keep/CommonGrams?
- private CharArraySet commonWords;
- private final String commonWordFiles;
- private final String format;
- private final boolean ignoreCase;
-
/** Creates a new CommonGramsFilterFactory */
public CommonGramsFilterFactory(Map<String,String> args) {
super(args);
- commonWordFiles = get(args, "words");
- format = get(args, "format");
- ignoreCase = getBoolean(args, "ignoreCase", false);
- if (!args.isEmpty()) {
- throw new IllegalArgumentException("Unknown parameters: " + args);
- }
- }
-
- @Override
- public void inform(ResourceLoader loader) throws IOException {
- if (commonWordFiles != null) {
- if ("snowball".equalsIgnoreCase(format)) {
- commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
- } else {
- commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
- }
- } else {
- commonWords = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET;
- }
}
- public boolean isIgnoreCase() {
- return ignoreCase;
+ public CharArraySet getCommonWords() {
+ return getWords();
}
- public CharArraySet getCommonWords() {
- return commonWords;
+ @Override
+ protected CharArraySet createDefaultWords() {
+ return new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, isIgnoreCase());
}
@Override
public TokenFilter create(TokenStream input) {
- CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords);
+ CommonGramsFilter commonGrams = new CommonGramsFilter(input, getWords());
return commonGrams;
}
}
-
-
-
+
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
index a2b5b80..b6550f3 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
@@ -16,17 +16,13 @@
*/
package org.apache.lucene.analysis.core;
-
-import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
-import org.apache.lucene.analysis.util.ResourceLoader;
-import org.apache.lucene.analysis.util.ResourceLoaderAware;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link StopFilter}.
@@ -73,59 +69,29 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* @since 3.1
* @lucene.spi {@value #NAME}
*/
-public class StopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+public class StopFilterFactory extends AbstractWordsFileFilterFactory {
/** SPI name */
public static final String NAME = "stop";
- public static final String FORMAT_WORDSET = "wordset";
- public static final String FORMAT_SNOWBALL = "snowball";
-
- private CharArraySet stopWords;
- private final String stopWordFiles;
- private final String format;
- private final boolean ignoreCase;
-
/** Creates a new StopFilterFactory */
public StopFilterFactory(Map<String,String> args) {
super(args);
- stopWordFiles = get(args, "words");
- format = get(args, "format", (null == stopWordFiles ? null : FORMAT_WORDSET));
- ignoreCase = getBoolean(args, "ignoreCase", false);
- if (!args.isEmpty()) {
- throw new IllegalArgumentException("Unknown parameters: " + args);
- }
- }
-
- @Override
- public void inform(ResourceLoader loader) throws IOException {
- if (stopWordFiles != null) {
- if (FORMAT_WORDSET.equalsIgnoreCase(format)) {
- stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
- } else if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) {
- stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
- } else {
- throw new IllegalArgumentException("Unknown 'format' specified for 'words' file: " + format);
- }
- } else {
- if (null != format) {
- throw new IllegalArgumentException("'format' can not be specified w/o an explicit 'words' file: " + format);
- }
- stopWords = new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
- }
}
- public boolean isIgnoreCase() {
- return ignoreCase;
+ public CharArraySet getStopWords() {
+ return getWords();
}
- public CharArraySet getStopWords() {
- return stopWords;
+ @Override
+ protected CharArraySet createDefaultWords() {
+ return new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, isIgnoreCase());
}
@Override
public TokenStream create(TokenStream input) {
- StopFilter stopFilter = new StopFilter(input,stopWords);
+ StopFilter stopFilter = new StopFilter(input, getWords());
return stopFilter;
}
}
+
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java
new file mode 100644
index 0000000..b74b70f
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.en;
+
+import java.io.IOException;
+import java.util.Map;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+
+/**
+ * Abstract parent class for analysis factories that accept a words file (e.g. stopwords) as input.
+ *
+ * <p>Concrete implementations can leverage the following input attributes. All attributes are
+ * optional:
+ *
+ * <ul>
+ * <li><code>ignoreCase</code> defaults to <code>false</code>
+ * <li><code>words</code> should be the name of a words file to parse; if not specified, the
+ * factory will use the value provided by the {@link #createDefaultWords()} implementation in
+ * the concrete subclass.
+ * <li><code>format</code> defines how the <code>words</code> file will be parsed, and defaults to
+ * <code>wordset</code>. If <code>words</code> is not specified, then <code>format</code> must
+ * not be specified.
+ * </ul>
+ *
+ * <p>The valid values for the <code>format</code> option are:
+ *
+ * <ul>
+ * <li><code>wordset</code> - This is the default format, which supports one word per line
+ * (including any intra-word whitespace) and allows whole line comments beginning with the "#"
+ * character. Blank lines are ignored. See {@link WordlistLoader#getLines
+ * WordlistLoader.getLines} for details.
+ * <li><code>snowball</code> - This format allows for multiple words specified on each line, and
+ * trailing comments may be specified using the vertical line ("|"). Blank lines are
+ * ignored. See {@link WordlistLoader#getSnowballWordSet WordlistLoader.getSnowballWordSet}
+ * for details.
+ * </ul>
+ */
+public abstract class AbstractWordsFileFilterFactory extends TokenFilterFactory
+ implements ResourceLoaderAware {
+
+ public static final String FORMAT_WORDSET = "wordset";
+ public static final String FORMAT_SNOWBALL = "snowball";
+
+ private CharArraySet words;
+ private final String wordFiles;
+ private final String format;
+ private final boolean ignoreCase;
+
+ /** Reads the <code>words</code>, <code>format</code> and <code>ignoreCase</code> arguments; any leftover argument is rejected. */
+ public AbstractWordsFileFilterFactory(Map<String, String> args) {
+ super(args);
+ wordFiles = get(args, "words");
+ format = get(args, "format", (null == wordFiles ? null : FORMAT_WORDSET)); // no default format without a words file
+ ignoreCase = getBoolean(args, "ignoreCase", false);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ /** Loads the word set from the <code>words</code> files in the configured format, or uses {@link #createDefaultWords()} when none was given. */
+ @Override
+ public void inform(ResourceLoader loader) throws IOException {
+ if (wordFiles != null) {
+ if (FORMAT_WORDSET.equalsIgnoreCase(format)) {
+ words = getWordSet(loader, wordFiles, ignoreCase);
+ } else if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) {
+ words = getSnowballWordSet(loader, wordFiles, ignoreCase);
+ } else {
+ throw new IllegalArgumentException(
+ "Unknown 'format' specified for 'words' file: " + format);
+ }
+ } else {
+ if (null != format) {
+ throw new IllegalArgumentException(
+ "'format' can not be specified w/o an explicit 'words' file: " + format);
+ }
+ words = createDefaultWords();
+ }
+ }
+
+ /** Creates the word set to use when no <code>words</code> file is configured. */
+ protected abstract CharArraySet createDefaultWords();
+
+ public CharArraySet getWords() {
+ return words;
+ }
+
+ public String getWordFiles() {
+ return wordFiles;
+ }
+
+ public String getFormat() {
+ return format;
+ }
+
+ public boolean isIgnoreCase() {
+ return ignoreCase;
+ }
+}
+
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java
index 64a83ca..69200c6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java
@@ -16,15 +16,11 @@
*/
package org.apache.lucene.analysis.miscellaneous;
-
-import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.ResourceLoader;
-import org.apache.lucene.analysis.util.ResourceLoaderAware;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
/**
* Factory for {@link KeepWordFilter}.
@@ -39,48 +35,30 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* @since 3.1
* @lucene.spi {@value #NAME}
*/
-public class KeepWordFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+public class KeepWordFilterFactory extends AbstractWordsFileFilterFactory {
/** SPI name */
public static final String NAME = "keepWord";
- private final boolean ignoreCase;
- private final String wordFiles;
- private CharArraySet words;
-
/** Creates a new KeepWordFilterFactory */
public KeepWordFilterFactory(Map<String,String> args) {
super(args);
- wordFiles = get(args, "words");
- ignoreCase = getBoolean(args, "ignoreCase", false);
- if (!args.isEmpty()) {
- throw new IllegalArgumentException("Unknown parameters: " + args);
- }
}
@Override
- public void inform(ResourceLoader loader) throws IOException {
- if (wordFiles != null) {
- words = getWordSet(loader, wordFiles, ignoreCase);
- }
- }
-
- public boolean isIgnoreCase() {
- return ignoreCase;
- }
-
- public CharArraySet getWords() {
- return words;
+ protected CharArraySet createDefaultWords() {
+ return null;
}
@Override
public TokenStream create(TokenStream input) {
// if the set is null, it means it was empty
- if (words == null) {
+ if (getWords() == null) {
return input;
} else {
- final TokenStream filter = new KeepWordFilter(input, words);
+ final TokenStream filter = new KeepWordFilter(input, getWords());
return filter;
}
}
}
+
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
index 5bcfb3d..9fa81f4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
@@ -23,25 +23,26 @@ import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.TestStopFilterFactory;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.util.Version;
-/**
- * Tests pretty much copied from StopFilterFactoryTest We use the test files
- * used by the StopFilterFactoryTest TODO: consider creating separate test files
- * so this won't break if stop filter test files change
- **/
public class TestCommonGramsFilterFactory extends BaseTokenStreamFactoryTestCase {
public void testInform() throws Exception {
- ResourceLoader loader = new ClasspathResourceLoader(TestStopFilterFactory.class);
+ ResourceLoader loader = new ClasspathResourceLoader(getClass());
assertTrue("loader is null and it shouldn't be", loader != null);
- CommonGramsFilterFactory factory = (CommonGramsFilterFactory) tokenFilterFactory("CommonGrams", Version.LATEST, loader,
- "words", "stop-1.txt",
- "ignoreCase", "true");
+ CommonGramsFilterFactory factory =
+ (CommonGramsFilterFactory)
+ tokenFilterFactory(
+ "CommonGrams",
+ Version.LATEST,
+ loader,
+ "words",
+ "common-1.txt",
+ "ignoreCase",
+ "true");
CharArraySet words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 2,
@@ -49,9 +50,16 @@ public class TestCommonGramsFilterFactory extends BaseTokenStreamFactoryTestCase
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
.isIgnoreCase() == true);
- factory = (CommonGramsFilterFactory) tokenFilterFactory("CommonGrams", Version.LATEST, loader,
- "words", "stop-1.txt, stop-2.txt",
- "ignoreCase", "true");
+ factory =
+ (CommonGramsFilterFactory)
+ tokenFilterFactory(
+ "CommonGrams",
+ Version.LATEST,
+ loader,
+ "words",
+ "common-1.txt, common-2.txt",
+ "ignoreCase",
+ "true");
words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 4,
@@ -59,10 +67,18 @@ public class TestCommonGramsFilterFactory extends BaseTokenStreamFactoryTestCase
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
.isIgnoreCase() == true);
- factory = (CommonGramsFilterFactory) tokenFilterFactory("CommonGrams", Version.LATEST, loader,
- "words", "stop-snowball.txt",
- "format", "snowball",
- "ignoreCase", "true");
+ factory =
+ (CommonGramsFilterFactory)
+ tokenFilterFactory(
+ "CommonGrams",
+ Version.LATEST,
+ loader,
+ "words",
+ "common-snowball.txt",
+ "format",
+ "snowball",
+ "ignoreCase",
+ "true");
words = factory.getCommonWords();
assertEquals(8, words.size());
assertTrue(words.contains("he"));
@@ -89,7 +105,26 @@ public class TestCommonGramsFilterFactory extends BaseTokenStreamFactoryTestCase
assertTokenStreamContents(stream,
new String[] { "testing", "testing_the", "the", "the_factory", "factory" });
}
-
+
+ /**
+ * Test that ignoreCase flag is honored when no words are provided and default stopwords are used.
+ */
+ public void testIgnoreCase() throws Exception {
+ ResourceLoader loader = new ClasspathResourceLoader(getClass());
+ CommonGramsFilterFactory factory =
+ (CommonGramsFilterFactory)
+ tokenFilterFactory("CommonGrams", Version.LATEST, loader, "ignoreCase", "true");
+ CharArraySet words = factory.getCommonWords();
+ assertTrue("words is null and it shouldn't be", words != null);
+ assertTrue(words.contains("the"));
+ assertTrue(words.contains("The"));
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ tokenizer.setReader(new StringReader("testing The factory"));
+ TokenStream stream = factory.create(tokenizer);
+ assertTokenStreamContents(
+ stream, new String[] {"testing", "testing_The", "The", "The_factory", "factory"});
+ }
+
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
@@ -98,3 +133,4 @@ public class TestCommonGramsFilterFactory extends BaseTokenStreamFactoryTestCase
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
+
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-1.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-1.txt
new file mode 100644
index 0000000..8dfe809
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-1.txt
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+foo
+bar
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-2.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-2.txt
new file mode 100644
index 0000000..646b7ff
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-2.txt
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+junk
+more
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-snowball.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-snowball.txt
new file mode 100644
index 0000000..1c0c6f5
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-snowball.txt
@@ -0,0 +1,10 @@
+ | This is a file in snowball format, empty lines are ignored, '|' is a comment
+ | Additionally, multiple words can be on the same line, allowing stopwords to be
+ | arranged in tables (useful in some languages where they might inflect)
+
+ | fictitious table below
+
+|third person singular
+|Subject Object Possessive Reflexive
+he him his himself| masculine
+she her hers herself| feminine
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java
index dde6f94..ea0de1c 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java
@@ -40,6 +40,32 @@ public class TestKeepFilterFactory extends BaseTokenStreamFactoryTestCase {
words = factory.getWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
+
+ factory =
+ (KeepWordFilterFactory)
+ tokenFilterFactory(
+ "KeepWord",
+ "words",
+ "keep-snowball.txt",
+ "format",
+ "snowball",
+ "ignoreCase",
+ "true");
+ words = factory.getWords();
+ assertEquals(8, words.size());
+ assertTrue(words.contains("he"));
+ assertTrue(words.contains("him"));
+ assertTrue(words.contains("his"));
+ assertTrue(words.contains("himself"));
+ assertTrue(words.contains("she"));
+ assertTrue(words.contains("her"));
+ assertTrue(words.contains("hers"));
+ assertTrue(words.contains("herself"));
+
+ // defaults
+ factory = (KeepWordFilterFactory) tokenFilterFactory("KeepWord");
+ assertTrue(factory.getWords() == null);
+ assertEquals(false, factory.isIgnoreCase());
}
/** Test that bogus arguments result in exception */
@@ -50,3 +76,4 @@ public class TestKeepFilterFactory extends BaseTokenStreamFactoryTestCase {
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
+
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-snowball.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-snowball.txt
new file mode 100644
index 0000000..1c0c6f5
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-snowball.txt
@@ -0,0 +1,10 @@
+ | This is a file in snowball format, empty lines are ignored, '|' is a comment
+ | Additionally, multiple words can be on the same line, allowing stopwords to be
+ | arranged in tables (useful in some languages where they might inflect)
+
+ | fictitious table below
+
+|third person singular
+|Subject Object Possessive Reflexive
+he him his himself| masculine
+she her hers herself| feminine