You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/02/05 20:54:19 UTC
svn commit: r1240791 - in /lucene/dev/branches/branch_3x: ./ solr/
solr/core/ solr/core/src/java/org/apache/solr/analysis/
solr/core/src/test-files/solr/conf/
solr/core/src/test/org/apache/solr/analysis/
Author: rmuir
Date: Sun Feb 5 19:54:18 2012
New Revision: 1240791
URL: http://svn.apache.org/viewvc?rev=1240791&view=rev
Log:
SOLR-1860: support snowball format in stoplists
Added:
lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/stop-snowball.txt
- copied unchanged from r1240784, lucene/dev/trunk/solr/core/src/test-files/solr/conf/stop-snowball.txt
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/solr/ (props changed)
lucene/dev/branches/branch_3x/solr/CHANGES.txt
lucene/dev/branches/branch_3x/solr/core/ (props changed)
lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/BaseTokenStreamFactory.java
lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java
lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java
lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/StopFilterFactory.java
lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java
lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java
lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestStopFilterFactory.java
Modified: lucene/dev/branches/branch_3x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/CHANGES.txt?rev=1240791&r1=1240790&r2=1240791&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/CHANGES.txt Sun Feb 5 19:54:18 2012
@@ -84,6 +84,10 @@ New Features
* LUCENE-3305, SOLR-3056: Added Kuromoji morphological analyzer for Japanese.
(Christian Moen, Masaru Hasegawa via Robert Muir)
+* SOLR-1860: StopFilterFactory, CommonGramsFilterFactory, and
+ CommonGramsQueryFilterFactory can optionally read stopwords in Snowball
+ format (specify format="snowball"). (Robert Muir)
+
Optimizations
----------------------
* SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter
Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/BaseTokenStreamFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/BaseTokenStreamFactory.java?rev=1240791&r1=1240790&r2=1240791&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/BaseTokenStreamFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/BaseTokenStreamFactory.java Sun Feb 5 19:54:18 2012
@@ -23,11 +23,18 @@ import org.apache.solr.core.Config;
import org.apache.solr.schema.IndexSchema;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
@@ -129,4 +136,34 @@ abstract class BaseTokenStreamFactory {
}
return words;
}
+
+ /** Same as {@link #getWordSet(ResourceLoader, String, boolean)}, except each file is decoded as
+ * UTF-8 (malformed input is reported) and parsed in Snowball format; null if no files are listed. */
+ protected CharArraySet getSnowballWordSet(ResourceLoader loader,
+ String wordFiles, boolean ignoreCase) throws IOException {
+ assureMatchVersion();
+ List<String> files = StrUtils.splitFileNames(wordFiles);
+ CharArraySet words = null;
+ if (files.size() > 0) {
+ // The default stopword list has about 35 words, but start smaller than that:
+ // the set is initially sized at 10 entries per listed file.
+ words = new CharArraySet(luceneMatchVersion,
+ files.size() * 10, ignoreCase);
+ for (String file : files) {
+ InputStream stream = null;
+ Reader reader = null;
+ try {
+ stream = loader.openResource(file.trim());
+ CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder()
+ .onMalformedInput(CodingErrorAction.REPORT)
+ .onUnmappableCharacter(CodingErrorAction.REPORT);
+ reader = new InputStreamReader(stream, decoder);
+ WordlistLoader.getSnowballWordSet(reader, words);
+ } finally {
+ IOUtils.closeWhileHandlingException(reader, stream);
+ }
+ }
+ }
+ return words;
+ }
}
Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java?rev=1240791&r1=1240790&r2=1240791&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java Sun Feb 5 19:54:18 2012
@@ -49,7 +49,11 @@ public class CommonGramsFilterFactory ex
if (commonWordFiles != null) {
try {
- commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
+ if ("snowball".equalsIgnoreCase(args.get("format"))) {
+ commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
+ } else {
+ commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
+ }
} catch (IOException e) {
throw new RuntimeException(e);
}
Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java?rev=1240791&r1=1240790&r2=1240791&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java Sun Feb 5 19:54:18 2012
@@ -55,7 +55,11 @@ public class CommonGramsQueryFilterFacto
if (commonWordFiles != null) {
try {
- commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
+ if ("snowball".equalsIgnoreCase(args.get("format"))) {
+ commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
+ } else {
+ commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
+ }
} catch (IOException e) {
throw new RuntimeException(e);
}
Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/StopFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/StopFilterFactory.java?rev=1240791&r1=1240790&r2=1240791&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/StopFilterFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/StopFilterFactory.java Sun Feb 5 19:54:18 2012
@@ -55,7 +55,11 @@ public class StopFilterFactory extends B
if (stopWordFiles != null) {
try {
- stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
+ if ("snowball".equalsIgnoreCase(args.get("format"))) {
+ stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
+ } else {
+ stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
+ }
} catch (IOException e) {
throw new RuntimeException(e);
}
Modified: lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java?rev=1240791&r1=1240790&r2=1240791&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java Sun Feb 5 19:54:18 2012
@@ -62,6 +62,21 @@ public class CommonGramsFilterFactoryTes
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
.isIgnoreCase() == true);
+ factory = new CommonGramsFilterFactory();
+ args.put("words", "stop-snowball.txt");
+ args.put("format", "snowball");
+ factory.init(args);
+ factory.inform(loader);
+ words = factory.getCommonWords();
+ assertEquals(8, words.size());
+ assertTrue(words.contains("he"));
+ assertTrue(words.contains("him"));
+ assertTrue(words.contains("his"));
+ assertTrue(words.contains("himself"));
+ assertTrue(words.contains("she"));
+ assertTrue(words.contains("her"));
+ assertTrue(words.contains("hers"));
+ assertTrue(words.contains("herself"));
}
/**
Modified: lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java?rev=1240791&r1=1240790&r2=1240791&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java Sun Feb 5 19:54:18 2012
@@ -61,6 +61,21 @@ public class CommonGramsQueryFilterFacto
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
.isIgnoreCase() == true);
+ factory = new CommonGramsQueryFilterFactory();
+ args.put("words", "stop-snowball.txt");
+ args.put("format", "snowball");
+ factory.init(args);
+ factory.inform(loader);
+ words = factory.getCommonWords();
+ assertEquals(8, words.size());
+ assertTrue(words.contains("he"));
+ assertTrue(words.contains("him"));
+ assertTrue(words.contains("his"));
+ assertTrue(words.contains("himself"));
+ assertTrue(words.contains("she"));
+ assertTrue(words.contains("her"));
+ assertTrue(words.contains("hers"));
+ assertTrue(words.contains("herself"));
}
/**
Modified: lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestStopFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestStopFilterFactory.java?rev=1240791&r1=1240790&r2=1240791&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestStopFilterFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestStopFilterFactory.java Sun Feb 5 19:54:18 2012
@@ -53,6 +53,20 @@ public class TestStopFilterFactory exten
assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);
-
+ factory = new StopFilterFactory();
+ args.put("words", "stop-snowball.txt");
+ args.put("format", "snowball");
+ factory.init(args);
+ factory.inform(loader);
+ words = factory.getStopWords();
+ assertEquals(8, words.size());
+ assertTrue(words.contains("he"));
+ assertTrue(words.contains("him"));
+ assertTrue(words.contains("his"));
+ assertTrue(words.contains("himself"));
+ assertTrue(words.contains("she"));
+ assertTrue(words.contains("her"));
+ assertTrue(words.contains("hers"));
+ assertTrue(words.contains("herself"));
}
}