You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by eh...@apache.org on 2009/02/20 04:24:03 UTC
svn commit: r746122 - in /lucene/solr/trunk: example/solr/conf/
src/java/org/apache/solr/analysis/ src/test/org/apache/solr/analysis/
Author: ehatcher
Date: Fri Feb 20 03:23:58 2009
New Revision: 746122
URL: http://svn.apache.org/viewvc?rev=746122&view=rev
Log:
SOLR-1026: Add protected words support to SnowballPorterFilterFactory. Deprecated EnglishPorterFilterFactory and switched example to use English SnowballPorterFilterFactory instead
Added:
lucene/solr/trunk/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java
Modified:
lucene/solr/trunk/example/solr/conf/schema.xml
lucene/solr/trunk/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java
lucene/solr/trunk/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java
Modified: lucene/solr/trunk/example/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/example/solr/conf/schema.xml?rev=746122&r1=746121&r2=746122&view=diff
==============================================================================
--- lucene/solr/trunk/example/solr/conf/schema.xml (original)
+++ lucene/solr/trunk/example/solr/conf/schema.xml Fri Feb 20 03:23:58 2009
@@ -178,7 +178,7 @@
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<analyzer type="query">
@@ -191,7 +191,7 @@
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
@@ -206,7 +206,7 @@
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java?rev=746122&r1=746121&r2=746122&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java Fri Feb 20 03:23:58 2009
@@ -31,6 +31,8 @@
/**
* @version $Id$
+ *
+ * @deprecated Use SnowballPorterFilterFactory with language="English" instead
*/
public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
public static final String PROTECTED_TOKENS = "protected";
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java?rev=746122&r1=746121&r2=746122&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java Fri Feb 20 03:23:58 2009
@@ -17,9 +17,18 @@
package org.apache.solr.analysis;
import java.util.Map;
+import java.util.List;
+import java.io.File;
+import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.tartarus.snowball.SnowballProgram;
/**
@@ -30,10 +39,40 @@
*
* @version $Id$
*/
-public class SnowballPorterFilterFactory extends BaseTokenFilterFactory {
+public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+ public static final String PROTECTED_TOKENS = "protected";
+
private String language = "English";
private Class stemClass;
+
+ public void inform(ResourceLoader loader) {
+ String wordFiles = args.get(PROTECTED_TOKENS);
+ if (wordFiles != null) {
+ try {
+ File protectedWordFiles = new File(wordFiles);
+ if (protectedWordFiles.exists()) {
+ List<String> wlist = loader.getLines(wordFiles);
+ // Build the set directly from the loaded lines; no cast is involved (false is presumably CharArraySet's ignoreCase flag)
+ protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
+ } else {
+ List<String> files = StrUtils.splitFileNames(wordFiles);
+ for (String file : files) {
+ List<String> wlist = loader.getLines(file.trim());
+ if (protectedWords == null)
+ protectedWords = new CharArraySet(wlist, false);
+ else
+ protectedWords.addAll(wlist);
+ }
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ private CharArraySet protectedWords = null;
+
@Override
public void init(Map<String, String> args) {
super.init(args);
@@ -47,14 +86,61 @@
}
}
- public SnowballFilter create(TokenStream input) {
+ public SnowballPorterFilter create(TokenStream input) {
SnowballProgram program;
try {
program = (SnowballProgram)stemClass.newInstance();
} catch (Exception e) {
throw new RuntimeException("Error instantiating stemmer for language " + language + "from class " +stemClass, e);
}
- return new SnowballFilter(input, program);
+ return new SnowballPorterFilter(input, program, protectedWords);
+ }
+}
+
+class SnowballPorterFilter extends TokenFilter {
+ private final CharArraySet protWords;
+ private SnowballProgram stemmer;
+
+ public SnowballPorterFilter(TokenStream source, SnowballProgram stemmer, CharArraySet protWords) {
+ super(source);
+ this.protWords = protWords;
+ this.stemmer = stemmer;
+ }
+
+
+ /**
+ * the original code from lucene sandbox
+ * public final Token next() throws IOException {
+ * Token token = input.next();
+ * if (token == null)
+ * return null;
+ * stemmer.setCurrent(token.termText());
+ * try {
+ * stemMethod.invoke(stemmer, EMPTY_ARGS);
+ * } catch (Exception e) {
+ * throw new RuntimeException(e.toString());
+ * }
+ * return new Token(stemmer.getCurrent(),
+ * token.startOffset(), token.endOffset(), token.type());
+ * }
+ */
+
+ @Override
+ public Token next(Token token) throws IOException {
+ Token result = input.next(token);
+ if (result != null) {
+ char[] termBuffer = result.termBuffer();
+ int len = result.termLength();
+ // if protected, don't stem. use this to avoid stemming collisions.
+ if (protWords != null && protWords.contains(termBuffer, 0, len)) {
+ return result;
+ }
+ stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
+ stemmer.stem();
+ String newstr = stemmer.getCurrent();
+ result.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
+ }
+ return result;
}
}
Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java?rev=746122&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java Fri Feb 20 03:23:58 2009
@@ -0,0 +1,97 @@
+package org.apache.solr.analysis;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.solr.common.ResourceLoader;
+import org.tartarus.snowball.ext.EnglishStemmer;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.ArrayList;
+import java.util.Collections;
+
+public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
+ /** With no "protected" argument, every token must equal the raw EnglishStemmer output. */
+ public void test() throws IOException {
+ EnglishStemmer stemmer = new EnglishStemmer();
+ String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
+ StringBuilder gold = new StringBuilder();
+ for (String aTest : test) {
+ stemmer.setCurrent(aTest);
+ stemmer.stem();
+ gold.append(stemmer.getCurrent()).append(' ');
+ }
+
+ SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
+ Map<String, String> args = new HashMap<String, String>();
+ args.put("language", "English");
+
+ factory.init(args);
+ factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
+ String out = tsToString(factory.create(new IterTokenStream(test)));
+ assertEquals(gold.toString().trim(), out);
+ }
+ /** "banks" and "fledgling" are protected and must pass through unstemmed; all other tokens are stemmed. */
+ public void testProtected() throws Exception {
+ EnglishStemmer stemmer = new EnglishStemmer();
+ String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
+ StringBuilder gold = new StringBuilder();
+ for (int i = 0; i < test.length; i++) {
+ if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
+ stemmer.setCurrent(test[i]);
+ stemmer.stem();
+ gold.append(stemmer.getCurrent()).append(' ');
+ } else {
+ gold.append(test[i]).append(' ');
+ }
+ }
+ SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory(); // the factory under test, not the deprecated English one
+ Map<String, String> args = new HashMap<String, String>();
+ args.put("language", "English");
+ args.put(SnowballPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
+ factory.init(args);
+ List<String> lines = new ArrayList<String>();
+ Collections.addAll(lines, "banks", "fledgling");
+ factory.inform(new LinesMockSolrResourceLoader(lines));
+ String out = tsToString(factory.create(new IterTokenStream(test)));
+ assertEquals(gold.toString().trim(), out);
+ }
+ /** ResourceLoader stub: getLines returns the fixed list for any resource name; the other methods return null. */
+ class LinesMockSolrResourceLoader implements ResourceLoader {
+ List<String> lines;
+
+ LinesMockSolrResourceLoader(List<String> lines) {
+ this.lines = lines;
+ }
+
+ public List<String> getLines(String resource) throws IOException {
+ return lines;
+ }
+
+ public Object newInstance(String cname, String... subpackages) {
+ return null;
+ }
+
+ public InputStream openResource(String resource) throws IOException {
+ return null;
+ }
+ }
+}
+