You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by eh...@apache.org on 2009/02/20 04:24:03 UTC
svn commit: r746122 - in /lucene/solr/trunk: example/solr/conf/ src/java/org/apache/solr/analysis/ src/test/org/apache/solr/analysis/

Author: ehatcher
Date: Fri Feb 20 03:23:58 2009
New Revision: 746122

URL: http://svn.apache.org/viewvc?rev=746122&view=rev
Log:
SOLR-1026: Add protected words support to SnowballPorterFilterFactory.  Deprecated EnglishPorterFilterFactory and switched example to use English SnowballPorterFilterFactory instead

Added:
    lucene/solr/trunk/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java
Modified:
    lucene/solr/trunk/example/solr/conf/schema.xml
    lucene/solr/trunk/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java
    lucene/solr/trunk/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java

Modified: lucene/solr/trunk/example/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/example/solr/conf/schema.xml?rev=746122&r1=746121&r2=746122&view=diff
==============================================================================
--- lucene/solr/trunk/example/solr/conf/schema.xml (original)
+++ lucene/solr/trunk/example/solr/conf/schema.xml Fri Feb 20 03:23:58 2009
@@ -178,7 +178,7 @@
                 />
         <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
         <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
         <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
       </analyzer>
       <analyzer type="query">
@@ -191,7 +191,7 @@
                 />
         <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
         <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
         <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
       </analyzer>
     </fieldType>
@@ -206,7 +206,7 @@
         <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
         <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
         <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
         <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
       </analyzer>
     </fieldType>

Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java?rev=746122&r1=746121&r2=746122&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java Fri Feb 20 03:23:58 2009
@@ -31,6 +31,8 @@
 
 /**
  * @version $Id$
+ *
+ * @deprecated Use SnowballPorterFilterFactory with language="English" instead
  */
 public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
   public static final String PROTECTED_TOKENS = "protected";

Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java?rev=746122&r1=746121&r2=746122&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java Fri Feb 20 03:23:58 2009
@@ -17,9 +17,18 @@
 package org.apache.solr.analysis;
 
 import java.util.Map;
+import java.util.List;
+import java.io.File;
+import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
 import org.tartarus.snowball.SnowballProgram;
 
 /**
@@ -30,10 +39,40 @@
  * 
  * @version $Id$
  */
-public class SnowballPorterFilterFactory extends BaseTokenFilterFactory {
+public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+  public static final String PROTECTED_TOKENS = "protected";
+
   private String language = "English";
   private Class stemClass;
 
+
+  public void inform(ResourceLoader loader) {
+    String wordFiles = args.get(PROTECTED_TOKENS);
+    if (wordFiles != null) {
+      try {
+        File protectedWordFiles = new File(wordFiles);
+        if (protectedWordFiles.exists()) {
+          List<String> wlist = loader.getLines(wordFiles);
+          // "protected" names a single existing file: load its lines as one resource
+          protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
+        } else  {
+          List<String> files = StrUtils.splitFileNames(wordFiles);
+          for (String file : files) {
+            List<String> wlist = loader.getLines(file.trim());
+            if (protectedWords == null)
+              protectedWords = new CharArraySet(wlist, false);
+            else
+              protectedWords.addAll(wlist);
+          }
+        }
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+  }
+
+  private CharArraySet protectedWords = null;
+
   @Override
   public void init(Map<String, String> args) {
     super.init(args);
@@ -47,14 +86,61 @@
     }
   }
   
-  public SnowballFilter create(TokenStream input) {
+  public SnowballPorterFilter create(TokenStream input) {
     SnowballProgram program;
     try {
       program = (SnowballProgram)stemClass.newInstance();
     } catch (Exception e) {
       throw new RuntimeException("Error instantiating stemmer for language " + language + "from class " +stemClass, e);
     }
-    return new SnowballFilter(input, program);
+    return new SnowballPorterFilter(input, program, protectedWords);
+  }
+}
+
+class SnowballPorterFilter extends TokenFilter {
+  private final CharArraySet protWords;
+  private SnowballProgram stemmer;
+
+  public SnowballPorterFilter(TokenStream source, SnowballProgram stemmer, CharArraySet protWords) {
+    super(source);
+    this.protWords = protWords;
+    this.stemmer = stemmer;
+  }
+
+
+  /**
+   * the original code from lucene sandbox
+   * public final Token next() throws IOException {
+   * Token token = input.next();
+   * if (token == null)
+   * return null;
+   * stemmer.setCurrent(token.termText());
+   * try {
+   * stemMethod.invoke(stemmer, EMPTY_ARGS);
+   * } catch (Exception e) {
+   * throw new RuntimeException(e.toString());
+   * }
+   * return new Token(stemmer.getCurrent(),
+   * token.startOffset(), token.endOffset(), token.type());
+   * }
+   */
+
+  @Override
+  public Token next(Token token) throws IOException {
+    Token result = input.next(token);
+    if (result != null) {
+      char[] termBuffer = result.termBuffer();
+      int len = result.termLength();
+      // if protected, don't stem.  use this to avoid stemming collisions.
+      if (protWords != null && protWords.contains(termBuffer, 0, len)) {
+        return result;
+      }
+      stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
+      stemmer.stem();
+      String newstr = stemmer.getCurrent();
+      result.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
+    }
+    return result;
   }
 }
 

Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java?rev=746122&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java Fri Feb 20 03:23:58 2009
@@ -0,0 +1,97 @@
+package org.apache.solr.analysis;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.solr.common.ResourceLoader;
+import org.tartarus.snowball.ext.EnglishStemmer;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.ArrayList;
+import java.util.Collections;
+
+public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
+
+  public void test() throws IOException {
+    EnglishStemmer stemmer = new EnglishStemmer();
+    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
+    StringBuilder gold = new StringBuilder();
+    for (String aTest : test) {
+      stemmer.setCurrent(aTest);
+      stemmer.stem();
+      gold.append(stemmer.getCurrent()).append(' ');
+    }
+
+    SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+    args.put("language", "English");
+
+    factory.init(args);
+    factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
+    String out = tsToString(factory.create(new IterTokenStream(test)));
+    assertEquals(gold.toString().trim(), out);
+  }
+
+  public void testProtected() throws Exception {
+    EnglishStemmer stemmer = new EnglishStemmer();
+    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
+    StringBuilder gold = new StringBuilder();
+    for (int i = 0; i < test.length; i++) {
+      if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
+        stemmer.setCurrent(test[i]);
+        stemmer.stem();
+        gold.append(stemmer.getCurrent()).append(' ');
+      } else {
+        gold.append(test[i]).append(' ');
+      }
+    }
+    // Exercise SnowballPorterFilterFactory itself (not the deprecated EnglishPorterFilterFactory); "language" is unset, presumably leaving the "English" field default - confirm in init().
+    SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+    args.put(SnowballPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
+    factory.init(args);
+    List<String> lines = new ArrayList<String>();
+    Collections.addAll(lines, "banks", "fledgling");
+    factory.inform(new LinesMockSolrResourceLoader(lines));
+    String out = tsToString(factory.create(new IterTokenStream(test)));
+    assertEquals(gold.toString().trim(), out);
+  }
+
+  class LinesMockSolrResourceLoader implements ResourceLoader {
+    List<String> lines;
+
+    LinesMockSolrResourceLoader(List<String> lines) {
+      this.lines = lines;
+    }
+
+    public List<String> getLines(String resource) throws IOException {
+      return lines;
+    }
+
+    public Object newInstance(String cname, String... subpackages) {
+      return null;
+    }
+
+    public InputStream openResource(String resource) throws IOException {
+      return null;
+    }
+  }
+}
+