You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by yo...@apache.org on 2009/07/10 21:40:34 UTC

svn commit: r793090 - in /lucene/solr/trunk: CHANGES.txt src/java/org/apache/solr/analysis/WordDelimiterFilter.java src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java

Author: yonik
Date: Fri Jul 10 19:40:33 2009
New Revision: 793090

URL: http://svn.apache.org/viewvc?rev=793090&view=rev
Log:
SOLR-1266: add WordDelimiterFilter.stemEnglishPossessive option

Modified:
    lucene/solr/trunk/CHANGES.txt
    lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
    lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java

Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=793090&r1=793089&r2=793090&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Fri Jul 10 19:40:33 2009
@@ -250,6 +250,11 @@
 
 64. SOLR-1256: Show the output of CharFilters in analysis.jsp. (koji)
 
+65. SOLR-1266: Added stemEnglishPossessive option (default=true) to WordDelimiterFilter
+    that allows disabling of english possessive stemming (removal of trailing 's from tokens)
+    (Robert Muir via yonik)
+
+
 Optimizations
 ----------------------
  1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the

Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java?rev=793090&r1=793089&r2=793090&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java Fri Jul 10 19:40:33 2009
@@ -148,6 +148,13 @@
   final int splitOnNumerics;
 
   /**
+   * If 1, causes trailing "'s" to be removed for each subword. (Defaults to 1)
+   * <p/>
+   * "O'Neil's" => "O", "Neil"
+   */
+  final int stemEnglishPossessive;
+  
+  /**
    * If not null is the set of tokens to protect from being delimited
    *
    */
@@ -165,9 +172,10 @@
    * @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
    * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
    * @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se"
+   * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    * @param protWords If not null is the set of tokens to protect from being delimited
    */
-  public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
+  public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, int stemEnglishPossessive, CharArraySet protWords) {
     super(in);
     this.generateWordParts = generateWordParts;
     this.generateNumberParts = generateNumberParts;
@@ -178,14 +186,27 @@
     this.preserveOriginal = preserveOriginal;
     this.charTypeTable = charTypeTable;
     this.splitOnNumerics = splitOnNumerics;
+    this.stemEnglishPossessive = stemEnglishPossessive;
     this.protWords = protWords;
   }
+  
+  /**
+   * Compatibility constructor
+   * 
+   * @deprecated Use
+   *             {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, int, CharArraySet)}
+   *             instead.
+   */
+  @Deprecated
+  public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
+    this(in,charTypeTable,generateWordParts,generateNumberParts,catenateWords,catenateNumbers,catenateAll,splitOnCaseChange,preserveOriginal, 1, 1, null);
+  }
 
   /**
    * Compatibility constructor
    * 
    * @deprecated Use
-   *             {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, CharArraySet)}
+   *             {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, int, CharArraySet)}
    *             instead.
    */
   @Deprecated
@@ -203,16 +224,27 @@
    * @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
    * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
    * @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se"
+   * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    * @param protWords If not null is the set of tokens to protect from being delimited
    */
+  public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, int stemEnglishPossessive, CharArraySet protWords) {
+    this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, stemEnglishPossessive, protWords);
+  }
+  
+  /**
+   * @deprecated Use
+   *             {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
+   *             instead.
+   */
+  @Deprecated
   public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
-    this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, protWords);
+    this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, 1, protWords);
   }
 
   /**   * Compatibility constructor
    * 
    * @deprecated Use
-   *             {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
+   *             {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
    *             instead.
    */
   @Deprecated
@@ -223,7 +255,7 @@
    * Compatibility constructor
    * 
    * @deprecated Use
-   *             {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
+   *             {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
    *             instead.
    */
   @Deprecated
@@ -234,7 +266,7 @@
    * Compatibility constructor
    * 
    * @deprecated Use
-   *             {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
+   *             {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
    *             instead.
    */
   @Deprecated
@@ -388,7 +420,7 @@
             // check and remove "'s" from the end of a token.
             // the pattern to check for is
             //   ALPHA "'" ("s"|"S") (SUBWORD_DELIM | END)
-            if ((lastType & ALPHA)!=0) {
+            if (stemEnglishPossessive != 0 && ((lastType & ALPHA)!=0)) {
               if (ch=='\'' && pos+1< len
                       && (termBuffer[pos+1]=='s' || termBuffer[pos+1]=='S'))
               {

Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java?rev=793090&r1=793089&r2=793090&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java Fri Jul 10 19:40:33 2009
@@ -71,6 +71,7 @@
   int splitOnCaseChange=0;
   int splitOnNumerics=0;
   int preserveOriginal=0;
+  int stemEnglishPossessive=0;
 
   @Override
   public void init(Map<String, String> args) {
@@ -83,6 +84,7 @@
     splitOnCaseChange = getInt("splitOnCaseChange", 1);
     splitOnNumerics = getInt("splitOnNumerics", 1);
     preserveOriginal = getInt("preserveOriginal", 0);
+    stemEnglishPossessive = getInt("stemEnglishPossessive", 1);
   }
 
   public WordDelimiterFilter create(TokenStream input) {
@@ -90,6 +92,6 @@
                                    generateWordParts, generateNumberParts,
                                    catenateWords, catenateNumbers, catenateAll,
                                    splitOnCaseChange, preserveOriginal,
-                                   splitOnNumerics, protectedWords);
+                                   splitOnNumerics, stemEnglishPossessive, protectedWords);
   }
 }

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java?rev=793090&r1=793089&r2=793090&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java Fri Jul 10 19:40:33 2009
@@ -359,5 +359,34 @@
 
 
   }
+  
+  public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
+    WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
+      boolean done=false;
+      @Override
+      public Token next() throws IOException {
+        if (done) return null;
+        done = true;
+        return new Token(input,0,input.length());
+      }
+    }
+            ,1,1,0,0,0,1,0,1,stemPossessive,null
+    );
+
+    for(String expected : output) {
+      Token t = wdf.next();
+      assertEquals(expected, t.term());
+    }
+
+    assertEquals(null, wdf.next());
+  }
+  
+  /*
+   * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters. 
+   */
+  public void testPossessives() throws Exception {
+    doSplitPossessive(1, "ra's", "ra");
+    doSplitPossessive(0, "ra's", "ra", "s");
+  }
 
 }