You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by yo...@apache.org on 2009/07/10 21:40:34 UTC
svn commit: r793090 - in /lucene/solr/trunk: CHANGES.txt
src/java/org/apache/solr/analysis/WordDelimiterFilter.java
src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java
src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
Author: yonik
Date: Fri Jul 10 19:40:33 2009
New Revision: 793090
URL: http://svn.apache.org/viewvc?rev=793090&view=rev
Log:
SOLR-1266: add WordDelimiterFilter.stemEnglishPossessive option
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=793090&r1=793089&r2=793090&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Fri Jul 10 19:40:33 2009
@@ -250,6 +250,11 @@
64. SOLR-1256: Show the output of CharFilters in analysis.jsp. (koji)
+65. SOLR-1266: Added stemEnglishPossessive option (default=true) to WordDelimiterFilter
+ that allows disabling of English possessive stemming (removal of trailing 's from tokens)
+ (Robert Muir via yonik)
+
+
Optimizations
----------------------
1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java?rev=793090&r1=793089&r2=793090&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java Fri Jul 10 19:40:33 2009
@@ -148,6 +148,13 @@
final int splitOnNumerics;
/**
+ * If 1, causes trailing "'s" to be removed for each subword. (Defaults to 1)
+ * <p/>
+ * "O'Neil's" => "O", "Neil"
+ */
+ final int stemEnglishPossessive;
+
+ /**
* If not null is the set of tokens to protect from being delimited
*
*/
@@ -165,9 +172,10 @@
* @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
* @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se"
+ * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
* @param protWords If not null is the set of tokens to protect from being delimited
*/
- public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
+ public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, int stemEnglishPossessive, CharArraySet protWords) {
super(in);
this.generateWordParts = generateWordParts;
this.generateNumberParts = generateNumberParts;
@@ -178,14 +186,27 @@
this.preserveOriginal = preserveOriginal;
this.charTypeTable = charTypeTable;
this.splitOnNumerics = splitOnNumerics;
+ this.stemEnglishPossessive = stemEnglishPossessive;
this.protWords = protWords;
}
+
+ /**
+ * Compatibility constructor for callers that predate the stemEnglishPossessive option.
+ * Forwards every argument unchanged and passes stemEnglishPossessive=1, which keeps
+ * the previous behavior of removing a trailing "'s" from each subword.
+ *
+ * @deprecated Use
+ * {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, int, CharArraySet)}
+ * instead.
+ */
+ @Deprecated
+ public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
+ // Pass the caller's splitOnNumerics and protWords through; only stemEnglishPossessive is defaulted.
+ this(in,charTypeTable,generateWordParts,generateNumberParts,catenateWords,catenateNumbers,catenateAll,splitOnCaseChange,preserveOriginal, splitOnNumerics, 1, protWords);
+ }
/**
* Compatibility constructor
*
* @deprecated Use
- * {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, CharArraySet)}
+ * {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, int, CharArraySet)}
* instead.
*/
@Deprecated
@@ -203,16 +224,27 @@
* @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
* @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se"
+ * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
* @param protWords If not null is the set of tokens to protect from being delimited
*/
+ public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, int stemEnglishPossessive, CharArraySet protWords) {
+ this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, stemEnglishPossessive, protWords);
+ }
+
+ /**
+ * @deprecated Use
+ * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
+ * instead.
+ */
+ @Deprecated
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
- this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, protWords);
+ this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, 1, protWords);
}
/**
* Compatibility constructor
*
* @deprecated Use
- * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
+ * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
* instead.
*/
@Deprecated
@@ -223,7 +255,7 @@
* Compatibility constructor
*
* @deprecated Use
- * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
+ * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
* instead.
*/
@Deprecated
@@ -234,7 +266,7 @@
* Compatibility constructor
*
* @deprecated Use
- * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
+ * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
* instead.
*/
@Deprecated
@@ -388,7 +420,7 @@
// check and remove "'s" from the end of a token.
// the pattern to check for is
// ALPHA "'" ("s"|"S") (SUBWORD_DELIM | END)
- if ((lastType & ALPHA)!=0) {
+ if (stemEnglishPossessive != 0 && ((lastType & ALPHA)!=0)) {
if (ch=='\'' && pos+1< len
&& (termBuffer[pos+1]=='s' || termBuffer[pos+1]=='S'))
{
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java?rev=793090&r1=793089&r2=793090&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java Fri Jul 10 19:40:33 2009
@@ -71,6 +71,7 @@
int splitOnCaseChange=0;
int splitOnNumerics=0;
int preserveOriginal=0;
+ int stemEnglishPossessive=0;
@Override
public void init(Map<String, String> args) {
@@ -83,6 +84,7 @@
splitOnCaseChange = getInt("splitOnCaseChange", 1);
splitOnNumerics = getInt("splitOnNumerics", 1);
preserveOriginal = getInt("preserveOriginal", 0);
+ stemEnglishPossessive = getInt("stemEnglishPossessive", 1);
}
public WordDelimiterFilter create(TokenStream input) {
@@ -90,6 +92,6 @@
generateWordParts, generateNumberParts,
catenateWords, catenateNumbers, catenateAll,
splitOnCaseChange, preserveOriginal,
- splitOnNumerics, protectedWords);
+ splitOnNumerics, stemEnglishPossessive, protectedWords);
}
}
Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java?rev=793090&r1=793089&r2=793090&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java Fri Jul 10 19:40:33 2009
@@ -359,5 +359,34 @@
}
+
+ /*
+ * Feeds a single token containing the whole input string through a WordDelimiterFilter and
+ * asserts that the filter emits exactly the expected terms, in order, and then returns null.
+ *
+ * The filter is built with generateWordParts=1, generateNumberParts=1, catenateWords=0,
+ * catenateNumbers=0, catenateAll=0, splitOnCaseChange=1, preserveOriginal=0, splitOnNumerics=1,
+ * stemEnglishPossessive=stemPossessive and no protected words.
+ */
+ public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
+ WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
+ boolean done=false;
+ @Override
+ public Token next() throws IOException {
+ if (done) return null;
+ done = true;
+ return new Token(input,0,input.length());
+ }
+ }
+ ,1,1,0,0,0,1,0,1,stemPossessive,null
+ );
+
+ for(String expected : output) {
+ Token t = wdf.next();
+ assertEquals(expected, t.term());
+ }
+
+ assertEquals(null, wdf.next());
+ }
+
+ /*
+ * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters.
+ * With the option set to 1, "ra's" is expected to yield only "ra"; with 0, it is expected to yield "ra" and "s".
+ */
+ public void testPossessives() throws Exception {
+ doSplitPossessive(1, "ra's", "ra");
+ doSplitPossessive(0, "ra's", "ra", "s");
+ }
}