You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by ho...@apache.org on 2008/12/18 09:50:27 UTC
svn commit: r727677 - in /lucene/solr/trunk: ./
src/java/org/apache/solr/analysis/ src/test/org/apache/solr/analysis/
src/test/test-files/solr/conf/
Author: hossman
Date: Thu Dec 18 00:50:27 2008
New Revision: 727677
URL: http://svn.apache.org/viewvc?rev=727677&view=rev
Log:
SOLR-876: WordDelimiterFilter splitOnNumerics and protwords
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
lucene/solr/trunk/src/test/test-files/solr/conf/protwords.txt
lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml
Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=727677&r1=727676&r2=727677&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Thu Dec 18 00:50:27 2008
@@ -118,6 +118,9 @@
optimized distributed faceting refinement by lowering parsing overhead and
by making requests and responses smaller.
+15. SOLR-876: WordDelimiterFilter now supports a splitOnNumerics
+ option, as well as a list of protected terms.
+ (Dan Rosher via hossman)
Optimizations
----------------------
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java?rev=727677&r1=727676&r2=727677&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java Thu Dec 18 00:50:27 2008
@@ -20,6 +20,7 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.CharArraySet;
import java.io.IOException;
import java.util.ArrayList;
@@ -141,6 +142,18 @@
final int preserveOriginal;
/**
+ * If 0, causes numeric changes to be ignored (subwords will only be generated
+ * given SUBWORD_DELIM tokens). (Defaults to 1)
+ */
+ final int splitOnNumerics;
+
+ /**
+ * If not null, this is the set of tokens to protect from being delimited
+ *
+ */
+ final CharArraySet protWords;
+
+ /**
*
* @param in Token stream to be filtered.
* @param charTypeTable
@@ -151,8 +164,10 @@
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
* @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
+ * @param splitOnNumerics If 1, causes "j2se" to be three tokens: "j" "2" "se"
+ * @param protWords If not null, the set of tokens to protect from being delimited
*/
- public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
+ public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
super(in);
this.generateWordParts = generateWordParts;
this.generateNumberParts = generateNumberParts;
@@ -162,7 +177,22 @@
this.splitOnCaseChange = splitOnCaseChange;
this.preserveOriginal = preserveOriginal;
this.charTypeTable = charTypeTable;
+ this.splitOnNumerics = splitOnNumerics;
+ this.protWords = protWords;
}
+
+ /**
+ * Compatibility constructor
+ *
+ * @deprecated Use
+ * {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, CharArraySet)}
+ * instead.
+ */
+ @Deprecated
+ public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
+ this(in,charTypeTable,generateWordParts,generateNumberParts,catenateWords,catenateNumbers,catenateAll,splitOnCaseChange,preserveOriginal, 1, null);
+ }
+
/**
* @param in Token stream to be filtered.
* @param generateWordParts If 1, causes parts of words to be generated: "PowerShot", "Power-Shot" => "Power" "Shot"
@@ -172,7 +202,20 @@
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
* @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
+ * @param splitOnNumerics If 1, causes "j2se" to be three tokens: "j" "2" "se"
+ * @param protWords If not null, the set of tokens to protect from being delimited
+ */
+ public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
+ this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, protWords);
+ }
+
+ /** * Compatibility constructor
+ *
+ * @deprecated Use
+ * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
+ * instead.
*/
+ @Deprecated
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal);
}
@@ -180,23 +223,23 @@
* Compatibility constructor
*
* @deprecated Use
- * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int)}
+ * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
* instead.
*/
@Deprecated
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
- this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0);
+ this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
}
/**
* Compatibility constructor
*
* @deprecated Use
- * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int)}
+ * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
* instead.
*/
@Deprecated
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
- this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0);
+ this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
}
int charType(int ch) {
@@ -273,6 +316,11 @@
int start=0;
if (len ==0) continue;
+ //skip protected tokens
+ if (protWords != null && protWords.contains(termBuffer, 0, len)) {
+ return t;
+ }
+
origPosIncrement += t.getPositionIncrement();
// Avoid calling charType more than once for each char (basically
@@ -344,6 +392,9 @@
} else if ((lastType & UPPER)!=0 && (type & LOWER)!=0) {
// UPPER->LOWER: Don't split
+ } else if(splitOnNumerics == 0 &&
+ ( ((lastType & ALPHA) != 0 && (type & DIGIT) != 0) || ((lastType & DIGIT) != 0 && (type & ALPHA) != 0) ) ) {
+ // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
} else {
// NOTE: this code currently assumes that only one flag
// is set for each character now, so we don't have
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java?rev=727677&r1=727676&r2=727677&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java Thu Dec 18 00:50:27 2008
@@ -17,19 +17,59 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.CharArraySet;
+
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
+
import java.util.Map;
+import java.io.File;
+import java.util.List;
+import java.io.IOException;
+
/**
* @version $Id$
*/
-public class WordDelimiterFilterFactory extends BaseTokenFilterFactory {
+public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+ public static final String PROTECTED_TOKENS = "protected";
+
+ public void inform(ResourceLoader loader) {
+ String wordFiles = args.get(PROTECTED_TOKENS);
+ if (wordFiles != null) {
+ try {
+ File protectedWordFiles = new File(wordFiles);
+ if (protectedWordFiles.exists()) {
+ List<String> wlist = loader.getLines(wordFiles);
+ //This cast is safe in Lucene
+ protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
+ } else {
+ List<String> files = StrUtils.splitFileNames(wordFiles);
+ for (String file : files) {
+ List<String> wlist = loader.getLines(file.trim());
+ if (protectedWords == null)
+ protectedWords = new CharArraySet(wlist, false);
+ else
+ protectedWords.addAll(wlist);
+ }
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ private CharArraySet protectedWords = null;
+
int generateWordParts=0;
int generateNumberParts=0;
int catenateWords=0;
int catenateNumbers=0;
int catenateAll=0;
int splitOnCaseChange=0;
+ int splitOnNumerics=0;
int preserveOriginal=0;
@Override
@@ -41,6 +81,7 @@
catenateNumbers = getInt("catenateNumbers", 0);
catenateAll = getInt("catenateAll", 0);
splitOnCaseChange = getInt("splitOnCaseChange", 1);
+ splitOnNumerics = getInt("splitOnNumerics", 1);
preserveOriginal = getInt("preserveOriginal", 0);
}
@@ -48,6 +89,7 @@
return new WordDelimiterFilter(input,
generateWordParts, generateNumberParts,
catenateWords, catenateNumbers, catenateAll,
- splitOnCaseChange, preserveOriginal);
+ splitOnCaseChange, preserveOriginal,
+ splitOnNumerics, protectedWords);
}
}
Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java?rev=727677&r1=727676&r2=727677&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java Thu Dec 18 00:50:27 2008
@@ -278,4 +278,53 @@
assertEquals(12, t.startOffset());
assertEquals(15, t.endOffset());
}
+
+ public void testAlphaNumericWords(){
+ assertU(adoc("id", "68","numericsubword","Java/J2SE"));
+ assertU(commit());
+
+ assertQ("j2se found",
+ req("numericsubword:(J2SE)")
+ ,"//result[@numFound=1]"
+ );
+ assertQ("no j2 or se",
+ req("numericsubword:(J2 OR SE)")
+ ,"//result[@numFound=0]"
+ );
+ }
+
+ public void testProtectedWords(){
+ assertU(adoc("id", "70","protectedsubword","c# c++ .net Java/J2SE"));
+ assertU(commit());
+
+ assertQ("java found",
+ req("protectedsubword:(java)")
+ ,"//result[@numFound=1]"
+ );
+
+ assertQ(".net found",
+ req("protectedsubword:(.net)")
+ ,"//result[@numFound=1]"
+ );
+
+ assertQ("c# found",
+ req("protectedsubword:(c#)")
+ ,"//result[@numFound=1]"
+ );
+
+ assertQ("c++ found",
+ req("protectedsubword:(c++)")
+ ,"//result[@numFound=1]"
+ );
+
+ assertQ("c found?",
+ req("protectedsubword:c")
+ ,"//result[@numFound=0]"
+ );
+ assertQ("net found?",
+ req("protectedsubword:net")
+ ,"//result[@numFound=0]"
+ );
+ }
+
}
Modified: lucene/solr/trunk/src/test/test-files/solr/conf/protwords.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/protwords.txt?rev=727677&r1=727676&r2=727677&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/protwords.txt (original)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/protwords.txt Thu Dec 18 00:50:27 2008
@@ -18,3 +18,6 @@
#to test, we will use words that would normally obviously be stemmed.
cats
ridding
+c#
+c++
+.net
Modified: lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml?rev=727677&r1=727676&r2=727677&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml (original)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml Thu Dec 18 00:50:27 2008
@@ -252,6 +252,36 @@
</analyzer>
</fieldtype>
+ <fieldtype name="numericsubword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="0" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
+ <filter class="solr.StopFilterFactory"/>
+ <filter class="solr.EnglishPorterFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.StopFilterFactory"/>
+ <filter class="solr.EnglishPorterFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+ <fieldtype name="protectedsubword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+
<!-- more flexible in matching skus, but more chance of a false match -->
<fieldtype name="skutype1" class="solr.TextField">
<analyzer type="index">
@@ -387,6 +417,9 @@
<field name="nullfirst" type="string" indexed="true" stored="true" sortMissingFirst="true"/>
<field name="subword" type="subword" indexed="true" stored="true"/>
+ <field name="numericsubword" type="numericsubword" indexed="true" stored="true"/>
+ <field name="protectedsubword" type="protectedsubword" indexed="true" stored="true"/>
+
<field name="sku1" type="skutype1" indexed="true" stored="true"/>
<field name="sku2" type="skutype2" indexed="true" stored="true"/>