You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by yo...@apache.org on 2008/07/03 17:40:14 UTC
svn commit: r673715 - in /lucene/solr/trunk: ./
src/java/org/apache/solr/analysis/ src/test/org/apache/solr/analysis/
src/test/test-files/solr/conf/
Author: yonik
Date: Thu Jul 3 08:40:14 2008
New Revision: 673715
URL: http://svn.apache.org/viewvc?rev=673715&view=rev
Log:
SOLR-14: Add preserveOriginal flag to WordDelimiterFilter
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml
Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=673715&r1=673714&r2=673715&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Thu Jul 3 08:40:14 2008
@@ -302,6 +302,9 @@
55. SOLR-603: Added ability to partially optimize. (gsingers)
56. SOLR-483: Add byte/short sorting support (gsingers)
+
+57. SOLR-14: Add preserveOriginal flag to WordDelimiterFilter
+ (Geoffrey Young, Trey Hyde, Ankur Madnani, yonik)
Changes in runtime behavior
1. SOLR-559: use Lucene updateDocument, deleteDocuments methods. This
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java?rev=673715&r1=673714&r2=673715&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java Thu Jul 3 08:40:14 2008
@@ -134,6 +134,13 @@
final int splitOnCaseChange;
/**
+ * If 1, original words are preserved and added to the subword list (Defaults to 0)
+ * <p/>
+ * "500-42" => "500" "42" "500-42"
+ */
+ final int preserveOriginal;
+
+ /**
*
* @param in Token stream to be filtered.
* @param charTypeTable
@@ -143,8 +150,9 @@
* @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
* @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
+ * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
*/
- public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange) {
+ public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
super(in);
this.generateWordParts = generateWordParts;
this.generateNumberParts = generateNumberParts;
@@ -152,6 +160,7 @@
this.catenateNumbers = catenateNumbers;
this.catenateAll = catenateAll;
this.splitOnCaseChange = splitOnCaseChange;
+ this.preserveOriginal = preserveOriginal;
this.charTypeTable = charTypeTable;
}
/**
@@ -162,19 +171,20 @@
* @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
* @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
+ * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
*/
- public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange) {
- this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange);
+ public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
+ this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal);
}
/** Compatibility constructor */
@Deprecated
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
- this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1);
+ this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0);
}
/** Compatibility constructor */
@Deprecated
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
- this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1);
+ this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0);
}
int charType(int ch) {
@@ -242,11 +252,12 @@
// Would it actually be faster to check for the common form
// of isLetter() isLower()*, and then backtrack if it doesn't match?
- int origPosIncrement;
+ int origPosIncrement = 0;
+ Token t;
while(true) {
// t is either returned, or a new token is made from it, so it should
// be safe to use the next(Token) method.
- Token t = input.next(in);
+ t = input.next(in);
if (t == null) return null;
char [] termBuffer = t.termBuffer();
@@ -254,7 +265,7 @@
int start=0;
if (len ==0) continue;
- origPosIncrement = t.getPositionIncrement();
+ origPosIncrement += t.getPositionIncrement();
// Avoid calling charType more than once for each char (basically
// avoid any backtracking).
@@ -348,15 +359,17 @@
return t;
}
- Token newtok = newTok(t,start,pos);
-
// optimization... if this is the only token,
// return it immediately.
- if (queue.size()==0) {
- newtok.setPositionIncrement(origPosIncrement);
- return newtok;
+ if (queue.size()==0 && preserveOriginal == 0) {
+ // just adjust the text w/o changing the rest
+ // of the original token
+ t.setTermBuffer(termBuffer, start, len-start);
+ return t;
}
+ Token newtok = newTok(t,start,pos);
+
queue.add(newtok);
if ((firstType & ALPHA)!=0) numWords++;
break;
@@ -379,14 +392,20 @@
// If the queue is empty, we should continue by reading
// the next token
if (numtok==0) {
+ // the token might have been all delimiters, in which
+ // case return it if we're meant to preserve it
+ if (preserveOriginal != 0) {
+ return t;
+ }
continue;
}
- // if number of tokens is 1, always return the single tok
+ // if number of tokens is 1, there are no catenations to be done.
if (numtok==1) {
break;
}
+
final int numNumbers = numtok - numWords;
// check conditions under which the current token
@@ -411,16 +430,16 @@
if (numWords==0) {
// all numbers
addCombos(tlist,0,numtok,generateNumberParts!=0,catenateNumbers!=0 || catenateAll!=0, 1);
- if (queue.size() > 0) break; else continue;
+ if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
} else if (numNumbers==0) {
// all words
addCombos(tlist,0,numtok,generateWordParts!=0,catenateWords!=0 || catenateAll!=0, 1);
- if (queue.size() > 0) break; else continue;
+ if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
} else if (generateNumberParts==0 && generateWordParts==0 && catenateNumbers==0 && catenateWords==0) {
// catenate all *only*
// OPT:could be optimized to add to current queue...
addCombos(tlist,0,numtok,false,catenateAll!=0, 1);
- if (queue.size() > 0) break; else continue;
+ if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
}
//
@@ -454,15 +473,24 @@
// NOTE: in certain cases, queue may be empty (for instance, if catenate
// and generate are both set to false). Only exit the loop if the queue
// is not empty.
- if (queue.size() > 0) break;
+ if (queue.size() > 0 || preserveOriginal!=0) break;
}
// System.out.println("##########AFTER COMBINATIONS:"+ str(queue));
- queuePos=1;
- Token tok = queue.get(0);
- tok.setPositionIncrement(origPosIncrement);
- return tok;
+ if (preserveOriginal != 0) {
+ queuePos = 0;
+ if (queue.size() > 0) {
+ // overlap first token with the original
+ queue.get(0).setPositionIncrement(0);
+ }
+ return t; // return the original
+ } else {
+ queuePos=1;
+ Token tok = queue.get(0);
+ tok.setPositionIncrement(origPosIncrement);
+ return tok;
+ }
}
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java?rev=673715&r1=673714&r2=673715&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java Thu Jul 3 08:40:14 2008
@@ -30,6 +30,7 @@
int catenateNumbers=0;
int catenateAll=0;
int splitOnCaseChange=0;
+ int preserveOriginal=0;
@Override
public void init(Map<String, String> args) {
@@ -40,12 +41,13 @@
catenateNumbers = getInt("catenateNumbers", 0);
catenateAll = getInt("catenateAll", 0);
splitOnCaseChange = getInt("splitOnCaseChange", 1);
+ preserveOriginal = getInt("preserveOriginal", 0);
}
public WordDelimiterFilter create(TokenStream input) {
return new WordDelimiterFilter(input,
generateWordParts, generateNumberParts,
catenateWords, catenateNumbers, catenateAll,
- splitOnCaseChange);
+ splitOnCaseChange, preserveOriginal);
}
}
Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java?rev=673715&r1=673714&r2=673715&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java Thu Jul 3 08:40:14 2008
@@ -20,8 +20,10 @@
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
import java.io.IOException;
+import java.io.StringReader;
/**
* New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
@@ -85,6 +87,49 @@
);
}
+
+ public void testPreserveOrignalTrue() {
+
+ assertU(adoc("id", "144",
+ "wdf_preserve", "404-123"));
+ assertU(commit());
+
+ assertQ("preserving original word",
+ req("wdf_preserve:404")
+ ,"//result[@numFound=1]"
+ );
+
+ assertQ("preserving original word",
+ req("wdf_preserve:123")
+ ,"//result[@numFound=1]"
+ );
+
+ assertQ("preserving original word",
+ req("wdf_preserve:404-123*")
+ ,"//result[@numFound=1]"
+ );
+
+ }
+
+ /***
+ public void testPerformance() throws IOException {
+ String s = "now is the time-for all good men to come to-the aid of their country.";
+ Token tok = new Token();
+ long start = System.currentTimeMillis();
+ int ret=0;
+ for (int i=0; i<1000000; i++) {
+ StringReader r = new StringReader(s);
+ TokenStream ts = new WhitespaceTokenizer(r);
+ ts = new WordDelimiterFilter(ts, 1,1,1,1,0);
+
+ while (ts.next(tok) != null) ret++;
+ }
+
+ System.out.println("ret="+ret+" time="+(System.currentTimeMillis()-start));
+ }
+ ***/
+
+
public void testOffsets() throws IOException {
// test that subwords and catenated subwords have
@@ -98,7 +143,7 @@
return t;
}
},
- 1,1,0,0,1,1);
+ 1,1,0,0,1,1,0);
int i=0;
for(Token t; (t=wdf.next())!=null;) {
@@ -131,7 +176,7 @@
return t;
}
},
- 1,1,0,0,1,1);
+ 1,1,0,0,1,1,0);
for(Token t; (t=wdf.next())!=null;) {
assertEquals(5, t.startOffset());
assertEquals(6, t.endOffset());
Modified: lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml?rev=673715&r1=673714&r2=673715&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml (original)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml Thu Jul 3 08:40:14 2008
@@ -86,7 +86,15 @@
<fieldtype name="wdf_nocase" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+ <fieldtype name="wdf_preserve" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
@@ -369,6 +377,7 @@
<field name="lengthfilt" type="lengthfilt" indexed="true" stored="true"/>
<field name="dedup" type="dedup" indexed="true" stored="true"/>
<field name="wdf_nocase" type="wdf_nocase" indexed="true" stored="true"/>
+ <field name="wdf_preserve" type="wdf_preserve" indexed="true" stored="true"/>
<field name="numberpartfail" type="failtype1" indexed="true" stored="true"/>