You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by yo...@apache.org on 2009/08/26 15:32:46 UTC
svn commit: r808005 - in /lucene/solr/trunk: CHANGES.txt
src/java/org/apache/solr/analysis/WordDelimiterFilter.java
src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
Author: yonik
Date: Wed Aug 26 13:32:46 2009
New Revision: 808005
URL: http://svn.apache.org/viewvc?rev=808005&view=rev
Log:
SOLR-1362: fix WDF position increment bugs
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=808005&r1=808004&r2=808005&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Wed Aug 26 13:32:46 2009
@@ -509,6 +509,11 @@
already been closed/destroyed; if it hasn't, a SEVERE error is
logged first. (noble, hossman)
+60. SOLR-1362: WordDelimiterFilter had inconsistent behavior when setting
+ the position increment of tokens following a token consisting of all
+ delimiters, and could additionally lose big position increments.
+ (Robert Muir, yonik)
+
Other Changes
----------------------
1. Upgraded to Lucene 2.4.0 (yonik)
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java?rev=808005&r1=808004&r2=808005&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java Wed Aug 26 13:32:46 2009
@@ -384,13 +384,15 @@
int start=0;
if (len ==0) continue;
+ int posInc = t.getPositionIncrement();
+ origPosIncrement += posInc;
+
//skip protected tokens
if (protWords != null && protWords.contains(termBuffer, 0, len)) {
+ t.setPositionIncrement(origPosIncrement);
return t;
}
- origPosIncrement += t.getPositionIncrement();
-
// Avoid calling charType more than once for each char (basically
// avoid any backtracking).
// makes code slightly more difficult, but faster.
@@ -482,6 +484,7 @@
if (start==0) {
// the subword is the whole original token, so
// return it unchanged.
+ t.setPositionIncrement(origPosIncrement);
return t;
}
@@ -492,6 +495,7 @@
// of the original token
t.setTermBuffer(termBuffer, start, len-start);
t.setStartOffset(t.startOffset() + start);
+ t.setPositionIncrement(origPosIncrement);
return t;
}
@@ -524,6 +528,9 @@
if (preserveOriginal != 0) {
return t;
}
+
+ // if this token had a "normal" gap of 1, remove it.
+ if (posInc==1) origPosIncrement-=1;
continue;
}
Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java?rev=808005&r1=808004&r2=808005&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java Wed Aug 26 13:32:46 2009
@@ -18,12 +18,21 @@
package org.apache.solr.analysis;
import org.apache.solr.util.AbstractSolrTestCase;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
+import java.io.Reader;
import java.io.StringReader;
+import java.util.Arrays;
+import java.util.HashSet;
/**
* New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
@@ -388,5 +397,113 @@
doSplitPossessive(1, "ra's", "ra");
doSplitPossessive(0, "ra's", "ra", "s");
}
+
+ /*
+ * Set a large position increment gap of 10 if the token is "largegap" or "/"
+ */
+ private final class LargePosIncTokenFilter extends TokenFilter {
+ private TermAttribute termAtt;
+ private PositionIncrementAttribute posIncAtt;
+
+ protected LargePosIncTokenFilter(TokenStream input) {
+ super(input);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (termAtt.term().equals("largegap") || termAtt.term().equals("/"))
+ posIncAtt.setPositionIncrement(10);
+ return true;
+ } else {
+ return false;
+ }
+ }
+ }
+
+ public void testPositionIncrements() throws Exception {
+ final CharArraySet protWords = new CharArraySet(new HashSet<String>(Arrays.asList("NUTCH")), false);
+
+ /* analyzer that uses whitespace + wdf */
+ Analyzer a = new Analyzer() {
+ public TokenStream tokenStream(String field, Reader reader) {
+ return new WordDelimiterFilter(
+ new WhitespaceTokenizer(reader),
+ 1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
+ }
+ };
+ /* in this case, works as expected. */
+ assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
+ new int[] { 0, 9 },
+ new int[] { 6, 13 },
+ new int[] { 1, 1 });
+
+ /* only in this case, posInc of 2 ?! */
+ assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
+ new int[] { 0, 9, 12, 9 },
+ new int[] { 6, 12, 13, 13 },
+ new int[] { 1, 1, 1, 0 });
+
+ assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
+ new int[] { 0, 9, 15 },
+ new int[] { 6, 14, 19 },
+ new int[] { 1, 1, 1 });
+
+ /* analyzer that will consume tokens with large position increments */
+ Analyzer a2 = new Analyzer() {
+ public TokenStream tokenStream(String field, Reader reader) {
+ return new WordDelimiterFilter(
+ new LargePosIncTokenFilter(
+ new WhitespaceTokenizer(reader)),
+ 1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
+ }
+ };
+
+ /* increment of "largegap" is preserved */
+ assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
+ new int[] { 0, 7, 16 },
+ new int[] { 6, 15, 20 },
+ new int[] { 1, 10, 1 });
+
+ /* the "/" had a position increment of 10, where did it go?!?!! */
+ assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
+ new int[] { 0, 9 },
+ new int[] { 6, 13 },
+ new int[] { 1, 11 });
+
+ /* in this case, the increment of 10 from the "/" is carried over */
+ assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
+ new int[] { 0, 9, 12, 9 },
+ new int[] { 6, 12, 13, 13 },
+ new int[] { 1, 11, 1, 0 });
+
+ assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
+ new int[] { 0, 9, 15 },
+ new int[] { 6, 14, 19 },
+ new int[] { 1, 11, 1 });
+ }
+
+ private void assertAnalyzesTo(Analyzer a, String input, String[] output,
+ int startOffsets[], int endOffsets[], int posIncs[]) throws Exception {
+
+ TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+ TermAttribute termAtt = (TermAttribute) ts
+ .getAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) ts
+ .getAttribute(OffsetAttribute.class);
+ PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts
+ .getAttribute(PositionIncrementAttribute.class);
+ for (int i = 0; i < output.length; i++) {
+ assertTrue(ts.incrementToken());
+ assertEquals(output[i], termAtt.term());
+ assertEquals(startOffsets[i], offsetAtt.startOffset());
+ assertEquals(endOffsets[i], offsetAtt.endOffset());
+ assertEquals(posIncs[i], posIncAtt.getPositionIncrement());
+ }
+ assertFalse(ts.incrementToken());
+ ts.close();
+ }
}