You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/22 17:49:00 UTC
svn commit: r1234548 - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/contrib/analyzers/ lucene/contrib/analyzers/common/
lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/
lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/ [subject line truncated by the archive; full path appears in the Modified list below]
Author: rmuir
Date: Sun Jan 22 16:48:59 2012
New Revision: 1234548
URL: http://svn.apache.org/viewvc?rev=1234548&view=rev
Log:
SOLR-2891: fix CompoundWordTokenFilter to not create invalid offsets when the length of the text was changed by a previous filter
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/CHANGES.txt
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
Modified: lucene/dev/branches/branch_3x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/CHANGES.txt?rev=1234548&r1=1234547&r2=1234548&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/CHANGES.txt Sun Jan 22 16:48:59 2012
@@ -126,9 +126,10 @@ Bug fixes
* LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
to clones/reopened readers. (Uwe Schindler)
-* LUCENE-3642: Fixed bugs in CharTokenizer, n-gram filters, and smart chinese
- where they would create invalid offsets in some situations, leading to problems
- in highlighting. (Max Beutel via Robert Muir)
+* LUCENE-3642, SOLR-2891: Fixed bugs in CharTokenizer, n-gram filters,
+ compound token filters, and smart chinese where they would create invalid
+ offsets in some situations, leading to problems in highlighting.
+ (Max Beutel, Edwin Steiner via Robert Muir)
* LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
Float.MIN_VALUE when it should be Float.NaN, when there were 0
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java?rev=1234548&r1=1234547&r2=1234548&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java Sun Jan 22 16:48:59 2012
@@ -223,13 +223,22 @@ public abstract class CompoundWordTokenF
/** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
public CompoundToken(int offset, int length) {
- final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
- // TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed
- // chars from the term, offsets may not match correctly (other filters producing tokens
- // may also have this problem):
- this.startOffset = newStart;
- this.endOffset = newStart + length;
+
+ // offsets of the original word
+ int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
+ int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
+
+ if (endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) {
+ // if length by start + end offsets doesn't match the term text then assume
+ // this is a synonym and don't adjust the offsets.
+ this.startOffset = startOff;
+ this.endOffset = endOff;
+ } else {
+ final int newStart = startOff + offset;
+ this.startOffset = newStart;
+ this.endOffset = newStart + length;
+ }
}
}
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java?rev=1234548&r1=1234547&r2=1234548&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java Sun Jan 22 16:48:59 2012
@@ -18,14 +18,20 @@ package org.apache.lucene.analysis.compo
*/
import java.io.IOException;
+import java.io.Reader;
import java.io.StringReader;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.MappingCharFilter;
+import org.apache.lucene.analysis.NormalizeCharMap;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Attribute;
@@ -292,5 +298,35 @@ public class TestCompoundWordTokenFilter
}
}
}
+
+ // SOLR-2891
+ // *CompoundWordTokenFilter blindly adds term length to offset, but this can take things out of bounds
+ // wrt original text if a previous filter increases the length of the word (in this case ü -> ue)
+ // so in this case we behave like WDF, and preserve any modified offsets
+ public void testInvalidOffsets() throws Exception {
+ final String[] dict = { "fall" };
+ final NormalizeCharMap normMap = new NormalizeCharMap();
+ normMap.add("ü", "ue");
+
+ Analyzer analyzer = new ReusableAnalyzerBase() {
+
+ //@Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenFilter filter = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+
+ //@Override
+ protected Reader initReader(Reader reader) {
+ return new MappingCharFilter(normMap, CharReader.get(reader));
+ }
+ };
+
+ assertAnalyzesTo(analyzer, "banküberfall",
+ new String[] { "bankueberfall", "fall" },
+ new int[] { 0, 0 },
+ new int[] { 12, 12 });
+ }
}