You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/23 01:08:52 UTC
svn commit: r1234652 - in /lucene/dev/trunk: lucene/
lucene/src/test-framework/java/org/apache/lucene/analysis/
lucene/src/test/org/apache/lucene/analysis/
modules/analysis/common/src/java/org/apache/lucene/analysis/th/
modules/analysis/common/src/test...
Author: rmuir
Date: Mon Jan 23 00:08:52 2012
New Revision: 1234652
URL: http://svn.apache.org/viewvc?rev=1234652&view=rev
Log:
LUCENE-3717: add better offsets testing to BaseTokenStreamTestCase, fix offsets bugs in ThaiWordFilter and ICUTokenizer
Added:
lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/MockCharFilter.java (with props)
lucene/dev/trunk/lucene/src/test/org/apache/lucene/analysis/TestMockCharFilter.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java
lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1234652&r1=1234651&r2=1234652&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Mon Jan 23 00:08:52 2012
@@ -814,10 +814,10 @@ Bug fixes
* LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
to clones/reopened readers. (Uwe Schindler)
-* LUCENE-3642, SOLR-2891: Fixed bugs in CharTokenizer, n-gram filters,
- compound token filters, and smart chinese where they would create invalid
- offsets in some situations, leading to problems in highlighting.
- (Max Beutel, Edwin Steiner via Robert Muir)
+* LUCENE-3642, SOLR-2891, LUCENE-3717: Fixed bugs in CharTokenizer, n-gram filters,
+ compound token filters, Thai word filter, ICUTokenizer, and Smart Chinese
+ where they would create invalid offsets in some situations, leading to problems
+ in highlighting. (Max Beutel, Edwin Steiner via Robert Muir)
* LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
Float.MIN_VALUE when it should be Float.NaN, when there were 0
Modified: lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1234652&r1=1234651&r2=1234652&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Mon Jan 23 00:08:52 2012
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
+import java.io.Reader;
import java.io.StringReader;
import java.io.IOException;
import java.util.ArrayList;
@@ -289,8 +290,12 @@ public abstract class BaseTokenStreamTes
}
}
};
-
+
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
+ checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean());
+ }
+
+ public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter) throws IOException {
for (int i = 0; i < iterations; i++) {
String text;
switch(_TestUtil.nextInt(random, 0, 4)) {
@@ -311,7 +316,9 @@ public abstract class BaseTokenStreamTes
System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
}
- TokenStream ts = a.tokenStream("dummy", new StringReader(text));
+ int remainder = random.nextInt(10);
+ Reader reader = new StringReader(text);
+ TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
@@ -339,30 +346,38 @@ public abstract class BaseTokenStreamTes
if (VERBOSE) {
System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
}
+ reader = new StringReader(text);
+ ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
- assertAnalyzesToReuse(a, text,
+ assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
- toIntArray(positions));
+ toIntArray(positions),
+ text.length());
} else if (posIncAtt != null && offsetAtt != null) {
// offset + pos
- assertAnalyzesToReuse(a, text,
+ assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
- toIntArray(positions));
+ null,
+ toIntArray(positions),
+ text.length());
} else if (offsetAtt != null) {
// offset
- assertAnalyzesToReuse(a, text,
+ assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
- toIntArray(endOffsets));
+ toIntArray(endOffsets),
+ null,
+ null,
+ text.length());
} else {
// terms only
- assertAnalyzesToReuse(a, text,
+ assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]));
}
}
Added: lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/MockCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/MockCharFilter.java?rev=1234652&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/MockCharFilter.java (added)
+++ lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/MockCharFilter.java Mon Jan 23 00:08:52 2012
@@ -0,0 +1,100 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+// the purpose of this charfilter is to send offsets out of bounds
+// if the analyzer doesn't use correctOffset or does incorrect offset math.
+class MockCharFilter extends CharStream {
+ final Reader in;
+ final int remainder;
+
+ // for testing only
+ public MockCharFilter(Reader in, int remainder) {
+ this.in = in;
+ this.remainder = remainder;
+ assert remainder >= 0 && remainder < 10 : "invalid parameter";
+ }
+
+ @Override
+ public void close() throws IOException {
+ in.close();
+ }
+
+ int currentOffset = -1;
+ int delta = 0;
+ int bufferedCh = -1;
+
+ @Override
+ public int read() throws IOException {
+ // we have a buffered character, add an offset correction and return it
+ if (bufferedCh >= 0) {
+ int ch = bufferedCh;
+ bufferedCh = -1;
+ currentOffset++;
+
+ addOffCorrectMap(currentOffset+delta, delta-1);
+ delta--;
+ return ch;
+ }
+
+ // otherwise actually read one
+ int ch = in.read();
+ if (ch < 0)
+ return ch;
+
+ currentOffset++;
+ if ((ch % 10) != remainder || Character.isHighSurrogate((char)ch) || Character.isLowSurrogate((char)ch)) {
+ return ch;
+ }
+
+ // we will double this character, so buffer it.
+ bufferedCh = ch;
+ return ch;
+ }
+
+ @Override
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ int numRead = 0;
+ for (int i = off; i < off + len; i++) {
+ int c = read();
+ if (c == -1) break;
+ cbuf[i] = (char) c;
+ numRead++;
+ }
+ return numRead == 0 ? -1 : numRead;
+ }
+
+ @Override
+ public int correctOffset(int currentOff) {
+ SortedMap<Integer,Integer> subMap = corrections.subMap(0, currentOff+1);
+ int ret = subMap.isEmpty() ? currentOff : currentOff + subMap.get(subMap.lastKey());
+ assert ret >= 0 : "currentOff=" + currentOff + ",diff=" + (ret-currentOff);
+ return ret;
+ }
+
+ protected void addOffCorrectMap(int off, int cumulativeDiff) {
+ corrections.put(off, cumulativeDiff);
+ }
+
+ TreeMap<Integer,Integer> corrections = new TreeMap<Integer,Integer>();
+}
Added: lucene/dev/trunk/lucene/src/test/org/apache/lucene/analysis/TestMockCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/analysis/TestMockCharFilter.java?rev=1234652&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/analysis/TestMockCharFilter.java (added)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/analysis/TestMockCharFilter.java Mon Jan 23 00:08:52 2012
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestMockCharFilter extends BaseTokenStreamTestCase {
+
+ public void test() throws IOException {
+ Analyzer analyzer = new Analyzer() {
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+
+ @Override
+ protected Reader initReader(Reader reader) {
+ return new MockCharFilter(CharReader.get(reader), 7);
+ }
+ };
+
+ assertAnalyzesTo(analyzer, "ab",
+ new String[] { "aab" },
+ new int[] { 0 },
+ new int[] { 2 }
+ );
+
+ assertAnalyzesTo(analyzer, "aba",
+ new String[] { "aabaa" },
+ new int[] { 0 },
+ new int[] { 3 }
+ );
+
+ assertAnalyzesTo(analyzer, "abcdefga",
+ new String[] { "aabcdefgaa" },
+ new int[] { 0 },
+ new int[] { 8 }
+ );
+ }
+}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java?rev=1234652&r1=1234651&r2=1234652&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java Mon Jan 23 00:08:52 2012
@@ -68,6 +68,7 @@ public final class ThaiWordFilter extend
private CharTermAttribute clonedTermAtt = null;
private OffsetAttribute clonedOffsetAtt = null;
private boolean hasMoreTokensInClone = false;
+ private boolean hasIllegalOffsets = false; // only if the length changed before this filter
/** Creates a new ThaiWordFilter with the specified match version. */
public ThaiWordFilter(Version matchVersion, TokenStream input) {
@@ -86,7 +87,11 @@ public final class ThaiWordFilter extend
if (end != BreakIterator.DONE) {
clonedToken.copyTo(this);
termAtt.copyBuffer(clonedTermAtt.buffer(), start, end - start);
- offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
+ if (hasIllegalOffsets) {
+ offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
+ } else {
+ offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
+ }
if (handlePosIncr) posAtt.setPositionIncrement(1);
return true;
}
@@ -102,6 +107,10 @@ public final class ThaiWordFilter extend
}
hasMoreTokensInClone = true;
+
+ // if the length implied by the start and end offsets doesn't match the term
+ // length, then assume this is a synonym and don't adjust the offsets.
+ hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();
// we lazy init the cloned token, as in ctor not all attributes may be added
if (clonedToken == null) {
@@ -118,7 +127,11 @@ public final class ThaiWordFilter extend
int end = breaker.next();
if (end != BreakIterator.DONE) {
termAtt.setLength(end);
- offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
+ if (hasIllegalOffsets) {
+ offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
+ } else {
+ offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
+ }
// position increment keeps as it is for first token
return true;
}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java?rev=1234652&r1=1234651&r2=1234652&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java Mon Jan 23 00:08:52 2012
@@ -503,7 +503,7 @@ public class HTMLStripCharFilterTest ext
@Override
protected Reader initReader(Reader reader) {
- return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
+ return new HTMLStripCharFilter(CharReader.get(reader));
}
};
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java?rev=1234652&r1=1234651&r2=1234652&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java Mon Jan 23 00:08:52 2012
@@ -160,7 +160,7 @@ public class TestSegmentingTokenizerBase
hasSentence = false;
clearAttributes();
termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart);
- offsetAtt.setOffset(offset+sentenceStart, offset+sentenceEnd);
+ offsetAtt.setOffset(correctOffset(offset+sentenceStart), correctOffset(offset+sentenceEnd));
return true;
} else {
return false;
@@ -215,7 +215,7 @@ public class TestSegmentingTokenizerBase
clearAttributes();
termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart);
- offsetAtt.setOffset(offset+wordStart, offset+wordEnd);
+ offsetAtt.setOffset(correctOffset(offset+wordStart), correctOffset(offset+wordEnd));
posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
posBoost = 0;
return true;
Modified: lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java?rev=1234652&r1=1234651&r2=1234652&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java Mon Jan 23 00:08:52 2012
@@ -111,7 +111,7 @@ public final class ICUTokenizer extends
@Override
public void end() throws IOException {
final int finalOffset = (length < 0) ? offset : offset + length;
- offsetAtt.setOffset(finalOffset, finalOffset);
+ offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
}
/*