You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/01/17 22:43:33 UTC
svn commit: r900222 - in /lucene/java/branches/lucene_2_9: ./ contrib/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/
contrib/analyzers/common/src/test/org/apache/lucen...
Author: rmuir
Date: Sun Jan 17 21:43:32 2010
New Revision: 900222
URL: http://svn.apache.org/viewvc?rev=900222&view=rev
Log:
LUCENE-2207: CJKTokenizer generates tokens with incorrect offsets
LUCENE-2219: Chinese, SmartChinese, Wikipedia tokenizers generate incorrect offsets, test end() in BaseTokenStreamTestCase
Modified:
lucene/java/branches/lucene_2_9/ (props changed)
lucene/java/branches/lucene_2_9/CHANGES.txt (contents, props changed)
lucene/java/branches/lucene_2_9/build.xml (props changed)
lucene/java/branches/lucene_2_9/contrib/ (props changed)
lucene/java/branches/lucene_2_9/contrib/CHANGES.txt (contents, props changed)
lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
lucene/java/branches/lucene_2_9/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
lucene/java/branches/lucene_2_9/contrib/highlighter/src/test/ (props changed)
lucene/java/branches/lucene_2_9/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java (props changed)
lucene/java/branches/lucene_2_9/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
lucene/java/branches/lucene_2_9/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java
lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/analysis/Tokenizer.java (props changed)
lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java (props changed)
lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/util/AttributeSource.java (props changed)
lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (contents, props changed)
lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java (props changed)
lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/document/TestDateTools.java (props changed)
lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/document/TestNumberTools.java (props changed)
lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java (props changed)
lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/util/TestAttributeSource.java (props changed)
Propchange: lucene/java/branches/lucene_2_9/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Sun Jan 17 21:43:32 2010
@@ -1,3 +1,3 @@
/lucene/java/branches/lucene_2_4:748824
-/lucene/java/branches/lucene_3_0:886275,899639
-/lucene/java/trunk:821888,824125,826029,826385,830871,833095,833297,833886,881819,882672,883554,884870,886257,887347,887532,891189,891363,897672,899627
+/lucene/java/branches/lucene_3_0:886275,899639,900212
+/lucene/java/trunk:821888,824125,826029,826385,830871,833095,833297,833886,881819,882672,883554,884870,886257,887347,887532,891189,891363,897672,899627,900196
Modified: lucene/java/branches/lucene_2_9/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/CHANGES.txt?rev=900222&r1=900221&r2=900222&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/CHANGES.txt (original)
+++ lucene/java/branches/lucene_2_9/CHANGES.txt Sun Jan 17 21:43:32 2010
@@ -43,11 +43,6 @@
* LUCENE-2086: When resolving deleted terms, do so in term sort order
for better performance (Bogdan Ghidireac via Mike McCandless)
-Test Cases
-
- * LUCENE-2114: Change TestFilteredSearch to test on multi-segment
- index as well. (Simon Willnauer via Mike McCandless)
-
Documentation
* LUCENE-2114: Improve javadocs of Filter to call out that the
@@ -56,10 +51,16 @@
Test Cases
+ * LUCENE-2114: Change TestFilteredSearch to test on multi-segment
+ index as well. (Simon Willnauer via Mike McCandless)
+
* LUCENE-2211: Improves BaseTokenStreamTestCase to use a fake attribute
that checks if clearAttributes() was called correctly.
(Uwe Schindler, Robert Muir)
+ * LUCENE-2207, LUCENE-2219: Improve BaseTokenStreamTestCase to check if
+ end() is implemented correctly. (Koji Sekiguchi, Robert Muir)
+
======================= Release 2.9.1 2009-11-06 =======================
Changes in backwards compatibility policy
@@ -3914,4 +3915,4 @@
The code has been re-organized into a new package and directory
structure for this release. It builds OK, but has not been tested
-beyond that since the re-organization.
\ No newline at end of file
+beyond that since the re-organization.
Propchange: lucene/java/branches/lucene_2_9/CHANGES.txt
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Sun Jan 17 21:43:32 2010
@@ -1,2 +1,2 @@
-/lucene/java/branches/lucene_3_0/CHANGES.txt:899639
-/lucene/java/trunk/CHANGES.txt:821888,881819,886257,887347,887532,891189,891363,897672,899627
+/lucene/java/branches/lucene_3_0/CHANGES.txt:899639,900212
+/lucene/java/trunk/CHANGES.txt:821888,881819,886257,887347,887532,891189,891363,897672,899627,900196
Propchange: lucene/java/branches/lucene_2_9/build.xml
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Sun Jan 17 21:43:32 2010
@@ -1 +1,2 @@
-/lucene/java/trunk/build.xml:821888,899627
+/lucene/java/branches/lucene_3_0/build.xml:900212
+/lucene/java/trunk/build.xml:821888,899627,900196
Propchange: lucene/java/branches/lucene_2_9/contrib/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Sun Jan 17 21:43:32 2010
@@ -1,2 +1,2 @@
-/lucene/java/branches/lucene_3_0/contrib:899639
-/lucene/java/trunk/contrib:821888,881819,886257,887347,887532,891189,891363,897672,899627
+/lucene/java/branches/lucene_3_0/contrib:899639,900212
+/lucene/java/trunk/contrib:821888,881819,886257,887347,887532,891189,891363,897672,899627,900196
Modified: lucene/java/branches/lucene_2_9/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/contrib/CHANGES.txt?rev=900222&r1=900221&r2=900222&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/contrib/CHANGES.txt (original)
+++ lucene/java/branches/lucene_2_9/contrib/CHANGES.txt Sun Jan 17 21:43:32 2010
@@ -29,6 +29,10 @@
EdgeNGramTokenFilter, Highlighter, and MemoryIndex.
(Uwe Schindler, Robert Muir)
+ * LUCENE-2207, LUCENE-2219: Fix incorrect offset calculations in end() for
+ CJKTokenizer, ChineseTokenizer, SmartChinese SentenceTokenizer,
+ and WikipediaTokenizer. (Koji Sekiguchi, Robert Muir)
+
======================= Release 2.9.1 2009-11-06 =======================
Changes in backwards compatibility policy
Propchange: lucene/java/branches/lucene_2_9/contrib/CHANGES.txt
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Sun Jan 17 21:43:32 2010
@@ -1 +1,2 @@
-/lucene/java/trunk/contrib/CHANGES.txt:821888,899627
+/lucene/java/branches/lucene_3_0/contrib/CHANGES.txt:900212
+/lucene/java/trunk/contrib/CHANGES.txt:821888,899627,900196
Modified: lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java?rev=900222&r1=900221&r2=900222&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (original)
+++ lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java Sun Jan 17 21:43:32 2010
@@ -176,9 +176,13 @@
length = 0;
preIsTokened = false;
}
+ else{
+ offset--;
+ }
break;
} else {
+ offset--;
return false;
}
} else {
@@ -289,6 +293,7 @@
typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]);
return true;
} else if (dataLen == -1) {
+ offset--;
return false;
}
@@ -299,7 +304,7 @@
public final void end() {
// set final offset
- final int finalOffset = offset;
+ final int finalOffset = correctOffset(offset);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
Modified: lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java?rev=900222&r1=900221&r2=900222&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (original)
+++ lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java Sun Jan 17 21:43:32 2010
@@ -128,8 +128,10 @@
bufferIndex = 0;
}
- if (dataLen == -1) return flush();
- else
+ if (dataLen == -1) {
+ offset--;
+ return flush();
+ } else
c = ioBuffer[bufferIndex++];
@@ -160,7 +162,7 @@
public final void end() {
// set final offset
- final int finalOffset = offset;
+ final int finalOffset = correctOffset(offset);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
Modified: lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java?rev=900222&r1=900221&r2=900222&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java (original)
+++ lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java Sun Jan 17 21:43:32 2010
@@ -18,14 +18,10 @@
*/
import java.io.IOException;
-import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Version;
public class TestCJKTokenizer extends BaseTokenStreamTestCase {
@@ -47,33 +43,33 @@
}
public void checkCJKToken(final String str, final TestToken[] out_tokens) throws IOException {
- CJKTokenizer tokenizer = new CJKTokenizer(new StringReader(str));
- TermAttribute termAtt = (TermAttribute) tokenizer.getAttribute(TermAttribute.class);
- OffsetAttribute offsetAtt = (OffsetAttribute) tokenizer.getAttribute(OffsetAttribute.class);
- TypeAttribute typeAtt = (TypeAttribute) tokenizer.getAttribute(TypeAttribute.class);
+ Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);
+ String terms[] = new String[out_tokens.length];
+ int startOffsets[] = new int[out_tokens.length];
+ int endOffsets[] = new int[out_tokens.length];
+ String types[] = new String[out_tokens.length];
for (int i = 0; i < out_tokens.length; i++) {
- assertTrue(tokenizer.incrementToken());
- assertEquals(termAtt.term(), out_tokens[i].termText);
- assertEquals(offsetAtt.startOffset(), out_tokens[i].start);
- assertEquals(offsetAtt.endOffset(), out_tokens[i].end);
- assertEquals(typeAtt.type(), out_tokens[i].type);
+ terms[i] = out_tokens[i].termText;
+ startOffsets[i] = out_tokens[i].start;
+ endOffsets[i] = out_tokens[i].end;
+ types[i] = out_tokens[i].type;
}
- assertFalse(tokenizer.incrementToken());
+ assertAnalyzesTo(analyzer, str, terms, startOffsets, endOffsets, types, null);
}
public void checkCJKTokenReusable(final Analyzer a, final String str, final TestToken[] out_tokens) throws IOException {
- TokenStream ts = a.reusableTokenStream("dummy", new StringReader(str));
- TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
- OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
- TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
+ Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);
+ String terms[] = new String[out_tokens.length];
+ int startOffsets[] = new int[out_tokens.length];
+ int endOffsets[] = new int[out_tokens.length];
+ String types[] = new String[out_tokens.length];
for (int i = 0; i < out_tokens.length; i++) {
- assertTrue(ts.incrementToken());
- assertEquals(termAtt.term(), out_tokens[i].termText);
- assertEquals(offsetAtt.startOffset(), out_tokens[i].start);
- assertEquals(offsetAtt.endOffset(), out_tokens[i].end);
- assertEquals(typeAtt.type(), out_tokens[i].type);
+ terms[i] = out_tokens[i].termText;
+ startOffsets[i] = out_tokens[i].start;
+ endOffsets[i] = out_tokens[i].end;
+ types[i] = out_tokens[i].type;
}
- assertFalse(ts.incrementToken());
+ assertAnalyzesToReuse(analyzer, str, terms, startOffsets, endOffsets, types, null);
}
public void testJa1() throws IOException {
@@ -219,13 +215,8 @@
public void testTokenStream() throws Exception {
Analyzer analyzer = new CJKAnalyzer();
- TokenStream ts = analyzer.tokenStream("dummy", new StringReader("\u4e00\u4e01\u4e02"));
- TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
- assertTrue(ts.incrementToken());
- assertEquals("\u4e00\u4e01", termAtt.term());
- assertTrue(ts.incrementToken());
- assertEquals("\u4e01\u4e02", termAtt.term());
- assertFalse(ts.incrementToken());
+ assertAnalyzesTo(analyzer, "\u4e00\u4e01\u4e02",
+ new String[] { "\u4e00\u4e01", "\u4e01\u4e02"});
}
public void testReusableTokenStream() throws Exception {
@@ -261,4 +252,24 @@
};
checkCJKTokenReusable(analyzer, str, out_tokens2);
}
+
+ /**
+ * LUCENE-2207: wrong offset calculated by end()
+ */
+ public void testFinalOffset() throws IOException {
+ checkCJKToken("あい", new TestToken[] {
+ newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
+ checkCJKToken("あい ", new TestToken[] {
+ newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
+ checkCJKToken("test", new TestToken[] {
+ newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE) });
+ checkCJKToken("test ", new TestToken[] {
+ newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE) });
+ checkCJKToken("あいtest", new TestToken[] {
+ newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("test", 2, 6, CJKTokenizer.SINGLE_TOKEN_TYPE) });
+ checkCJKToken("testあい ", new TestToken[] {
+ newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE),
+ newToken("あい", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
+ }
}
Modified: lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java?rev=900222&r1=900221&r2=900222&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java (original)
+++ lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java Sun Jan 17 21:43:32 2010
@@ -65,33 +65,33 @@
public void testFrontUnigram() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 1);
- assertTokenStreamContents(tokenizer, new String[]{"a"}, new int[]{0}, new int[]{1});
+ assertTokenStreamContents(tokenizer, new String[]{"a"}, new int[]{0}, new int[]{1}, new Integer(5) /* abcde */);
}
public void testBackUnigram() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 1);
- assertTokenStreamContents(tokenizer, new String[]{"e"}, new int[]{4}, new int[]{5});
+ assertTokenStreamContents(tokenizer, new String[]{"e"}, new int[]{4}, new int[]{5}, new Integer(5) /* abcde */);
}
public void testOversizedNgrams() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 6, 6);
- assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]);
+ assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], new Integer(5) /* abcde */);
}
public void testFrontRangeOfNgrams() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
- assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3});
+ assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3}, new Integer(5) /* abcde */);
}
public void testBackRangeOfNgrams() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3);
- assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5});
+ assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}, new Integer(5) /* abcde */);
}
public void testReset() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
- assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3});
+ assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3}, new Integer(5) /* abcde */);
tokenizer.reset(new StringReader("abcde"));
- assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3});
+ assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3}, new Integer(5) /* abcde */);
}
}
Modified: lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java?rev=900222&r1=900221&r2=900222&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (original)
+++ lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java Sun Jan 17 21:43:32 2010
@@ -57,12 +57,12 @@
public void testUnigrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
- assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
+ assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, new Integer(5) /* abcde */);
}
public void testBigrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
- assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5});
+ assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, new Integer(5) /* abcde */);
}
public void testNgrams() throws Exception {
@@ -70,19 +70,20 @@
assertTokenStreamContents(tokenizer,
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
- new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}
+ new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
+ new Integer(5) /* abcde */
);
}
public void testOversizedNgrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
- assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]);
+ assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], new Integer(5) /* abcde */);
}
public void testReset() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
- assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
+ assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, new Integer(5) /* abcde */);
tokenizer.reset(new StringReader("abcde"));
- assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
+ assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, new Integer(5) /* abcde */);
}
}
Modified: lucene/java/branches/lucene_2_9/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java?rev=900222&r1=900221&r2=900222&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (original)
+++ lucene/java/branches/lucene_2_9/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java Sun Jan 17 21:43:32 2010
@@ -131,4 +131,10 @@
super.reset(input);
reset();
}
+
+ public void end() throws IOException {
+ // set final offset
+ final int finalOffset = correctOffset(tokenEnd);
+ offsetAtt.setOffset(finalOffset, finalOffset);
+ }
}
Propchange: lucene/java/branches/lucene_2_9/contrib/highlighter/src/test/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Sun Jan 17 21:43:32 2010
@@ -1 +1,2 @@
-/lucene/java/trunk/contrib/highlighter/src/test:821888,899627
+/lucene/java/branches/lucene_3_0/contrib/highlighter/src/test:900212
+/lucene/java/trunk/contrib/highlighter/src/test:821888,899627,900196
Propchange: lucene/java/branches/lucene_2_9/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Sun Jan 17 21:43:32 2010
@@ -1 +1,2 @@
-/lucene/java/trunk/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java:821888,899627
+/lucene/java/branches/lucene_3_0/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java:900212
+/lucene/java/trunk/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java:821888,899627,900196
Modified: lucene/java/branches/lucene_2_9/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java?rev=900222&r1=900221&r2=900222&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java (original)
+++ lucene/java/branches/lucene_2_9/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java Sun Jan 17 21:43:32 2010
@@ -331,4 +331,9 @@
reset();
}
-}
\ No newline at end of file
+ public void end() throws IOException {
+ // set final offset
+ final int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
+ this.offsetAtt.setOffset(finalOffset, finalOffset);
+ }
+}
Modified: lucene/java/branches/lucene_2_9/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java?rev=900222&r1=900221&r2=900222&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java (original)
+++ lucene/java/branches/lucene_2_9/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java Sun Jan 17 21:43:32 2010
@@ -45,9 +45,15 @@
}
public void testSimple() throws Exception {
- WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader("This is a [[Category:foo]]"));
+ String text = "This is a [[Category:foo]]";
+ WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(text));
assertTokenStreamContents(tf,
- new String[] { "This", "is", "a", "foo" });
+ new String[] { "This", "is", "a", "foo" },
+ new int[] { 0, 5, 8, 21 },
+ new int[] { 4, 7, 9, 24 },
+ new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", WikipediaTokenizer.CATEGORY },
+ new int[] { 1, 1, 1, 1, },
+ new Integer(text.length()));
}
public void testHandwritten() throws Exception {
Propchange: lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/analysis/Tokenizer.java
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Sun Jan 17 21:43:32 2010
@@ -1 +1,2 @@
-/lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java:821888,899627
+/lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/analysis/Tokenizer.java:900212
+/lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java:821888,899627,900196
Propchange: lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Sun Jan 17 21:43:32 2010
@@ -1,2 +1,2 @@
-/lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:899639
-/lucene/java/trunk/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:821888,881819,886257,887347,887532,891189,891363,897672,899627
+/lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:899639,900212
+/lucene/java/trunk/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:821888,881819,886257,887347,887532,891189,891363,897672,899627,900196
Propchange: lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/util/AttributeSource.java
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Sun Jan 17 21:43:32 2010
@@ -1,3 +1,3 @@
/lucene/java/branches/lucene_2_4/src/java/org/apache/lucene/util/AttributeSource.java:748824
-/lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/util/AttributeSource.java:886275,899639
+/lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/util/AttributeSource.java:886275,899639,900212
/lucene/java/trunk/src/java/org/apache/lucene/util/AttributeSource.java:821888,824125,826029,826385,830871,833095,833297,833886,881819,882672,883554,884870,886257,887347,887532,891189,891363,894348,897672
Modified: lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=900222&r1=900221&r2=900222&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Sun Jan 17 21:43:32 2010
@@ -127,7 +127,7 @@
}
}
- public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
+ public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
assertNotNull(output);
CheckClearAttributesAttribute checkClearAtt = (CheckClearAttributesAttribute) ts.addAttribute(CheckClearAttributesAttribute.class);
@@ -135,7 +135,7 @@
TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = null;
- if (startOffsets != null || endOffsets != null) {
+ if (startOffsets != null || endOffsets != null || finalOffset != null) {
assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
}
@@ -177,32 +177,45 @@
}
assertFalse("end of stream", ts.incrementToken());
ts.end();
+ if (finalOffset != null)
+ assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
ts.close();
}
+ public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
+ assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null);
+ }
+
public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
- assertTokenStreamContents(ts, output, null, null, null, null);
+ assertTokenStreamContents(ts, output, null, null, null, null, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, String[] types) throws IOException {
- assertTokenStreamContents(ts, output, null, null, types, null);
+ assertTokenStreamContents(ts, output, null, null, types, null, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] posIncrements) throws IOException {
- assertTokenStreamContents(ts, output, null, null, null, posIncrements);
+ assertTokenStreamContents(ts, output, null, null, null, posIncrements, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
- assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null);
+ assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null);
+ }
+
+ public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], Integer finalOffset) throws IOException {
+ assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, finalOffset);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
- assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements);
+ assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null);
}
+ public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, Integer finalOffset) throws IOException {
+ assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, finalOffset);
+ }
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
- assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements);
+ assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, new Integer(input.length()));
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
@@ -227,7 +240,7 @@
public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
- assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements);
+ assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, new Integer(input.length()));
}
public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws IOException {
Propchange: lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Sun Jan 17 21:43:32 2010
@@ -1,3 +1,3 @@
/lucene/java/branches/lucene_2_4/src/test/org/apache/lucene/analysis/BaseTokenStreamTestCase.java:748824
-/lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/analysis/BaseTokenStreamTestCase.java:886275,899639
+/lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/analysis/BaseTokenStreamTestCase.java:886275,899639,900212
/lucene/java/trunk/src/test/org/apache/lucene/analysis/BaseTokenStreamTestCase.java:818920,821888,824125,826029,826385,830871,833095,833297,833886,881819,882672,883554,884870,887347,887532,891189,891363,897672
Propchange: lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Sun Jan 17 21:43:32 2010
@@ -1,2 +1,2 @@
-/lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java:899639
-/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java:821888,881819,886257,887347,887532,891189,891363,897672,899627
+/lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java:899639,900212
+/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java:821888,881819,886257,887347,887532,891189,891363,897672,899627,900196
Propchange: lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/document/TestDateTools.java
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Sun Jan 17 21:43:32 2010
@@ -1,2 +1,2 @@
-/lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/document/TestDateTools.java:899639
-/lucene/java/trunk/src/test/org/apache/lucene/document/TestDateTools.java:821888,881819,886257,887347,887532,891189,891363,897672,899627
+/lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/document/TestDateTools.java:899639,900212
+/lucene/java/trunk/src/test/org/apache/lucene/document/TestDateTools.java:821888,881819,886257,887347,887532,891189,891363,897672,899627,900196
Propchange: lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/document/TestNumberTools.java
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Sun Jan 17 21:43:32 2010
@@ -1,2 +1,2 @@
-/lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/document/TestNumberTools.java:899639
-/lucene/java/trunk/src/test/org/apache/lucene/document/TestNumberTools.java:821888,881819,886257,887347,887532,891189,891363,897672,899627
+/lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/document/TestNumberTools.java:899639,900212
+/lucene/java/trunk/src/test/org/apache/lucene/document/TestNumberTools.java:821888,881819,886257,887347,887532,891189,891363,897672,899627,900196
Propchange: lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Sun Jan 17 21:43:32 2010
@@ -1,2 +1,2 @@
-/lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:899639
-/lucene/java/trunk/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:821888,881819,886257,887347,887532,891189,891363,897672,899627
+/lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:899639,900212
+/lucene/java/trunk/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:821888,881819,886257,887347,887532,891189,891363,897672,899627,900196
Propchange: lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/util/TestAttributeSource.java
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Sun Jan 17 21:43:32 2010
@@ -1,2 +1,2 @@
-/lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/util/TestAttributeSource.java:899639
-/lucene/java/trunk/src/test/org/apache/lucene/util/TestAttributeSource.java:821888,881819,886257,887347,887532,891189,891363,897672,899627
+/lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/util/TestAttributeSource.java:899639,900212
+/lucene/java/trunk/src/test/org/apache/lucene/util/TestAttributeSource.java:821888,881819,886257,887347,887532,891189,891363,897672,899627,900196