You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/03/21 00:02:38 UTC
svn commit: r1303193 - in /lucene/dev/trunk: lucene/contrib/
lucene/test-framework/src/java/org/apache/lucene/analysis/
modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/
modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segm...
Author: mikemccand
Date: Tue Mar 20 23:02:37 2012
New Revision: 1303193
URL: http://svn.apache.org/viewvc?rev=1303193&view=rev
Log:
LUCENE-3894: some tokenizers weren't reading all input chars
Added:
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/MockReaderWrapper.java (with props)
Modified:
lucene/dev/trunk/lucene/contrib/CHANGES.txt
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=1303193&r1=1303192&r2=1303193&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Tue Mar 20 23:02:37 2012
@@ -271,6 +271,10 @@ Bug Fixes
* LUCENE-3831: avoid NPE if the SpanQuery has a null field (eg a
SpanOrQuery with no clauses added). (Alan Woodward via Mike
McCandless).
+
+ * LUCENE-3894: ICUTokenizer, NGramTokenizer and EdgeNGramTokenizer
+ could stop early if the Reader only partially fills the provided
+ buffer.
Documentation
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1303193&r1=1303192&r2=1303193&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Tue Mar 20 23:02:37 2012
@@ -177,8 +177,9 @@ public abstract class BaseTokenStreamTes
}
assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
ts.end();
- if (finalOffset != null)
+ if (finalOffset != null) {
assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
+ }
if (offsetAtt != null) {
assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
}
@@ -391,6 +392,8 @@ public abstract class BaseTokenStreamTes
List<Integer> startOffsets = new ArrayList<Integer>();
List<Integer> endOffsets = new ArrayList<Integer>();
ts.reset();
+
+ // First pass: save away "correct" tokens
while (ts.incrementToken()) {
tokens.add(termAtt.toString());
if (typeAtt != null) types.add(typeAtt.type());
@@ -403,12 +406,98 @@ public abstract class BaseTokenStreamTes
}
ts.end();
ts.close();
+
// verify reusing is "reproducible" and also get the normal tokenstream sanity checks
if (!tokens.isEmpty()) {
+
+ // KWTokenizer (for example) can produce a token
+ // even when input is length 0:
+ if (text.length() != 0) {
+
+ // (Optional) second pass: do something evil:
+ final int evilness = random.nextInt(50);
+ if (evilness == 17) {
+ if (VERBOSE) {
+ System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
+ }
+ // Throw an errant exception from the Reader:
+
+ MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
+ evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
+ reader = evilReader;
+
+ try {
+ // NOTE: some Tokenizers go and read characters
+ // when you call .setReader(Reader), eg
+ // PatternTokenizer. This is a bit
+ // iffy... (really, they should only
+ // pull from the Reader when you call
+ // .incrementToken(), I think?), but we
+ // currently allow it, so, we must call
+ // a.tokenStream inside the try since we may
+ // hit the exc on init:
+ ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
+ ts.reset();
+ while (ts.incrementToken());
+ fail("did not hit exception");
+ } catch (RuntimeException re) {
+ assertTrue(MockReaderWrapper.isMyEvilException(re));
+ }
+ try {
+ ts.end();
+ } catch (AssertionError ae) {
+ // Catch & ignore MockTokenizer's
+ // anger...
+ if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
+ // OK
+ } else {
+ throw ae;
+ }
+ }
+ ts.close();
+ } else if (evilness == 7) {
+ // Only consume a subset of the tokens:
+ final int numTokensToRead = random.nextInt(tokens.size());
+ if (VERBOSE) {
+ System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
+ }
+
+ reader = new StringReader(text);
+ ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+ ts.reset();
+ for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
+ assertTrue(ts.incrementToken());
+ }
+ try {
+ ts.end();
+ } catch (AssertionError ae) {
+ // Catch & ignore MockTokenizer's
+ // anger...
+ if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
+ // OK
+ } else {
+ throw ae;
+ }
+ }
+ ts.close();
+ }
+ }
+
+ // Final pass: verify clean tokenization matches
+ // results from first pass:
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
}
reader = new StringReader(text);
+
+ if (random.nextInt(30) == 7) {
+ if (VERBOSE) {
+ System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
+ }
+
+ reader = new MockReaderWrapper(random, reader);
+ }
+
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength + type
Added: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/MockReaderWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/MockReaderWrapper.java?rev=1303193&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/MockReaderWrapper.java (added)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/MockReaderWrapper.java Tue Mar 20 23:02:37 2012
@@ -0,0 +1,98 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Random;
+
+import org.apache.lucene.util._TestUtil;
+
+/** Wraps a Reader, and can throw random or fixed
+ * exceptions, and spoon feed read chars. */
+
+public class MockReaderWrapper extends Reader {
+
+ private final Reader in;
+ private final Random random;
+
+ private int excAtChar = -1;
+ private int readSoFar;
+ private boolean throwExcNext;
+
+ public MockReaderWrapper(Random random, Reader in) {
+ this.in = in;
+ this.random = random;
+ }
+
+ /** Throw an exception after reading this many chars. */
+ public void throwExcAfterChar(int charUpto) {
+ excAtChar = charUpto;
+ // You should only call this on init!:
+ assert readSoFar == 0;
+ }
+
+ public void throwExcNext() {
+ throwExcNext = true;
+ }
+
+ @Override
+ public void close() throws IOException {
+ in.close();
+ }
+
+ @Override
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ if (throwExcNext || (excAtChar != -1 && readSoFar >= excAtChar)) {
+ throw new RuntimeException("fake exception now!");
+ }
+ final int read;
+ final int realLen;
+ if (len == 1) {
+ realLen = 1;
+ } else {
+ // Spoon-feed: intentionally maybe return less than
+ // the consumer asked for
+ realLen = _TestUtil.nextInt(random, 1, len);
+ }
+ if (excAtChar != -1) {
+ final int left = excAtChar - readSoFar;
+ assert left != 0;
+ read = in.read(cbuf, off, Math.min(realLen, left));
+ assert read != -1;
+ readSoFar += read;
+ } else {
+ read = in.read(cbuf, off, realLen);
+ }
+ return read;
+ }
+
+ @Override
+ public boolean markSupported() {
+ return false;
+ }
+
+ @Override
+ public boolean ready() {
+ return false;
+ }
+
+ public static boolean isMyEvilException(Throwable t) {
+ return (t instanceof RuntimeException) && "fake exception now!".equals(t.getMessage());
+ }
+};
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java?rev=1303193&r1=1303192&r2=1303193&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java Tue Mar 20 23:02:37 2012
@@ -199,8 +199,11 @@ public class MockTokenizer extends Token
offsetAtt.setOffset(finalOffset, finalOffset);
// some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false.
// these tests should disable this check (in general you should consume the entire stream)
- assert !enableChecks || streamState == State.INCREMENT_FALSE : "end() called before incrementToken() returned false!";
- streamState = State.END;
+ try {
+ assert !enableChecks || streamState == State.INCREMENT_FALSE : "end() called before incrementToken() returned false!";
+ } finally {
+ streamState = State.END;
+ }
}
/**
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java?rev=1303193&r1=1303192&r2=1303193&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java Tue Mar 20 23:02:37 2012
@@ -183,15 +183,22 @@ public final class EdgeNGramTokenizer ex
// if we are just starting, read the whole input
if (!started) {
started = true;
+ gramSize = minGram;
char[] chars = new char[1024];
- charsRead = input.read(chars);
- if (charsRead < 0) {
- charsRead = inLen = 0;
- return false;
+ charsRead = 0;
+ // TODO: refactor to a shared readFully somewhere:
+ while (charsRead < chars.length) {
+ int inc = input.read(chars, charsRead, chars.length-charsRead);
+ if (inc == -1) {
+ break;
+ }
+ charsRead += inc;
}
- inStr = new String(chars, 0, charsRead).trim(); // remove any leading or trailing spaces
+ inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings
inLen = inStr.length();
- gramSize = minGram;
+ if (inLen == 0) {
+ return false;
+ }
}
// if the remaining input is too short, we can't generate any n-grams
@@ -223,7 +230,6 @@ public final class EdgeNGramTokenizer ex
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
- reset();
}
@Override
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java?rev=1303193&r1=1303192&r2=1303193&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java Tue Mar 20 23:02:37 2012
@@ -105,13 +105,20 @@ public final class NGramTokenizer extend
started = true;
gramSize = minGram;
char[] chars = new char[1024];
- charsRead = input.read(chars);
- if (charsRead < 0) {
- charsRead = inLen = 0;
- return false;
+ charsRead = 0;
+ // TODO: refactor to a shared readFully somewhere:
+ while (charsRead < chars.length) {
+ int inc = input.read(chars, charsRead, chars.length-charsRead);
+ if (inc == -1) {
+ break;
+ }
+ charsRead += inc;
}
- inStr = new String(chars).trim(); // remove any trailing empty strings
+ inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings
inLen = inStr.length();
+ if (inLen == 0) {
+ return false;
+ }
}
if (pos+gramSize > inLen) { // if we hit the end of the string
@@ -140,7 +147,6 @@ public final class NGramTokenizer extend
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
- reset();
}
@Override
Modified: lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java?rev=1303193&r1=1303192&r2=1303193&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java Tue Mar 20 23:02:37 2012
@@ -151,8 +151,8 @@ public final class ICUTokenizer extends
int leftover = length - usableLength;
System.arraycopy(buffer, usableLength, buffer, 0, leftover);
int requested = buffer.length - leftover;
- int returned = input.read(buffer, leftover, requested);
- length = returned < 0 ? leftover : returned + leftover;
+ int returned = read(input, buffer, leftover, requested);
+ length = returned + leftover;
if (returned < requested) /* reader has been emptied, process the rest */
usableLength = length;
else { /* still more data to be read, find a safe-stopping place */
@@ -167,6 +167,24 @@ public final class ICUTokenizer extends
breaker.setText(buffer, 0, Math.max(0, usableLength));
}
+ // TODO: refactor to a shared readFully somewhere
+ // (NGramTokenizer does this too):
+ /** commons-io's readFully, but without bugs if offset != 0 */
+ private static int read(Reader input, char[] buffer, int offset, int length) throws IOException {
+ assert length >= 0 : "length must not be negative: " + length;
+
+ int remaining = length;
+ while ( remaining > 0 ) {
+ int location = length - remaining;
+ int count = input.read( buffer, offset + location, remaining );
+ if ( -1 == count ) { // EOF
+ break;
+ }
+ remaining -= count;
+ }
+ return length - remaining;
+ }
+
/*
* return true if there is a token from the buffer, or null if it is
* exhausted.