You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/05 18:41:28 UTC
svn commit: r1529482 - in /lucene/dev/trunk: lucene/
lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/
lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/
lucene/analysis/kuromoji/src/test/org/apache/lucene/analys...
Author: rmuir
Date: Sat Oct 5 16:41:28 2013
New Revision: 1529482
URL: http://svn.apache.org/r1529482
Log:
LUCENE-5240: additional safety in Tokenizer state machine
Removed:
lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestExtendedMode.java
lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/CollationTestBase.java
lucene/dev/trunk/solr/core/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java
lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/TrieField.java
lucene/dev/trunk/solr/core/src/test/org/apache/solr/TestTrie.java
lucene/dev/trunk/solr/core/src/test/org/apache/solr/rest/schema/TestFieldTypeResource.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1529482&r1=1529481&r2=1529482&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Sat Oct 5 16:41:28 2013
@@ -80,6 +80,10 @@ New Features
on best effort which was not user-friendly.
(Uwe Schindler, Robert Muir)
+* LUCENE-5240: Tokenizers now throw an IllegalStateException if the
+ consumer neglects to call close() on the previous stream before consuming
+ the next one. (Uwe Schindler, Robert Muir)
+
* LUCENE-5214: Add new FreeTextSuggester, to predict the next word
using a simple ngram language model. This is useful for the "long
tail" suggestions, when a primary suggester fails to find a
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java?rev=1529482&r1=1529481&r2=1529482&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java Sat Oct 5 16:41:28 2013
@@ -48,6 +48,7 @@ public class CommonGramsFilterTest exten
assertEquals("the", term.toString());
assertTrue(cgf.incrementToken());
assertEquals("the_s", term.toString());
+ cgf.close();
wt.setReader(new StringReader(input));
cgf.reset();
@@ -67,6 +68,7 @@ public class CommonGramsFilterTest exten
assertEquals("How_the", term.toString());
assertTrue(nsf.incrementToken());
assertEquals("the_s", term.toString());
+ nsf.close();
wt.setReader(new StringReader(input));
nsf.reset();
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java?rev=1529482&r1=1529481&r2=1529482&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java Sat Oct 5 16:41:28 2013
@@ -240,6 +240,8 @@ public class TestCompoundWordTokenFilter
assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
assertTrue(tf.incrementToken());
assertEquals("Rind", termAtt.toString());
+ tf.end();
+ tf.close();
wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
tf.reset();
assertTrue(tf.incrementToken());
Modified: lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestExtendedMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestExtendedMode.java?rev=1529482&r1=1529481&r2=1529482&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestExtendedMode.java (original)
+++ lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestExtendedMode.java Sat Oct 5 16:41:28 2013
@@ -59,6 +59,8 @@ public class TestExtendedMode extends Ba
while (ts.incrementToken()) {
assertTrue(UnicodeUtil.validUTF16String(termAtt));
}
+ ts.end();
+ ts.close();
}
}
Modified: lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java?rev=1529482&r1=1529481&r2=1529482&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java (original)
+++ lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java Sat Oct 5 16:41:28 2013
@@ -217,6 +217,8 @@ public class TestJapaneseTokenizer exten
ts.reset();
while (ts.incrementToken()) {
}
+ ts.end();
+ ts.close();
}
}
@@ -240,6 +242,8 @@ public class TestJapaneseTokenizer exten
while (ts.incrementToken()) {
assertTrue(UnicodeUtil.validUTF16String(termAtt));
}
+ ts.end();
+ ts.close();
}
}
@@ -630,6 +634,8 @@ public class TestJapaneseTokenizer exten
final TokenStream ts = analyzer.tokenStream("ignored", line);
ts.reset();
while(ts.incrementToken());
+ ts.end();
+ ts.close();
}
String[] sentences = line.split("、|。");
if (VERBOSE) {
@@ -642,6 +648,8 @@ public class TestJapaneseTokenizer exten
final TokenStream ts = analyzer.tokenStream("ignored", sentence);
ts.reset();
while(ts.incrementToken());
+ ts.end();
+ ts.close();
}
}
if (VERBOSE) {
Modified: lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java?rev=1529482&r1=1529481&r2=1529482&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java (original)
+++ lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java Sat Oct 5 16:41:28 2013
@@ -90,6 +90,8 @@ public class ReadTokensTask extends Perf
termAtt.fillBytesRef();
tokenCount++;
}
+ stream.end();
+ stream.close();
}
totalTokenCount += tokenCount;
return tokenCount;
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java?rev=1529482&r1=1529481&r2=1529482&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java Sat Oct 5 16:41:28 2013
@@ -85,8 +85,9 @@ public abstract class Tokenizer extends
public final void setReader(Reader input) throws IOException {
if (input == null) {
throw new NullPointerException("input must not be null");
+ } else if (this.input != ILLEGAL_STATE_READER) {
+ throw new IllegalStateException("TokenStream contract violation: close() call missing");
}
- this.input = ILLEGAL_STATE_READER;
this.inputPending = input;
assert setReaderTestPoint();
}
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1529482&r1=1529481&r2=1529482&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Sat Oct 5 16:41:28 2013
@@ -401,6 +401,20 @@ public abstract class BaseTokenStreamTes
ts.end();
ts.close();
}
+
+ // check for a missing close()
+ ts = a.tokenStream("bogus", input);
+ ts.reset();
+ while (ts.incrementToken()) {}
+ ts.end();
+ try {
+ ts = a.tokenStream("bogus", input);
+ fail("didn't get expected exception when close() not called");
+ } catch (IllegalStateException expected) {
+ // ok
+ } finally {
+ ts.close();
+ }
}
// simple utility method for testing stemmers
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/CollationTestBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/CollationTestBase.java?rev=1529482&r1=1529481&r2=1529482&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/CollationTestBase.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/CollationTestBase.java Sat Oct 5 16:41:28 2013
@@ -266,6 +266,9 @@ public abstract class CollationTestBase
termAtt.fillBytesRef();
// ensure we make a copy of the actual bytes too
map.put(term, BytesRef.deepCopyOf(bytes));
+ assertFalse(ts.incrementToken());
+ ts.end();
+ ts.close();
}
Thread threads[] = new Thread[numThreads];
@@ -284,6 +287,9 @@ public abstract class CollationTestBase
assertTrue(ts.incrementToken());
termAtt.fillBytesRef();
assertEquals(expected, bytes);
+ assertFalse(ts.incrementToken());
+ ts.end();
+ ts.close();
}
} catch (IOException e) {
throw new RuntimeException(e);
Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java?rev=1529482&r1=1529481&r2=1529482&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java Sat Oct 5 16:41:28 2013
@@ -30,6 +30,7 @@ import org.apache.lucene.util.AttributeS
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.IOUtils;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
@@ -138,9 +139,10 @@ public abstract class AnalysisRequestHan
* @param analyzer The analyzer to use.
*/
protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
+ TokenStream tokenStream = null;
try {
final Set<BytesRef> tokens = new HashSet<BytesRef>();
- final TokenStream tokenStream = analyzer.tokenStream("", query);
+ tokenStream = analyzer.tokenStream("", query);
final TermToBytesRefAttribute bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
final BytesRef bytes = bytesAtt.getBytesRef();
@@ -152,10 +154,11 @@ public abstract class AnalysisRequestHan
}
tokenStream.end();
- tokenStream.close();
return tokens;
} catch (IOException ioe) {
throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
+ } finally {
+ IOUtils.closeWhileHandlingException(tokenStream);
}
}
@@ -181,8 +184,11 @@ public abstract class AnalysisRequestHan
trackerAtt.setActPosition(position);
tokens.add(tokenStream.cloneAttributes());
}
+ tokenStream.end();
} catch (IOException ioe) {
throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
+ } finally {
+ IOUtils.closeWhileHandlingException(tokenStream);
}
return tokens;
Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/TrieField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/TrieField.java?rev=1529482&r1=1529481&r2=1529482&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/TrieField.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/schema/TrieField.java Sat Oct 5 16:41:28 2013
@@ -24,8 +24,6 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
-import org.apache.lucene.analysis.util.CharFilterFactory;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.FieldType.NumericType;
@@ -51,8 +49,6 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.mutable.MutableValueDate;
import org.apache.lucene.util.mutable.MutableValueLong;
-import org.apache.solr.analysis.TokenizerChain;
-import org.apache.solr.analysis.TrieTokenizerFactory;
import org.apache.solr.common.SolrException;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
@@ -111,12 +107,6 @@ public class TrieField extends Primitive
"Invalid type specified in schema.xml for field: " + args.get("name"), e);
}
}
-
- CharFilterFactory[] filterFactories = new CharFilterFactory[0];
- TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
- analyzer = new TokenizerChain(filterFactories, new TrieTokenizerFactory(type, precisionStep), tokenFilterFactories);
- // for query time we only need one token, so we use the biggest possible precisionStep:
- queryAnalyzer = new TokenizerChain(filterFactories, new TrieTokenizerFactory(type, Integer.MAX_VALUE), tokenFilterFactories);
}
@Override
@@ -223,7 +213,7 @@ public class TrieField extends Primitive
@Override
public boolean isTokenized() {
- return true;
+ return false;
}
@Override
@@ -382,24 +372,29 @@ public class TrieField extends Primitive
@Override
public void readableToIndexed(CharSequence val, BytesRef result) {
String s = val.toString();
- switch (type) {
- case INTEGER:
- NumericUtils.intToPrefixCodedBytes(Integer.parseInt(s), 0, result);
- break;
- case FLOAT:
- NumericUtils.intToPrefixCodedBytes(NumericUtils.floatToSortableInt(Float.parseFloat(s)), 0, result);
- break;
- case LONG:
- NumericUtils.longToPrefixCodedBytes(Long.parseLong(s), 0, result);
- break;
- case DOUBLE:
- NumericUtils.longToPrefixCodedBytes(NumericUtils.doubleToSortableLong(Double.parseDouble(s)), 0, result);
- break;
- case DATE:
- NumericUtils.longToPrefixCodedBytes(dateField.parseMath(null, s).getTime(), 0, result);
- break;
- default:
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + type);
+ try {
+ switch (type) {
+ case INTEGER:
+ NumericUtils.intToPrefixCodedBytes(Integer.parseInt(s), 0, result);
+ break;
+ case FLOAT:
+ NumericUtils.intToPrefixCodedBytes(NumericUtils.floatToSortableInt(Float.parseFloat(s)), 0, result);
+ break;
+ case LONG:
+ NumericUtils.longToPrefixCodedBytes(Long.parseLong(s), 0, result);
+ break;
+ case DOUBLE:
+ NumericUtils.longToPrefixCodedBytes(NumericUtils.doubleToSortableLong(Double.parseDouble(s)), 0, result);
+ break;
+ case DATE:
+ NumericUtils.longToPrefixCodedBytes(dateField.parseMath(null, s).getTime(), 0, result);
+ break;
+ default:
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + type);
+ }
+ } catch (NumberFormatException nfe) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+ "Invalid Number: " + val);
}
}
Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/TestTrie.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/TestTrie.java?rev=1529482&r1=1529481&r2=1529482&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/TestTrie.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/TestTrie.java Sat Oct 5 16:41:28 2013
@@ -16,8 +16,6 @@
*/
package org.apache.solr;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.DateField;
import org.apache.solr.schema.FieldType;
@@ -49,38 +47,6 @@ public class TestTrie extends SolrTestCa
clearIndex();
super.tearDown();
}
-
- @Test
- public void testTokenizer() throws Exception {
- FieldType type = h.getCore().getLatestSchema().getFieldType("tint");
- assertTrue(type instanceof TrieField);
-
- String value = String.valueOf(random().nextInt());
- TokenStream ts = type.getAnalyzer().tokenStream("dummy", value);
- OffsetAttribute ofsAtt = ts.addAttribute(OffsetAttribute.class);
- ts.reset();
- int count = 0;
- while (ts.incrementToken()) {
- count++;
- assertEquals(0, ofsAtt.startOffset());
- assertEquals(value.length(), ofsAtt.endOffset());
- }
- final int precStep = ((TrieField) type).getPrecisionStep();
- assertEquals( (32 + precStep - 1) / precStep, count);
- ts.end();
- assertEquals(value.length(), ofsAtt.startOffset());
- assertEquals(value.length(), ofsAtt.endOffset());
- ts.close();
-
- // Test empty one:
- ts = type.getAnalyzer().tokenStream("dummy", "");
- ts.reset();
- assertFalse(ts.incrementToken());
- ts.end();
- assertEquals(0, ofsAtt.startOffset());
- assertEquals(0, ofsAtt.endOffset());
- ts.close();
- }
@Test
public void testTrieIntRangeSearch() throws Exception {
Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/rest/schema/TestFieldTypeResource.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/rest/schema/TestFieldTypeResource.java?rev=1529482&r1=1529481&r2=1529482&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/rest/schema/TestFieldTypeResource.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/rest/schema/TestFieldTypeResource.java Sat Oct 5 16:41:28 2013
@@ -39,7 +39,7 @@ public class TestFieldTypeResource exten
"/response/lst[@name='fieldType']/bool[@name='omitPositions'] = 'false'",
"/response/lst[@name='fieldType']/bool[@name='storeOffsetsWithPositions'] = 'false'",
"/response/lst[@name='fieldType']/bool[@name='multiValued'] = 'false'",
- "/response/lst[@name='fieldType']/bool[@name='tokenized'] = 'true'",
+ "/response/lst[@name='fieldType']/bool[@name='tokenized'] = 'false'",
"/response/lst[@name='fieldType']/arr[@name='fields']/str = 'weight'",
"/response/lst[@name='fieldType']/arr[@name='dynamicFields']/str = '*_f'");
}
@@ -69,7 +69,7 @@ public class TestFieldTypeResource exten
"/fieldType/omitPositions==false",
"/fieldType/storeOffsetsWithPositions==false",
"/fieldType/multiValued==false",
- "/fieldType/tokenized==true",
+ "/fieldType/tokenized==false",
"/fieldType/fields==['weight']",
"/fieldType/dynamicFields==['*_f']");
}