You are viewing a plain text version of this content; the canonical (HTML) link is not included in this rendering.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/10/10 15:37:37 UTC
lucene-solr:branch_6x: LUCENE-7476: JapaneseNumberFilter should not
invoke incrementToken on its input after it's exhausted
Repository: lucene-solr
Updated Branches:
refs/heads/branch_6x cdb42ec46 -> e585c84f2
LUCENE-7476: JapaneseNumberFilter should not invoke incrementToken on its input after it's exhausted
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/e585c84f
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/e585c84f
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/e585c84f
Branch: refs/heads/branch_6x
Commit: e585c84f244f4b8c8752152e9f232eba192041dc
Parents: cdb42ec
Author: Mike McCandless <mi...@apache.org>
Authored: Mon Oct 10 11:37:23 2016 -0400
Committer: Mike McCandless <mi...@apache.org>
Committed: Mon Oct 10 11:37:23 2016 -0400
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +
.../analysis/ja/JapaneseNumberFilter.java | 11 +
.../lucene/analysis/ja/TestFactories.java | 203 +++++++++++++++++++
3 files changed, 217 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e585c84f/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index b3bcc85..a5693f0 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -29,6 +29,9 @@ Bug Fixes
* LUCENE-7484: FastVectorHighlighter failed to highlight SynonymQuery
(Jim Ferenczi via Mike McCandless)
+* LUCENE-7476: JapaneseNumberFilter should not invoke incrementToken
+ on its input after it's exhausted (Andy Hind via Mike McCandless)
+
Improvements
* LUCENE-7439: FuzzyQuery now matches all terms within the specified
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e585c84f/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
index a1af95e..9f4c1d5 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
@@ -104,6 +104,8 @@ public class JapaneseNumberFilter extends TokenFilter {
private StringBuilder numeral;
private int fallThroughTokens;
+
+ private boolean exhausted = false;
static {
numerals = new char[0x10000];
@@ -149,7 +151,12 @@ public class JapaneseNumberFilter extends TokenFilter {
return true;
}
+ if (exhausted) {
+ return false;
+ }
+
if (!input.incrementToken()) {
+ exhausted = true;
return false;
}
@@ -184,6 +191,9 @@ public class JapaneseNumberFilter extends TokenFilter {
endOffset = offsetAttr.endOffset();
moreTokens = input.incrementToken();
+ if (moreTokens == false) {
+ exhausted = true;
+ }
if (posIncrAttr.getPositionIncrement() == 0) {
// This token is a stacked/synonym token, capture number of tokens "under" this token,
@@ -227,6 +237,7 @@ public class JapaneseNumberFilter extends TokenFilter {
fallThroughTokens = 0;
numeral = new StringBuilder();
state = null;
+ exhausted = false;
}
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e585c84f/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java
new file mode 100644
index 0000000..bae8ffc
--- /dev/null
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ja;
+
+
+import java.io.IOException;
+import java.io.Reader;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
+import org.apache.lucene.analysis.util.CharFilterFactory;
+import org.apache.lucene.analysis.util.MultiTermAwareComponent;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.Version;
+
+/**
+ * Sanity check some things about all factories,
+ * we do our best to see if we can sanely initialize it with
+ * no parameters and smoke test it, etc.
+ */
+// TODO: this was copied from the analysis/common module ... find a better way to share it!
+
+// TODO: fix this to use CustomAnalyzer instead of its own FactoryAnalyzer
+public class TestFactories extends BaseTokenStreamTestCase {
+ public void test() throws IOException {
+ for (String tokenizer : TokenizerFactory.availableTokenizers()) {
+ doTestTokenizer(tokenizer);
+ }
+
+ for (String tokenFilter : TokenFilterFactory.availableTokenFilters()) {
+ doTestTokenFilter(tokenFilter);
+ }
+
+ for (String charFilter : CharFilterFactory.availableCharFilters()) {
+ doTestCharFilter(charFilter);
+ }
+ }
+
+ private void doTestTokenizer(String tokenizer) throws IOException {
+ Class<? extends TokenizerFactory> factoryClazz = TokenizerFactory.lookupClass(tokenizer);
+ TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz);
+ if (factory != null) {
+ // we managed to fully create an instance. check a few more things:
+
+ // if it implements MultiTermAware, sanity check its impl
+ if (factory instanceof MultiTermAwareComponent) {
+ AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
+ assertNotNull(mtc);
+ // it's not ok to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
+ assertFalse(mtc instanceof CharFilterFactory);
+ }
+
+ // beast it just a little, it shouldnt throw exceptions:
+ // (it should have thrown them in initialize)
+ Analyzer a = new FactoryAnalyzer(factory, null, null);
+ checkRandomData(random(), a, 20, 20, false, false);
+ a.close();
+ }
+ }
+
+ private void doTestTokenFilter(String tokenfilter) throws IOException {
+ Class<? extends TokenFilterFactory> factoryClazz = TokenFilterFactory.lookupClass(tokenfilter);
+ TokenFilterFactory factory = (TokenFilterFactory) initialize(factoryClazz);
+ if (factory != null) {
+ // we managed to fully create an instance. check a few more things:
+
+ // if it implements MultiTermAware, sanity check its impl
+ if (factory instanceof MultiTermAwareComponent) {
+ AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
+ assertNotNull(mtc);
+ // it's not ok to return a charfilter or tokenizer here, this makes no sense
+ assertTrue(mtc instanceof TokenFilterFactory);
+ }
+
+ // beast it just a little, it shouldnt throw exceptions:
+ // (it should have thrown them in initialize)
+ Analyzer a = new FactoryAnalyzer(assertingTokenizer, factory, null);
+ checkRandomData(random(), a, 20, 20, false, false);
+ a.close();
+ }
+ }
+
+ private void doTestCharFilter(String charfilter) throws IOException {
+ Class<? extends CharFilterFactory> factoryClazz = CharFilterFactory.lookupClass(charfilter);
+ CharFilterFactory factory = (CharFilterFactory) initialize(factoryClazz);
+ if (factory != null) {
+ // we managed to fully create an instance. check a few more things:
+
+ // if it implements MultiTermAware, sanity check its impl
+ if (factory instanceof MultiTermAwareComponent) {
+ AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
+ assertNotNull(mtc);
+ // it's not ok to return a tokenizer or tokenfilter here, this makes no sense
+ assertTrue(mtc instanceof CharFilterFactory);
+ }
+
+ // beast it just a little, it shouldnt throw exceptions:
+ // (it should have thrown them in initialize)
+ Analyzer a = new FactoryAnalyzer(assertingTokenizer, null, factory);
+ checkRandomData(random(), a, 20, 20, false, false);
+ a.close();
+ }
+ }
+
+ /** tries to initialize a factory with no arguments */
+ private AbstractAnalysisFactory initialize(Class<? extends AbstractAnalysisFactory> factoryClazz) throws IOException {
+ Map<String,String> args = new HashMap<>();
+ args.put("luceneMatchVersion", Version.LATEST.toString());
+ Constructor<? extends AbstractAnalysisFactory> ctor;
+ try {
+ ctor = factoryClazz.getConstructor(Map.class);
+ } catch (Exception e) {
+ throw new RuntimeException("factory '" + factoryClazz + "' does not have a proper ctor!");
+ }
+
+ AbstractAnalysisFactory factory = null;
+ try {
+ factory = ctor.newInstance(args);
+ } catch (InstantiationException | IllegalAccessException e) {
+ throw new RuntimeException(e);
+ } catch (InvocationTargetException e) {
+ if (e.getCause() instanceof IllegalArgumentException) {
+ // it's ok if we dont provide the right parameters to throw this
+ return null;
+ }
+ }
+
+ if (factory instanceof ResourceLoaderAware) {
+ try {
+ ((ResourceLoaderAware) factory).inform(new StringMockResourceLoader(""));
+ } catch (IOException ignored) {
+ // it's ok if the right files arent available or whatever to throw this
+ } catch (IllegalArgumentException ignored) {
+ // is this ok? I guess so
+ }
+ }
+ return factory;
+ }
+
+ // some silly classes just so we can use checkRandomData
+ private TokenizerFactory assertingTokenizer = new TokenizerFactory(new HashMap<String,String>()) {
+ @Override
+ public MockTokenizer create(AttributeFactory factory) {
+ return new MockTokenizer(factory);
+ }
+ };
+
+ private static class FactoryAnalyzer extends Analyzer {
+ final TokenizerFactory tokenizer;
+ final CharFilterFactory charFilter;
+ final TokenFilterFactory tokenfilter;
+
+ FactoryAnalyzer(TokenizerFactory tokenizer, TokenFilterFactory tokenfilter, CharFilterFactory charFilter) {
+ assert tokenizer != null;
+ this.tokenizer = tokenizer;
+ this.charFilter = charFilter;
+ this.tokenfilter = tokenfilter;
+ }
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tf = tokenizer.create(newAttributeFactory());
+ if (tokenfilter != null) {
+ return new TokenStreamComponents(tf, tokenfilter.create(tf));
+ } else {
+ return new TokenStreamComponents(tf);
+ }
+ }
+
+ @Override
+ protected Reader initReader(String fieldName, Reader reader) {
+ if (charFilter != null) {
+ return charFilter.create(reader);
+ } else {
+ return reader;
+ }
+ }
+ }
+}