You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/11 05:53:42 UTC
svn commit: r1531186 - in /lucene/dev/trunk/lucene: ./
analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/
analysis/common/src/java/org/apache/lucene/analysis/ngram/
analysis/common/src/resources/META-INF/services/ analysis/common/src/te...
Author: rmuir
Date: Fri Oct 11 03:53:42 2013
New Revision: 1531186
URL: http://svn.apache.org/r1531186
Log:
LUCENE-5269: Fix NGramTokenFilter length filtering
Added:
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilter.java (with props)
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilterFactory.java (with props)
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilter.java (with props)
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilterFactory.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1531186&r1=1531185&r2=1531186&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Fri Oct 11 03:53:42 2013
@@ -131,6 +131,10 @@ Bug Fixes
terms were present in the query and the high-frequent operator was set
to SHOULD. (Simon Willnauer)
+* LUCENE-5269: Fix bug in NGramTokenFilter where it would sometimes count
+  Unicode characters incorrectly. Adds CodepointCountFilter.
+ (Mike McCandless, Robert Muir)
+
API Changes:
* LUCENE-5222: Add SortField.needsScores(). Previously it was not possible
Added: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilter.java?rev=1531186&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilter.java Fri Oct 11 03:53:42 2013
@@ -0,0 +1,69 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
+
+/**
+ * Token filter that keeps only terms whose length lies between a minimum
+ * and a maximum (both inclusive) and drops every other term.
+ * <p>
+ * Length is measured in Unicode code points rather than UTF-16 chars.
+ */
+public final class CodepointCountFilter extends FilteringTokenFilter {
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  private final int min;
+  private final int max;
+
+  /**
+   * Builds a {@link CodepointCountFilter} that discards every token whose
+   * {@link CharTermAttribute} has a {@link Character#codePointCount(char[], int, int)}
+   * below {@code min} (&lt; min) or above {@code max} (&gt; max).
+   * @param version the Lucene match version
+   * @param in the {@link TokenStream} to consume
+   * @param min the minimum length, in code points (inclusive)
+   * @param max the maximum length, in code points (inclusive)
+   */
+  public CodepointCountFilter(Version version, TokenStream in, int min, int max) {
+    super(version, in);
+    this.max = max;
+    this.min = min;
+  }
+
+  @Override
+  public boolean accept() {
+    final int charCount = termAtt.length();
+    // n UTF-16 chars encode at most n code points (no surrogate pairs) and at
+    // least n/2 (all pairs); rounding down merely loosens the lower bound.
+    final int fewest = charCount >> 1;
+    if (fewest > max || charCount < min) {
+      return false; // no possible count falls inside [min, max]
+    }
+    if (fewest >= min && charCount <= max) {
+      return true; // every possible count falls inside [min, max]
+    }
+    // Ambiguous: only an exact count can decide.
+    final int exact = Character.codePointCount(termAtt.buffer(), 0, charCount);
+    return min <= exact && exact <= max;
+  }
+}
Added: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilterFactory.java?rev=1531186&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilterFactory.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilterFactory.java Fri Oct 11 03:53:42 2013
@@ -0,0 +1,55 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link CodepointCountFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_lngth" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.CodepointCountFilterFactory" min="0" max="1" /&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class CodepointCountFilterFactory extends TokenFilterFactory {
+  public static final String MIN_KEY = "min";
+  public static final String MAX_KEY = "max";
+  final int min;
+  final int max;
+
+  /** Reads {@code min} and {@code max} via {@code requireInt}; any argument left over is rejected. */
+  public CodepointCountFilterFactory(Map<String, String> args) {
+    super(args);
+    this.min = requireInt(args, MIN_KEY);
+    this.max = requireInt(args, MAX_KEY);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public CodepointCountFilter create(TokenStream input) {
+    return new CodepointCountFilter(luceneMatchVersion, input, min, max);
+  }
+}
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java?rev=1531186&r1=1531185&r2=1531186&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java Fri Oct 11 03:53:42 2013
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.LengthFilter;
+import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -81,7 +81,7 @@ public final class NGramTokenFilter exte
* @param maxGram the largest n-gram to generate
*/
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
- super(new LengthFilter(version, input, minGram, Integer.MAX_VALUE));
+ super(new CodepointCountFilter(version, input, minGram, Integer.MAX_VALUE));
this.version = version;
this.charUtils = version.onOrAfter(Version.LUCENE_44)
? CharacterUtils.getInstance(version)
Modified: lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory?rev=1531186&r1=1531185&r2=1531186&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory Fri Oct 11 03:53:42 2013
@@ -55,6 +55,7 @@ org.apache.lucene.analysis.it.ItalianLig
org.apache.lucene.analysis.lv.LatvianStemFilterFactory
org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
+org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java?rev=1531186&r1=1531185&r2=1531186&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java Fri Oct 11 03:53:42 2013
@@ -1,5 +1,6 @@
package org.apache.lucene.analysis.core;
+import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.CharBuffer;
@@ -11,10 +12,14 @@ import org.apache.lucene.analysis.MockCh
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
+import org.apache.lucene.analysis.ngram.NGramTokenFilter;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.util.CharArraySet;
/*
@@ -195,4 +200,58 @@ public class TestBugInSomething extends
assertEquals("read(char[], int, int)", e.getMessage());
}
}
+
+ // todo: test framework?
+
+  /** Debug aid: prints each token (via reflectAsString) and each end/close/reset call, tagged with the wrapped stream's class name. */
+  static final class SopTokenFilter extends TokenFilter {
+
+    SopTokenFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (!input.incrementToken()) {
+        return false;
+      }
+      System.out.println(input.getClass().getSimpleName() + "->" + this.reflectAsString(false));
+      return true;
+    }
+
+    @Override
+    public void end() throws IOException {
+      super.end();
+      System.out.println(input.getClass().getSimpleName() + ".end()");
+    }
+
+    @Override
+    public void close() throws IOException {
+      super.close();
+      System.out.println(input.getClass().getSimpleName() + ".close()");
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      System.out.println(input.getClass().getSimpleName() + ".reset()");
+    }
+  }
+
+  // LUCENE-5269: regression test for the chain
+  // EdgeNGramTokenizer -> ShingleFilter -> NGramTokenFilter.
+  public void testUnicodeShinglesAndNgrams() throws Exception {
+    final Analyzer chain = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        // To trace what flows through a stage, wrap it in SopTokenFilter.
+        final Tokenizer edgeGrams = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
+        final TokenStream shingles = new ShingleFilter(edgeGrams, 54);
+        final TokenStream grams = new NGramTokenFilter(TEST_VERSION_CURRENT, shingles, 55, 83);
+        return new TokenStreamComponents(edgeGrams, grams);
+      }
+    };
+    // Third argument is passed through to checkRandomData unchanged from the original test.
+    checkRandomData(random(), chain, 10);
+  }
}
Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilter.java?rev=1531186&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilter.java Fri Oct 11 03:53:42 2013
@@ -0,0 +1,69 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.util._TestUtil;
+
+public class TestCodepointCountFilter extends BaseTokenStreamTestCase {
+  public void testFilterWithPosIncr() throws Exception {
+    // Only "short", "ab" and "foo" fit [2, 6]; increments 1, 4, 2 reflect the dropped tokens.
+    final StringReader input = new StringReader("short toolong evenmuchlongertext a ab toolong foo");
+    final TokenStream words = new MockTokenizer(input, MockTokenizer.WHITESPACE, false);
+    assertTokenStreamContents(new CodepointCountFilter(TEST_VERSION_CURRENT, words, 2, 6),
+        new String[]{"short", "ab", "foo"}, new int[]{1, 4, 2});
+  }
+
+  public void testEmptyTerm() throws IOException {
+    final Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        final Tokenizer source = new KeywordTokenizer(reader);
+        final TokenStream sink = new CodepointCountFilter(TEST_VERSION_CURRENT, source, 0, 5);
+        return new TokenStreamComponents(source, sink);
+      }
+    };
+    checkOneTerm(analyzer, "", "");
+  }
+
+  public void testRandomStrings() throws IOException {
+    for (int iter = 0; iter < 10000; iter++) {
+      final String text = _TestUtil.randomUnicodeString(random(), 100);
+      final int min = _TestUtil.nextInt(random(), 0, 100);
+      final int max = _TestUtil.nextInt(random(), 0, 100);
+      final int codePoints = text.codePointCount(0, text.length());
+      final TokenStream filtered = new CodepointCountFilter(
+          TEST_VERSION_CURRENT, new KeywordTokenizer(new StringReader(text)), min, max);
+      filtered.reset();
+      // The single keyword token survives exactly when its code point count is in [min, max].
+      assertEquals(min <= codePoints && codePoints <= max, filtered.incrementToken());
+      filtered.end();
+      filtered.close();
+    }
+  }
+}
Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilterFactory.java?rev=1531186&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilterFactory.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilterFactory.java Fri Oct 11 03:53:42 2013
@@ -0,0 +1,50 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+public class TestCodepointCountFilterFactory extends BaseTokenStreamFactoryTestCase {
+
+  /** Only "foobar" fits [4, 10]; its increment of 2 reflects the dropped "foo". */
+  public void testPositionIncrements() throws Exception {
+    final Reader text = new StringReader("foo foobar super-duper-trooper");
+    final TokenStream words = new MockTokenizer(text, MockTokenizer.WHITESPACE, false);
+    final TokenStream filtered =
+        tokenFilterFactory("CodepointCount", "min", "4", "max", "10").create(words);
+    assertTokenStreamContents(filtered, new String[] { "foobar" }, new int[] { 2 });
+  }
+
+  /** An argument the factory does not recognize must be rejected with an exception. */
+  public void testBogusArguments() throws Exception {
+    try {
+      // "min" and "max" are legitimate; "bogusArg" is the one that must trip the check.
+      tokenFilterFactory("CodepointCount",
+          "min", "4", "max", "5", "bogusArg", "bogusValue");
+      fail();
+    } catch (IllegalArgumentException expected) {
+      final String message = expected.getMessage();
+      assertTrue(message.contains("Unknown parameters"));
+    }
+  }
+}
\ No newline at end of file
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?rev=1531186&r1=1531185&r2=1531186&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java Fri Oct 11 03:53:42 2013
@@ -169,15 +169,20 @@ public class EdgeNGramTokenFilterTest ex
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
- Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer,
- new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 4));
- }
- };
- checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
+ for (int i = 0; i < 10; i++) {
+ final int min = _TestUtil.nextInt(random(), 2, 10);
+ final int max = _TestUtil.nextInt(random(), min, 20);
+
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer,
+ new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max));
+ }
+ };
+ checkRandomData(random(), a, 100*RANDOM_MULTIPLIER);
+ }
}
public void testEmptyTerm() throws Exception {
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java?rev=1531186&r1=1531185&r2=1531186&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java Fri Oct 11 03:53:42 2013
@@ -96,15 +96,20 @@ public class EdgeNGramTokenizerTest exte
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
- Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 4);
- return new TokenStreamComponents(tokenizer, tokenizer);
- }
- };
- checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
- checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192, false, false);
+ for (int i = 0; i < 10; i++) {
+ final int min = _TestUtil.nextInt(random(), 2, 10);
+ final int max = _TestUtil.nextInt(random(), min, 20);
+
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, min, max);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+ checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 20);
+ checkRandomData(random(), a, 10*RANDOM_MULTIPLIER, 8192);
+ }
}
public void testTokenizerPositions() throws Exception {
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java?rev=1531186&r1=1531185&r2=1531186&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java Fri Oct 11 03:53:42 2013
@@ -144,15 +144,19 @@ public class NGramTokenFilterTest extend
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
- Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer,
- new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 4));
- }
- };
- checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
+ for (int i = 0; i < 10; i++) {
+ final int min = _TestUtil.nextInt(random(), 2, 10);
+ final int max = _TestUtil.nextInt(random(), min, 20);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer,
+ new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max));
+ }
+ };
+ checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20);
+ }
}
public void testEmptyTerm() throws Exception {
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java?rev=1531186&r1=1531185&r2=1531186&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java Fri Oct 11 03:53:42 2013
@@ -107,15 +107,19 @@ public class NGramTokenizerTest extends
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
- Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 4);
- return new TokenStreamComponents(tokenizer, tokenizer);
- }
- };
- checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
- checkRandomData(random(), a, 50*RANDOM_MULTIPLIER, 1027, false, false);
+ for (int i = 0; i < 10; i++) {
+ final int min = _TestUtil.nextInt(random(), 2, 10);
+ final int max = _TestUtil.nextInt(random(), min, 20);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, reader, min, max);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+ checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20);
+ checkRandomData(random(), a, 10*RANDOM_MULTIPLIER, 1027);
+ }
}
private static void testNGrams(int minGram, int maxGram, int length, final String nonTokenChars) throws IOException {