You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/03/18 19:12:17 UTC
svn commit: r1578993 - in /lucene/dev/trunk: lucene/
lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/
lucene/analysis/common/src/test/org/apache/lucene/analysis/core/
lucene/analysis/common/src/test/org/apache/lucene/analysis/m...
Author: rmuir
Date: Tue Mar 18 18:12:16 2014
New Revision: 1578993
URL: http://svn.apache.org/r1578993
Log:
LUCENE-5111: Fix WordDelimiterFilter offsets
Added:
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/Lucene47WordDelimiterFilter.java (contents, props changed)
- copied, changed from r1578876, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java (contents, props changed)
- copied, changed from r1578886, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java
lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1578993&r1=1578992&r2=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Mar 18 18:12:16 2014
@@ -215,6 +215,8 @@ Bug fixes
indexed shapes within 1/2 maxDistErr from the edge of the query shape. This meant
searching for a point by the same point as a query rarely worked. (David Smiley)
+* LUCENE-5111: Fix WordDelimiterFilter to return offsets in correct order. (Robert Muir)
+
Test Framework
* LUCENE-5449: Rename _TestUtil and _TestHelper to remove the leading _.
Copied: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/Lucene47WordDelimiterFilter.java (from r1578876, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/Lucene47WordDelimiterFilter.java?p2=lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/Lucene47WordDelimiterFilter.java&p1=lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java&r1=1578876&r2=1578993&rev=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/Lucene47WordDelimiterFilter.java Tue Mar 18 18:12:16 2014
@@ -32,55 +32,10 @@ import org.apache.lucene.util.RamUsageEs
import java.io.IOException;
/**
- * Splits words into subwords and performs optional transformations on subword
- * groups. Words are split into subwords with the following rules:
- * <ul>
- * <li>split on intra-word delimiters (by default, all non alpha-numeric
- * characters): <code>"Wi-Fi"</code> → <code>"Wi", "Fi"</code></li>
- * <li>split on case transitions: <code>"PowerShot"</code> →
- * <code>"Power", "Shot"</code></li>
- * <li>split on letter-number transitions: <code>"SD500"</code> →
- * <code>"SD", "500"</code></li>
- * <li>leading and trailing intra-word delimiters on each subword are ignored:
- * <code>"//hello---there, 'dude'"</code> →
- * <code>"hello", "there", "dude"</code></li>
- * <li>trailing "'s" are removed for each subword: <code>"O'Neil's"</code>
- * → <code>"O", "Neil"</code>
- * <ul>
- * <li>Note: this step isn't performed in a separate filter because of possible
- * subword combinations.</li>
- * </ul>
- * </li>
- * </ul>
- *
- * The <b>combinations</b> parameter affects how subwords are combined:
- * <ul>
- * <li>combinations="0" causes no subword combinations: <code>"PowerShot"</code>
- * → <code>0:"Power", 1:"Shot"</code> (0 and 1 are the token positions)</li>
- * <li>combinations="1" means that in addition to the subwords, maximum runs of
- * non-numeric subwords are catenated and produced at the same position of the
- * last subword in the run:
- * <ul>
- * <li><code>"PowerShot"</code> →
- * <code>0:"Power", 1:"Shot" 1:"PowerShot"</code></li>
- * <li><code>"A's+B's&C's"</code> → <code>0:"A", 1:"B", 2:"C", 2:"ABC"</code>
- * </li>
- * <li><code>"Super-Duper-XL500-42-AutoCoder!"</code> →
- * <code>0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"</code>
- * </li>
- * </ul>
- * </li>
- * </ul>
- * One use for {@link WordDelimiterFilter} is to help match words with different
- * subword delimiters. For example, if the source text contained "wi-fi" one may
- * want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match. One way of doing so
- * is to specify combinations="1" in the analyzer used for indexing, and
- * combinations="0" (the default) in the analyzer used for querying. Given that
- * the current {@link StandardTokenizer} immediately removes many intra-word
- * delimiters, it is recommended that this filter be used after a tokenizer that
- * does not do this (such as {@link WhitespaceTokenizer}).
+ * Old Broken version of {@link WordDelimiterFilter}
*/
-public final class WordDelimiterFilter extends TokenFilter {
+@Deprecated
+public final class Lucene47WordDelimiterFilter extends TokenFilter {
public static final int LOWER = 0x01;
public static final int UPPER = 0x02;
@@ -202,7 +157,7 @@ public final class WordDelimiterFilter e
* @param configurationFlags Flags configuring the filter
* @param protWords If not null is the set of tokens to protect from being delimited
*/
- public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
+ public Lucene47WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
super(in);
this.flags = configurationFlags;
this.protWords = protWords;
@@ -218,7 +173,7 @@ public final class WordDelimiterFilter e
* @param configurationFlags Flags configuring the filter
* @param protWords If not null is the set of tokens to protect from being delimited
*/
- public WordDelimiterFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
+ public Lucene47WordDelimiterFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
}
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java?rev=1578993&r1=1578992&r2=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java Tue Mar 18 18:12:16 2014
@@ -27,9 +27,13 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.Version;
import java.io.IOException;
+import java.util.Arrays;
/**
* Splits words into subwords and performs optional transformations on subword
@@ -202,8 +206,11 @@ public final class WordDelimiterFilter e
* @param configurationFlags Flags configuring the filter
* @param protWords If not null is the set of tokens to protect from being delimited
*/
- public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
+ public WordDelimiterFilter(Version matchVersion, TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
super(in);
+ if (!matchVersion.onOrAfter(Version.LUCENE_48)) {
+ throw new IllegalArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
+ }
this.flags = configurationFlags;
this.protWords = protWords;
this.iterator = new WordDelimiterIterator(
@@ -218,8 +225,8 @@ public final class WordDelimiterFilter e
* @param configurationFlags Flags configuring the filter
* @param protWords If not null is the set of tokens to protect from being delimited
*/
- public WordDelimiterFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
- this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
+ public WordDelimiterFilter(Version matchVersion, TokenStream in, int configurationFlags, CharArraySet protWords) {
+ this(matchVersion, in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
}
@Override
@@ -244,6 +251,7 @@ public final class WordDelimiterFilter e
(protWords != null && protWords.contains(termBuffer, 0, termLength))) {
posIncAttribute.setPositionIncrement(accumPosInc);
accumPosInc = 0;
+ first = false;
return true;
}
@@ -265,6 +273,7 @@ public final class WordDelimiterFilter e
if (has(PRESERVE_ORIGINAL)) {
posIncAttribute.setPositionIncrement(accumPosInc);
accumPosInc = 0;
+ first = false;
return true;
}
}
@@ -273,7 +282,8 @@ public final class WordDelimiterFilter e
if (iterator.end == WordDelimiterIterator.DONE) {
if (!concat.isEmpty()) {
if (flushConcatenation(concat)) {
- return true;
+ buffer();
+ continue;
}
}
@@ -281,12 +291,28 @@ public final class WordDelimiterFilter e
// only if we haven't output this same combo above!
if (concatAll.subwordCount > lastConcatCount) {
concatAll.writeAndClear();
- return true;
+ buffer();
+ continue;
}
concatAll.clear();
}
+ if (bufferedPos < bufferedLen) {
+ if (bufferedPos == 0) {
+ sorter.sort(0, bufferedLen);
+ }
+ clearAttributes();
+ restoreState(buffered[bufferedPos++]);
+ if (first && posIncAttribute.getPositionIncrement() == 0) {
+ // can easily happen with strange combinations (e.g. not outputting numbers, but concat-all)
+ posIncAttribute.setPositionIncrement(1);
+ }
+ first = false;
+ return true;
+ }
+
// no saved concatenations, on to the next input word
+ bufferedPos = bufferedLen = 0;
hasSavedState = false;
continue;
}
@@ -295,6 +321,7 @@ public final class WordDelimiterFilter e
if (iterator.isSingleWord()) {
generatePart(true);
iterator.next();
+ first = false;
return true;
}
@@ -304,7 +331,8 @@ public final class WordDelimiterFilter e
if (!concat.isEmpty() && (concat.type & wordType) == 0) {
if (flushConcatenation(concat)) {
hasOutputToken = false;
- return true;
+ buffer();
+ continue;
}
hasOutputToken = false;
}
@@ -325,28 +353,74 @@ public final class WordDelimiterFilter e
// if we should output the word or number part
if (shouldGenerateParts(wordType)) {
generatePart(false);
- iterator.next();
- return true;
+ buffer();
}
iterator.next();
}
}
- /**
- * {@inheritDoc}
- */
@Override
public void reset() throws IOException {
super.reset();
hasSavedState = false;
concat.clear();
concatAll.clear();
- accumPosInc = 0;
+ accumPosInc = bufferedPos = bufferedLen = 0;
+ first = true;
}
// ================================================= Helper Methods ================================================
+
+ private AttributeSource.State buffered[] = new AttributeSource.State[8];
+ private int startOff[] = new int[8];
+ private int posInc[] = new int[8];
+ private int bufferedLen = 0;
+ private int bufferedPos = 0;
+ private boolean first;
+
+ private class OffsetSorter extends InPlaceMergeSorter {
+ @Override
+ protected int compare(int i, int j) {
+ int cmp = Integer.compare(startOff[i], startOff[j]);
+ if (cmp == 0) {
+ cmp = Integer.compare(posInc[j], posInc[i]);
+ }
+ return cmp;
+ }
+
+ @Override
+ protected void swap(int i, int j) {
+ AttributeSource.State tmp = buffered[i];
+ buffered[i] = buffered[j];
+ buffered[j] = tmp;
+
+ int tmp2 = startOff[i];
+ startOff[i] = startOff[j];
+ startOff[j] = tmp2;
+
+ tmp2 = posInc[i];
+ posInc[i] = posInc[j];
+ posInc[j] = tmp2;
+ }
+ }
+
+ final OffsetSorter sorter = new OffsetSorter();
+
+ private void buffer() {
+ if (bufferedLen == buffered.length) {
+ int newSize = ArrayUtil.oversize(bufferedLen+1, 8);
+ buffered = Arrays.copyOf(buffered, newSize);
+ startOff = Arrays.copyOf(startOff, newSize);
+ posInc = Arrays.copyOf(posInc, newSize);
+ }
+ startOff[bufferedLen] = offsetAttribute.startOffset();
+ posInc[bufferedLen] = posIncAttribute.getPositionIncrement();
+ buffered[bufferedLen] = captureState();
+ bufferedLen++;
+ }
+
/**
* Saves the existing attribute states
*/
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java?rev=1578993&r1=1578992&r2=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java Tue Mar 18 18:12:16 2014
@@ -17,11 +17,13 @@ package org.apache.lucene.analysis.misce
* limitations under the License.
*/
+import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.util.Version;
import java.util.ArrayList;
import java.util.List;
@@ -61,6 +63,7 @@ public class WordDelimiterFilterFactory
/** Creates a new WordDelimiterFilterFactory */
public WordDelimiterFilterFactory(Map<String, String> args) {
super(args);
+ assureMatchVersion();
int flags = 0;
if (getInt(args, "generateWordParts", 1) != 0) {
flags |= GENERATE_WORD_PARTS;
@@ -114,9 +117,14 @@ public class WordDelimiterFilterFactory
}
@Override
- public WordDelimiterFilter create(TokenStream input) {
- return new WordDelimiterFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
+ public TokenFilter create(TokenStream input) {
+ if (luceneMatchVersion.onOrAfter(Version.LUCENE_48)) {
+ return new WordDelimiterFilter(luceneMatchVersion, input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
flags, protectedWords);
+ } else {
+ return new Lucene47WordDelimiterFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
+ flags, protectedWords);
+ }
}
// source => type
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java?rev=1578993&r1=1578992&r2=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java Tue Mar 18 18:12:16 2014
@@ -144,9 +144,7 @@ public class TestRandomChains extends Ba
CachingTokenFilter.class,
// Not broken: we forcefully add this, so we shouldn't
// also randomly pick it:
- ValidatingTokenFilter.class,
- // broken!
- WordDelimiterFilter.class)) {
+ ValidatingTokenFilter.class)) {
for (Constructor<?> ctor : c.getConstructors()) {
brokenConstructors.put(ctor, ALWAYS);
}
@@ -175,7 +173,9 @@ public class TestRandomChains extends Ba
// TODO: LUCENE-4983
CommonGramsFilter.class,
// TODO: doesn't handle graph inputs
- CommonGramsQueryFilter.class)) {
+ CommonGramsQueryFilter.class,
+ // TODO: probably doesnt handle graph inputs, too afraid to try
+ WordDelimiterFilter.class)) {
for (Constructor<?> ctor : c.getConstructors()) {
brokenOffsetsConstructors.put(ctor, ALWAYS);
}
@@ -893,7 +893,26 @@ public class TestRandomChains extends Ba
}
public void testRandomChains() throws Throwable {
- int numIterations = atLeast(10);
+ int numIterations = atLeast(20);
+ Random random = random();
+ for (int i = 0; i < numIterations; i++) {
+ MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong());
+ if (VERBOSE) {
+ System.out.println("Creating random analyzer:" + a);
+ }
+ try {
+ checkRandomData(random, a, 500*RANDOM_MULTIPLIER, 20, false,
+ false /* We already validate our own offsets... */);
+ } catch (Throwable e) {
+ System.err.println("Exception from random analyzer: " + a);
+ throw e;
+ }
+ }
+ }
+
+ // we might regret this decision...
+ public void testRandomChainsWithLargeStrings() throws Throwable {
+ int numIterations = atLeast(20);
Random random = random();
for (int i = 0; i < numIterations; i++) {
MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong());
@@ -901,7 +920,7 @@ public class TestRandomChains extends Ba
System.out.println("Creating random analyzer:" + a);
}
try {
- checkRandomData(random, a, 200, 20, false,
+ checkRandomData(random, a, 50*RANDOM_MULTIPLIER, 256, false,
false /* We already validate our own offsets... */);
} catch (Throwable e) {
System.err.println("Exception from random analyzer: " + a);
Copied: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java (from r1578886, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java?p2=lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java&p1=lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java&r1=1578886&r2=1578993&rev=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java Tue Mar 18 18:12:16 2014
@@ -37,7 +37,8 @@ import static org.apache.lucene.analysis
* TODO: should explicitly test things like protWords and not rely on
* the factory tests in Solr.
*/
-public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
+@Deprecated
+public class TestLucene47WordDelimiterFilter extends BaseTokenStreamTestCase {
/***
public void testPerformance() throws IOException {
@@ -62,7 +63,7 @@ public class TestWordDelimiterFilter ext
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
// test that subwords and catenated subwords have
// the correct offsets.
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ TokenFilter wdf = new Lucene47WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar" },
@@ -70,7 +71,7 @@ public class TestWordDelimiterFilter ext
new int[] { 8, 12, 12 },
null, null, null, null, false);
- wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ wdf = new Lucene47WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar" },
@@ -82,7 +83,7 @@ public class TestWordDelimiterFilter ext
@Test
public void testOffsetChange() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ TokenFilter wdf = new Lucene47WordDelimiterFilter(new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -93,7 +94,7 @@ public class TestWordDelimiterFilter ext
@Test
public void testOffsetChange2() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ TokenFilter wdf = new Lucene47WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -104,7 +105,7 @@ public class TestWordDelimiterFilter ext
@Test
public void testOffsetChange3() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ TokenFilter wdf = new Lucene47WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -115,7 +116,7 @@ public class TestWordDelimiterFilter ext
@Test
public void testOffsetChange4() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ TokenFilter wdf = new Lucene47WordDelimiterFilter(new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar"},
@@ -126,7 +127,7 @@ public class TestWordDelimiterFilter ext
public void doSplit(final String input, String... output) throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input),
+ TokenFilter wdf = new Lucene47WordDelimiterFilter(keywordMockTokenizer(input),
WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf, output);
@@ -170,7 +171,7 @@ public class TestWordDelimiterFilter ext
public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0;
- WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input), flags, null);
+ TokenFilter wdf = new Lucene47WordDelimiterFilter(keywordMockTokenizer(input), flags, null);
assertTokenStreamContents(wdf, output);
}
@@ -217,7 +218,7 @@ public class TestWordDelimiterFilter ext
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(
+ return new TokenStreamComponents(tokenizer, new Lucene47WordDelimiterFilter(
tokenizer,
flags, protWords));
}
@@ -254,7 +255,7 @@ public class TestWordDelimiterFilter ext
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(
+ return new TokenStreamComponents(tokenizer, new Lucene47WordDelimiterFilter(
new LargePosIncTokenFilter(tokenizer),
flags, protWords));
}
@@ -301,7 +302,7 @@ public class TestWordDelimiterFilter ext
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
tokenizer, StandardAnalyzer.STOP_WORDS_SET);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(filter, flags, protWords));
+ return new TokenStreamComponents(tokenizer, new Lucene47WordDelimiterFilter(filter, flags, protWords));
}
};
@@ -342,7 +343,7 @@ public class TestWordDelimiterFilter ext
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
+ return new TokenStreamComponents(tokenizer, new Lucene47WordDelimiterFilter(tokenizer, flags, protectedWords));
}
};
checkRandomData(random(), a, 200, 20, false, false);
@@ -364,7 +365,7 @@ public class TestWordDelimiterFilter ext
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
+ return new TokenStreamComponents(tokenizer, new Lucene47WordDelimiterFilter(tokenizer, flags, protectedWords));
}
};
// depending upon options, this thing may or may not preserve the empty term
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java?rev=1578993&r1=1578992&r2=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java Tue Mar 18 18:12:16 2014
@@ -62,27 +62,25 @@ public class TestWordDelimiterFilter ext
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
// test that subwords and catenated subwords have
// the correct offsets.
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
- new String[] { "foo", "bar", "foobar" },
- new int[] { 5, 9, 5 },
- new int[] { 8, 12, 12 },
- null, null, null, null, false);
+ new String[] { "foo", "foobar", "bar" },
+ new int[] { 5, 5, 9 },
+ new int[] { 8, 12, 12 });
- wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar" },
new int[] { 5, 5, 5 },
- new int[] { 6, 6, 6 },
- null, null, null, null, false);
+ new int[] { 6, 6, 6 });
}
@Test
public void testOffsetChange() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -93,7 +91,7 @@ public class TestWordDelimiterFilter ext
@Test
public void testOffsetChange2() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -104,7 +102,7 @@ public class TestWordDelimiterFilter ext
@Test
public void testOffsetChange3() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -115,18 +113,17 @@ public class TestWordDelimiterFilter ext
@Test
public void testOffsetChange4() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
- new String[] { "foo", "bar", "foobar"},
- new int[] { 8, 12, 8 },
- new int[] { 11, 15, 15 },
- null, null, null, null, false);
+ new String[] { "foo", "foobar", "bar"},
+ new int[] { 8, 8, 12 },
+ new int[] { 11, 15, 15 });
}
public void doSplit(final String input, String... output) throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input),
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, keywordMockTokenizer(input),
WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf, output);
@@ -170,7 +167,7 @@ public class TestWordDelimiterFilter ext
public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0;
- WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input), flags, null);
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, keywordMockTokenizer(input), flags, null);
assertTokenStreamContents(wdf, output);
}
@@ -217,7 +214,7 @@ public class TestWordDelimiterFilter ext
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT,
tokenizer,
flags, protWords));
}
@@ -227,34 +224,25 @@ public class TestWordDelimiterFilter ext
assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
new int[] { 0, 9 },
new int[] { 6, 13 },
- null,
- new int[] { 1, 1 },
- null,
- false);
+ new int[] { 1, 1 });
/* only in this case, posInc of 2 ?! */
- assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
- new int[] { 0, 9, 12, 9 },
+ assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "solR", "R" },
+ new int[] { 0, 9, 9, 12 },
new int[] { 6, 12, 13, 13 },
- null,
- new int[] { 1, 1, 1, 0 },
- null,
- false);
+ new int[] { 1, 1, 0, 1 });
assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
new int[] { 0, 9, 15 },
new int[] { 6, 14, 19 },
- null,
- new int[] { 1, 1, 1 },
- null,
- false);
+ new int[] { 1, 1, 1 });
/* analyzer that will consume tokens with large position increments */
Analyzer a2 = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT,
new LargePosIncTokenFilter(tokenizer),
flags, protWords));
}
@@ -264,36 +252,24 @@ public class TestWordDelimiterFilter ext
assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
new int[] { 0, 7, 16 },
new int[] { 6, 15, 20 },
- null,
- new int[] { 1, 10, 1 },
- null,
- false);
+ new int[] { 1, 10, 1 });
/* the "/" had a position increment of 10, where did it go?!?!! */
assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
new int[] { 0, 9 },
new int[] { 6, 13 },
- null,
- new int[] { 1, 11 },
- null,
- false);
+ new int[] { 1, 11 });
/* in this case, the increment of 10 from the "/" is carried over */
- assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
- new int[] { 0, 9, 12, 9 },
+ assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "solR", "R" },
+ new int[] { 0, 9, 9, 12 },
new int[] { 6, 12, 13, 13 },
- null,
- new int[] { 1, 11, 1, 0 },
- null,
- false);
+ new int[] { 1, 11, 0, 1 });
assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
new int[] { 0, 9, 15 },
new int[] { 6, 14, 19 },
- null,
- new int[] { 1, 11, 1 },
- null,
- false);
+ new int[] { 1, 11, 1 });
Analyzer a3 = new Analyzer() {
@Override
@@ -301,28 +277,62 @@ public class TestWordDelimiterFilter ext
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
tokenizer, StandardAnalyzer.STOP_WORDS_SET);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(filter, flags, protWords));
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, filter, flags, protWords));
}
};
assertAnalyzesTo(a3, "lucene.solr",
- new String[] { "lucene", "solr", "lucenesolr" },
- new int[] { 0, 7, 0 },
+ new String[] { "lucene", "lucenesolr", "solr" },
+ new int[] { 0, 0, 7 },
new int[] { 6, 11, 11 },
- null,
- new int[] { 1, 1, 0 },
- null,
- false);
+ new int[] { 1, 0, 1 });
/* the stopword should add a gap here */
assertAnalyzesTo(a3, "the lucene.solr",
- new String[] { "lucene", "solr", "lucenesolr" },
- new int[] { 4, 11, 4 },
+ new String[] { "lucene", "lucenesolr", "solr" },
+ new int[] { 4, 4, 11 },
new int[] { 10, 15, 15 },
- null,
- new int[] { 2, 1, 0 },
- null,
- false);
+ new int[] { 2, 0, 1 });
+ }
+
+ /** concat numbers + words + all */
+ public void testLotsOfConcatenating() throws Exception {
+ final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+
+ /* analyzer that uses whitespace + wdf */
+ Analyzer a = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String field) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, null));
+ }
+ };
+
+ assertAnalyzesTo(a, "abc-def-123-456",
+ new String[] { "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" },
+ new int[] { 0, 0, 0, 4, 8, 8, 12 },
+ new int[] { 3, 7, 15, 7, 11, 15, 15 },
+ new int[] { 1, 0, 0, 1, 1, 0, 1 });
+ }
+
+ /** concat numbers + words + all + preserve original */
+ public void testLotsOfConcatenating2() throws Exception {
+ final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+
+ /* analyzer that uses whitespace + wdf */
+ Analyzer a = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String field) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, null));
+ }
+ };
+
+ assertAnalyzesTo(a, "abc-def-123-456",
+ new String[] { "abc-def-123-456", "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" },
+ new int[] { 0, 0, 0, 0, 4, 8, 8, 12 },
+ new int[] { 15, 3, 7, 15, 7, 11, 15, 15 },
+ new int[] { 1, 0, 0, 0, 1, 1, 0, 1 });
}
/** blast some random strings through the analyzer */
@@ -342,10 +352,34 @@ public class TestWordDelimiterFilter ext
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, protectedWords));
+ }
+ };
+ checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
+ }
+ }
+
+ /** blast some enormous random strings through the analyzer */
+ public void testRandomHugeStrings() throws Exception {
+ int numIterations = atLeast(5);
+ for (int i = 0; i < numIterations; i++) {
+ final int flags = random().nextInt(512);
+ final CharArraySet protectedWords;
+ if (random().nextBoolean()) {
+ protectedWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<>(Arrays.asList("a", "b", "cd")), false);
+ } else {
+ protectedWords = null;
+ }
+
+ Analyzer a = new Analyzer() {
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, protectedWords));
}
};
- checkRandomData(random(), a, 200, 20, false, false);
+ checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192);
}
}
@@ -364,7 +398,7 @@ public class TestWordDelimiterFilter ext
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, protectedWords));
}
};
// depending upon options, this thing may or may not preserve the empty term
Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java?rev=1578993&r1=1578992&r2=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java Tue Mar 18 18:12:16 2014
@@ -200,6 +200,7 @@ public class TestWordDelimiterFilterFact
String testText = "I borrowed $5,400.00 at 25% interest-rate";
ResourceLoader loader = new SolrResourceLoader("solr/collection1");
Map<String,String> args = new HashMap<>();
+ args.put("luceneMatchVersion", TEST_VERSION_CURRENT.toString());
args.put("generateWordParts", "1");
args.put("generateNumberParts", "1");
args.put("catenateWords", "1");
@@ -213,16 +214,17 @@ public class TestWordDelimiterFilterFact
TokenStream ts = factoryDefault.create(whitespaceMockTokenizer(testText));
BaseTokenStreamTestCase.assertTokenStreamContents(ts,
- new String[] { "I", "borrowed", "5", "400", "00", "540000", "at", "25", "interest", "rate", "interestrate" });
+ new String[] { "I", "borrowed", "5", "540000", "400", "00", "at", "25", "interest", "interestrate", "rate" });
ts = factoryDefault.create(whitespaceMockTokenizer("foo\u200Dbar"));
BaseTokenStreamTestCase.assertTokenStreamContents(ts,
- new String[] { "foo", "bar", "foobar" });
+ new String[] { "foo", "foobar", "bar" });
/* custom behavior */
args = new HashMap<>();
// use a custom type mapping
+ args.put("luceneMatchVersion", TEST_VERSION_CURRENT.toString());
args.put("generateWordParts", "1");
args.put("generateNumberParts", "1");
args.put("catenateWords", "1");
@@ -235,7 +237,7 @@ public class TestWordDelimiterFilterFact
ts = factoryCustom.create(whitespaceMockTokenizer(testText));
BaseTokenStreamTestCase.assertTokenStreamContents(ts,
- new String[] { "I", "borrowed", "$5,400.00", "at", "25%", "interest", "rate", "interestrate" });
+ new String[] { "I", "borrowed", "$5,400.00", "at", "25%", "interest", "interestrate", "rate" });
/* test custom behavior with a char > 0x7F, because we had to make a larger byte[] */
ts = factoryCustom.create(whitespaceMockTokenizer("foo\u200Dbar"));
Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java?rev=1578993&r1=1578992&r2=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java Tue Mar 18 18:12:16 2014
@@ -365,8 +365,8 @@ public class FieldAnalysisRequestHandler
assertEquals(6, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("hi", null, "word", 0, 2, 1, new int[]{1,1}, null, false));
assertToken(tokenList.get(1), new TokenInfo("3456", null, "word", 4, 8, 2, new int[]{2,2}, null, false));
- assertToken(tokenList.get(2), new TokenInfo("12", null, "word", 9, 11, 3, new int[]{2,3}, null, false));
- assertToken(tokenList.get(3), new TokenInfo("345612", null, "word", 4, 11, 3, new int[]{2,3}, null, false));
+ assertToken(tokenList.get(2), new TokenInfo("345612", null, "word", 4, 11, 2, new int[]{2,2}, null, false));
+ assertToken(tokenList.get(3), new TokenInfo("12", null, "word", 9, 11, 3, new int[]{2,3}, null, false));
assertToken(tokenList.get(4), new TokenInfo("a", null, "word", 12, 13, 4, new int[]{3,4}, null, false));
assertToken(tokenList.get(5), new TokenInfo("Test", null, "word", 14, 18, 5, new int[]{4,5}, null, false));
tokenList = indexPart.get("org.apache.lucene.analysis.core.LowerCaseFilter");
@@ -374,8 +374,8 @@ public class FieldAnalysisRequestHandler
assertEquals(6, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("hi", null, "word", 0, 2, 1, new int[]{1,1,1}, null, false));
assertToken(tokenList.get(1), new TokenInfo("3456", null, "word", 4, 8, 2, new int[]{2,2,2}, null, false));
- assertToken(tokenList.get(2), new TokenInfo("12", null, "word", 9, 11, 3, new int[]{2,3,3}, null, false));
- assertToken(tokenList.get(3), new TokenInfo("345612", null, "word", 4, 11, 3, new int[]{2,3,3}, null, false));
+ assertToken(tokenList.get(2), new TokenInfo("345612", null, "word", 4, 11, 2, new int[]{2,2,2}, null, false));
+ assertToken(tokenList.get(3), new TokenInfo("12", null, "word", 9, 11, 3, new int[]{2,3,3}, null, false));
assertToken(tokenList.get(4), new TokenInfo("a", null, "word", 12, 13, 4, new int[]{3,4,4}, null, false));
assertToken(tokenList.get(5), new TokenInfo("test", null, "word", 14, 18, 5, new int[]{4,5,5}, null, false));
}