You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/03/19 00:41:11 UTC
svn commit: r1579089 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/analysis/
lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/
lucene/analysis/common/src/test/org/apache/lucene/analysis/core/
lucene/analysis/common/src/...
Author: rmuir
Date: Tue Mar 18 23:41:11 2014
New Revision: 1579089
URL: http://svn.apache.org/r1579089
Log:
LUCENE-5111: Fix WordDelimiterFilter offsets
Added:
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/Lucene47WordDelimiterFilter.java (props changed)
- copied unchanged from r1578993, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/Lucene47WordDelimiterFilter.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java (contents, props changed)
- copied, changed from r1578993, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt
lucene/dev/branches/branch_4x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
lucene/dev/branches/branch_4x/solr/ (props changed)
lucene/dev/branches/branch_4x/solr/core/ (props changed)
lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java
lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1579089&r1=1579088&r2=1579089&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Tue Mar 18 23:41:11 2014
@@ -162,6 +162,8 @@ Bug fixes
indexed shapes within 1/2 maxDistErr from the edge of the query shape. This meant
searching for a point by the same point as a query rarely worked. (David Smiley)
+* LUCENE-5111: Fix WordDelimiterFilter to return offsets in correct order. (Robert Muir)
+
Test Framework
* LUCENE-5449: Rename _TestUtil and _TestHelper to remove the leading _.
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java?rev=1579089&r1=1579088&r2=1579089&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java Tue Mar 18 23:41:11 2014
@@ -27,9 +27,13 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.Version;
import java.io.IOException;
+import java.util.Arrays;
/**
* Splits words into subwords and performs optional transformations on subword
@@ -202,8 +206,11 @@ public final class WordDelimiterFilter e
* @param configurationFlags Flags configuring the filter
* @param protWords If not null is the set of tokens to protect from being delimited
*/
- public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
+ public WordDelimiterFilter(Version matchVersion, TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
super(in);
+ if (!matchVersion.onOrAfter(Version.LUCENE_48)) {
+ throw new IllegalArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
+ }
this.flags = configurationFlags;
this.protWords = protWords;
this.iterator = new WordDelimiterIterator(
@@ -218,8 +225,8 @@ public final class WordDelimiterFilter e
* @param configurationFlags Flags configuring the filter
* @param protWords If not null is the set of tokens to protect from being delimited
*/
- public WordDelimiterFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
- this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
+ public WordDelimiterFilter(Version matchVersion, TokenStream in, int configurationFlags, CharArraySet protWords) {
+ this(matchVersion, in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
}
@Override
@@ -244,6 +251,7 @@ public final class WordDelimiterFilter e
(protWords != null && protWords.contains(termBuffer, 0, termLength))) {
posIncAttribute.setPositionIncrement(accumPosInc);
accumPosInc = 0;
+ first = false;
return true;
}
@@ -265,6 +273,7 @@ public final class WordDelimiterFilter e
if (has(PRESERVE_ORIGINAL)) {
posIncAttribute.setPositionIncrement(accumPosInc);
accumPosInc = 0;
+ first = false;
return true;
}
}
@@ -273,7 +282,8 @@ public final class WordDelimiterFilter e
if (iterator.end == WordDelimiterIterator.DONE) {
if (!concat.isEmpty()) {
if (flushConcatenation(concat)) {
- return true;
+ buffer();
+ continue;
}
}
@@ -281,12 +291,28 @@ public final class WordDelimiterFilter e
// only if we haven't output this same combo above!
if (concatAll.subwordCount > lastConcatCount) {
concatAll.writeAndClear();
- return true;
+ buffer();
+ continue;
}
concatAll.clear();
}
+ if (bufferedPos < bufferedLen) {
+ if (bufferedPos == 0) {
+ sorter.sort(0, bufferedLen);
+ }
+ clearAttributes();
+ restoreState(buffered[bufferedPos++]);
+ if (first && posIncAttribute.getPositionIncrement() == 0) {
+ // can easily happen with strange combinations (e.g. not outputting numbers, but concat-all)
+ posIncAttribute.setPositionIncrement(1);
+ }
+ first = false;
+ return true;
+ }
+
// no saved concatenations, on to the next input word
+ bufferedPos = bufferedLen = 0;
hasSavedState = false;
continue;
}
@@ -295,6 +321,7 @@ public final class WordDelimiterFilter e
if (iterator.isSingleWord()) {
generatePart(true);
iterator.next();
+ first = false;
return true;
}
@@ -304,7 +331,8 @@ public final class WordDelimiterFilter e
if (!concat.isEmpty() && (concat.type & wordType) == 0) {
if (flushConcatenation(concat)) {
hasOutputToken = false;
- return true;
+ buffer();
+ continue;
}
hasOutputToken = false;
}
@@ -325,28 +353,74 @@ public final class WordDelimiterFilter e
// if we should output the word or number part
if (shouldGenerateParts(wordType)) {
generatePart(false);
- iterator.next();
- return true;
+ buffer();
}
iterator.next();
}
}
- /**
- * {@inheritDoc}
- */
@Override
public void reset() throws IOException {
super.reset();
hasSavedState = false;
concat.clear();
concatAll.clear();
- accumPosInc = 0;
+ accumPosInc = bufferedPos = bufferedLen = 0;
+ first = true;
}
// ================================================= Helper Methods ================================================
+
+ private AttributeSource.State buffered[] = new AttributeSource.State[8];
+ private int startOff[] = new int[8];
+ private int posInc[] = new int[8];
+ private int bufferedLen = 0;
+ private int bufferedPos = 0;
+ private boolean first;
+
+ private class OffsetSorter extends InPlaceMergeSorter {
+ @Override
+ protected int compare(int i, int j) {
+ int cmp = Integer.compare(startOff[i], startOff[j]);
+ if (cmp == 0) {
+ cmp = Integer.compare(posInc[j], posInc[i]);
+ }
+ return cmp;
+ }
+
+ @Override
+ protected void swap(int i, int j) {
+ AttributeSource.State tmp = buffered[i];
+ buffered[i] = buffered[j];
+ buffered[j] = tmp;
+
+ int tmp2 = startOff[i];
+ startOff[i] = startOff[j];
+ startOff[j] = tmp2;
+
+ tmp2 = posInc[i];
+ posInc[i] = posInc[j];
+ posInc[j] = tmp2;
+ }
+ }
+
+ final OffsetSorter sorter = new OffsetSorter();
+
+ private void buffer() {
+ if (bufferedLen == buffered.length) {
+ int newSize = ArrayUtil.oversize(bufferedLen+1, 8);
+ buffered = Arrays.copyOf(buffered, newSize);
+ startOff = Arrays.copyOf(startOff, newSize);
+ posInc = Arrays.copyOf(posInc, newSize);
+ }
+ startOff[bufferedLen] = offsetAttribute.startOffset();
+ posInc[bufferedLen] = posIncAttribute.getPositionIncrement();
+ buffered[bufferedLen] = captureState();
+ bufferedLen++;
+ }
+
/**
* Saves the existing attribute states
*/
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java?rev=1579089&r1=1579088&r2=1579089&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java Tue Mar 18 23:41:11 2014
@@ -17,11 +17,13 @@ package org.apache.lucene.analysis.misce
* limitations under the License.
*/
+import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.util.Version;
import java.util.ArrayList;
import java.util.List;
@@ -61,6 +63,7 @@ public class WordDelimiterFilterFactory
/** Creates a new WordDelimiterFilterFactory */
public WordDelimiterFilterFactory(Map<String, String> args) {
super(args);
+ assureMatchVersion();
int flags = 0;
if (getInt(args, "generateWordParts", 1) != 0) {
flags |= GENERATE_WORD_PARTS;
@@ -114,9 +117,14 @@ public class WordDelimiterFilterFactory
}
@Override
- public WordDelimiterFilter create(TokenStream input) {
- return new WordDelimiterFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
+ public TokenFilter create(TokenStream input) {
+ if (luceneMatchVersion.onOrAfter(Version.LUCENE_48)) {
+ return new WordDelimiterFilter(luceneMatchVersion, input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
flags, protectedWords);
+ } else {
+ return new Lucene47WordDelimiterFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
+ flags, protectedWords);
+ }
}
// source => type
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java?rev=1579089&r1=1579088&r2=1579089&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java Tue Mar 18 23:41:11 2014
@@ -146,9 +146,7 @@ public class TestRandomChains extends Ba
CachingTokenFilter.class,
// Not broken: we forcefully add this, so we shouldn't
// also randomly pick it:
- ValidatingTokenFilter.class,
- // broken!
- WordDelimiterFilter.class)) {
+ ValidatingTokenFilter.class)) {
for (Constructor<?> ctor : c.getConstructors()) {
brokenConstructors.put(ctor, ALWAYS);
}
@@ -177,7 +175,9 @@ public class TestRandomChains extends Ba
// TODO: LUCENE-4983
CommonGramsFilter.class,
// TODO: doesn't handle graph inputs
- CommonGramsQueryFilter.class)) {
+ CommonGramsQueryFilter.class,
+ // TODO: probably doesnt handle graph inputs, too afraid to try
+ WordDelimiterFilter.class)) {
for (Constructor<?> ctor : c.getConstructors()) {
brokenOffsetsConstructors.put(ctor, ALWAYS);
}
@@ -914,7 +914,26 @@ public class TestRandomChains extends Ba
}
public void testRandomChains() throws Throwable {
- int numIterations = atLeast(10);
+ int numIterations = atLeast(20);
+ Random random = random();
+ for (int i = 0; i < numIterations; i++) {
+ MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong());
+ if (VERBOSE) {
+ System.out.println("Creating random analyzer:" + a);
+ }
+ try {
+ checkRandomData(random, a, 500*RANDOM_MULTIPLIER, 20, false,
+ false /* We already validate our own offsets... */);
+ } catch (Throwable e) {
+ System.err.println("Exception from random analyzer: " + a);
+ throw e;
+ }
+ }
+ }
+
+ // we might regret this decision...
+ public void testRandomChainsWithLargeStrings() throws Throwable {
+ int numIterations = atLeast(20);
Random random = random();
for (int i = 0; i < numIterations; i++) {
MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong());
@@ -922,7 +941,7 @@ public class TestRandomChains extends Ba
System.out.println("Creating random analyzer:" + a);
}
try {
- checkRandomData(random, a, 200, 20, false,
+ checkRandomData(random, a, 50*RANDOM_MULTIPLIER, 256, false,
false /* We already validate our own offsets... */);
} catch (Throwable e) {
System.err.println("Exception from random analyzer: " + a);
Copied: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java (from r1578993, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java?p2=lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java&p1=lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java&r1=1578993&r2=1579089&rev=1579089&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java Tue Mar 18 23:41:11 2014
@@ -27,6 +27,8 @@ import org.apache.lucene.analysis.util.C
import org.junit.Test;
import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
import java.util.*;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
@@ -127,7 +129,8 @@ public class TestLucene47WordDelimiterFi
public void doSplit(final String input, String... output) throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- TokenFilter wdf = new Lucene47WordDelimiterFilter(keywordMockTokenizer(input),
+ MockTokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.KEYWORD, false);
+ TokenFilter wdf = new Lucene47WordDelimiterFilter(tokenizer,
WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf, output);
@@ -171,7 +174,8 @@ public class TestLucene47WordDelimiterFi
public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0;
- TokenFilter wdf = new Lucene47WordDelimiterFilter(keywordMockTokenizer(input), flags, null);
+ MockTokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.KEYWORD, false);
+ TokenFilter wdf = new Lucene47WordDelimiterFilter(tokenizer, flags, null);
assertTokenStreamContents(wdf, output);
}
@@ -216,8 +220,8 @@ public class TestLucene47WordDelimiterFi
/* analyzer that uses whitespace + wdf */
Analyzer a = new Analyzer() {
@Override
- public TokenStreamComponents createComponents(String field) {
- Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ public TokenStreamComponents createComponents(String field, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new Lucene47WordDelimiterFilter(
tokenizer,
flags, protWords));
@@ -253,8 +257,8 @@ public class TestLucene47WordDelimiterFi
/* analyzer that will consume tokens with large position increments */
Analyzer a2 = new Analyzer() {
@Override
- public TokenStreamComponents createComponents(String field) {
- Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ public TokenStreamComponents createComponents(String field, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new Lucene47WordDelimiterFilter(
new LargePosIncTokenFilter(tokenizer),
flags, protWords));
@@ -298,8 +302,8 @@ public class TestLucene47WordDelimiterFi
Analyzer a3 = new Analyzer() {
@Override
- public TokenStreamComponents createComponents(String field) {
- Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ public TokenStreamComponents createComponents(String field, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
tokenizer, StandardAnalyzer.STOP_WORDS_SET);
return new TokenStreamComponents(tokenizer, new Lucene47WordDelimiterFilter(filter, flags, protWords));
@@ -341,8 +345,8 @@ public class TestLucene47WordDelimiterFi
Analyzer a = new Analyzer() {
@Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new Lucene47WordDelimiterFilter(tokenizer, flags, protectedWords));
}
};
@@ -363,8 +367,8 @@ public class TestLucene47WordDelimiterFi
Analyzer a = new Analyzer() {
@Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new KeywordTokenizer();
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new KeywordTokenizer(reader);
return new TokenStreamComponents(tokenizer, new Lucene47WordDelimiterFilter(tokenizer, flags, protectedWords));
}
};
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java?rev=1579089&r1=1579088&r2=1579089&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java Tue Mar 18 23:41:11 2014
@@ -66,27 +66,25 @@ public class TestWordDelimiterFilter ext
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
// test that subwords and catenated subwords have
// the correct offsets.
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
- new String[] { "foo", "bar", "foobar" },
- new int[] { 5, 9, 5 },
- new int[] { 8, 12, 12 },
- null, null, null, null, false);
+ new String[] { "foo", "foobar", "bar" },
+ new int[] { 5, 5, 9 },
+ new int[] { 8, 12, 12 });
- wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar" },
new int[] { 5, 5, 5 },
- new int[] { 6, 6, 6 },
- null, null, null, null, false);
+ new int[] { 6, 6, 6 });
}
@Test
public void testOffsetChange() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -97,7 +95,7 @@ public class TestWordDelimiterFilter ext
@Test
public void testOffsetChange2() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -108,7 +106,7 @@ public class TestWordDelimiterFilter ext
@Test
public void testOffsetChange3() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -119,18 +117,17 @@ public class TestWordDelimiterFilter ext
@Test
public void testOffsetChange4() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
- new String[] { "foo", "bar", "foobar"},
- new int[] { 8, 12, 8 },
- new int[] { 11, 15, 15 },
- null, null, null, null, false);
+ new String[] { "foo", "foobar", "bar"},
+ new int[] { 8, 8, 12 },
+ new int[] { 11, 15, 15 });
}
public void doSplit(final String input, String... output) throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new MockTokenizer(
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new MockTokenizer(
new StringReader(input), MockTokenizer.KEYWORD, false), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf, output);
@@ -174,7 +171,7 @@ public class TestWordDelimiterFilter ext
public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new MockTokenizer(
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new MockTokenizer(
new StringReader(input), MockTokenizer.KEYWORD, false), flags, null);
assertTokenStreamContents(wdf, output);
@@ -222,7 +219,7 @@ public class TestWordDelimiterFilter ext
@Override
public TokenStreamComponents createComponents(String field, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT,
tokenizer,
flags, protWords));
}
@@ -232,34 +229,25 @@ public class TestWordDelimiterFilter ext
assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
new int[] { 0, 9 },
new int[] { 6, 13 },
- null,
- new int[] { 1, 1 },
- null,
- false);
+ new int[] { 1, 1 });
/* only in this case, posInc of 2 ?! */
- assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
- new int[] { 0, 9, 12, 9 },
+ assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "solR", "R" },
+ new int[] { 0, 9, 9, 12 },
new int[] { 6, 12, 13, 13 },
- null,
- new int[] { 1, 1, 1, 0 },
- null,
- false);
+ new int[] { 1, 1, 0, 1 });
assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
new int[] { 0, 9, 15 },
new int[] { 6, 14, 19 },
- null,
- new int[] { 1, 1, 1 },
- null,
- false);
+ new int[] { 1, 1, 1 });
/* analyzer that will consume tokens with large position increments */
Analyzer a2 = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String field, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT,
new LargePosIncTokenFilter(tokenizer),
flags, protWords));
}
@@ -269,36 +257,24 @@ public class TestWordDelimiterFilter ext
assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
new int[] { 0, 7, 16 },
new int[] { 6, 15, 20 },
- null,
- new int[] { 1, 10, 1 },
- null,
- false);
+ new int[] { 1, 10, 1 });
/* the "/" had a position increment of 10, where did it go?!?!! */
assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
new int[] { 0, 9 },
new int[] { 6, 13 },
- null,
- new int[] { 1, 11 },
- null,
- false);
+ new int[] { 1, 11 });
/* in this case, the increment of 10 from the "/" is carried over */
- assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
- new int[] { 0, 9, 12, 9 },
+ assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "solR", "R" },
+ new int[] { 0, 9, 9, 12 },
new int[] { 6, 12, 13, 13 },
- null,
- new int[] { 1, 11, 1, 0 },
- null,
- false);
+ new int[] { 1, 11, 0, 1 });
assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
new int[] { 0, 9, 15 },
new int[] { 6, 14, 19 },
- null,
- new int[] { 1, 11, 1 },
- null,
- false);
+ new int[] { 1, 11, 1 });
Analyzer a3 = new Analyzer() {
@Override
@@ -306,28 +282,62 @@ public class TestWordDelimiterFilter ext
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
tokenizer, StandardAnalyzer.STOP_WORDS_SET);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(filter, flags, protWords));
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, filter, flags, protWords));
}
};
assertAnalyzesTo(a3, "lucene.solr",
- new String[] { "lucene", "solr", "lucenesolr" },
- new int[] { 0, 7, 0 },
+ new String[] { "lucene", "lucenesolr", "solr" },
+ new int[] { 0, 0, 7 },
new int[] { 6, 11, 11 },
- null,
- new int[] { 1, 1, 0 },
- null,
- false);
+ new int[] { 1, 0, 1 });
/* the stopword should add a gap here */
assertAnalyzesTo(a3, "the lucene.solr",
- new String[] { "lucene", "solr", "lucenesolr" },
- new int[] { 4, 11, 4 },
+ new String[] { "lucene", "lucenesolr", "solr" },
+ new int[] { 4, 4, 11 },
new int[] { 10, 15, 15 },
- null,
- new int[] { 2, 1, 0 },
- null,
- false);
+ new int[] { 2, 0, 1 });
+ }
+
+ /** concat numbers + words + all */
+ public void testLotsOfConcatenating() throws Exception {
+ final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+
+ /* analyzer that uses whitespace + wdf */
+ Analyzer a = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String field, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, null));
+ }
+ };
+
+ assertAnalyzesTo(a, "abc-def-123-456",
+ new String[] { "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" },
+ new int[] { 0, 0, 0, 4, 8, 8, 12 },
+ new int[] { 3, 7, 15, 7, 11, 15, 15 },
+ new int[] { 1, 0, 0, 1, 1, 0, 1 });
+ }
+
+ /** concat numbers + words + all + preserve original */
+ public void testLotsOfConcatenating2() throws Exception {
+ final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+
+ /* analyzer that uses whitespace + wdf */
+ Analyzer a = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String field, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, null));
+ }
+ };
+
+ assertAnalyzesTo(a, "abc-def-123-456",
+ new String[] { "abc-def-123-456", "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" },
+ new int[] { 0, 0, 0, 0, 4, 8, 8, 12 },
+ new int[] { 15, 3, 7, 15, 7, 11, 15, 15 },
+ new int[] { 1, 0, 0, 0, 1, 1, 0, 1 });
}
/** blast some random strings through the analyzer */
@@ -347,10 +357,34 @@ public class TestWordDelimiterFilter ext
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, protectedWords));
+ }
+ };
+ checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
+ }
+ }
+
+ /**
+  * Blasts some enormous random strings through the analyzer. Runs numIterations rounds (taken
+  * from atLeast(5)); each round draws a random flags value in [0, 512) and, on a coin flip,
+  * either supplies a protected-words set containing "a", "b" and "cd" or passes null, then
+  * hands a whitespace MockTokenizer + WordDelimiterFilter analyzer to checkRandomData.
+  */
+ public void testRandomHugeStrings() throws Exception {
+ int numIterations = atLeast(5);
+ for (int i = 0; i < numIterations; i++) {
+ final int flags = random().nextInt(512);
+ final CharArraySet protectedWords;
+ if (random().nextBoolean()) {
+ protectedWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<>(Arrays.asList("a", "b", "cd")), false);
+ } else {
+ protectedWords = null;
+ }
+
+ Analyzer a = new Analyzer() {
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, protectedWords));
}
};
- checkRandomData(random(), a, 200, 20, false, false);
+ checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192); // presumably (iterations, max string length) -- confirm against checkRandomData
}
}
@@ -369,7 +403,7 @@ public class TestWordDelimiterFilter ext
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KeywordTokenizer(reader);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, protectedWords));
}
};
// depending upon options, this thing may or may not preserve the empty term
Modified: lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java?rev=1579089&r1=1579088&r2=1579089&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java Tue Mar 18 23:41:11 2014
@@ -200,6 +200,7 @@ public class TestWordDelimiterFilterFact
String testText = "I borrowed $5,400.00 at 25% interest-rate";
ResourceLoader loader = new SolrResourceLoader("solr/collection1");
Map<String,String> args = new HashMap<>();
+ args.put("luceneMatchVersion", TEST_VERSION_CURRENT.toString());
args.put("generateWordParts", "1");
args.put("generateNumberParts", "1");
args.put("catenateWords", "1");
@@ -214,17 +215,18 @@ public class TestWordDelimiterFilterFact
TokenStream ts = factoryDefault.create(
new MockTokenizer(new StringReader(testText), MockTokenizer.WHITESPACE, false));
BaseTokenStreamTestCase.assertTokenStreamContents(ts,
- new String[] { "I", "borrowed", "5", "400", "00", "540000", "at", "25", "interest", "rate", "interestrate" });
+ new String[] { "I", "borrowed", "5", "540000", "400", "00", "at", "25", "interest", "interestrate", "rate" });
ts = factoryDefault.create(
new MockTokenizer(new StringReader("foo\u200Dbar"), MockTokenizer.WHITESPACE, false));
BaseTokenStreamTestCase.assertTokenStreamContents(ts,
- new String[] { "foo", "bar", "foobar" });
+ new String[] { "foo", "foobar", "bar" });
/* custom behavior */
args = new HashMap<>();
// use a custom type mapping
+ args.put("luceneMatchVersion", TEST_VERSION_CURRENT.toString());
args.put("generateWordParts", "1");
args.put("generateNumberParts", "1");
args.put("catenateWords", "1");
@@ -238,7 +240,7 @@ public class TestWordDelimiterFilterFact
ts = factoryCustom.create(
new MockTokenizer(new StringReader(testText), MockTokenizer.WHITESPACE, false));
BaseTokenStreamTestCase.assertTokenStreamContents(ts,
- new String[] { "I", "borrowed", "$5,400.00", "at", "25%", "interest", "rate", "interestrate" });
+ new String[] { "I", "borrowed", "$5,400.00", "at", "25%", "interest", "interestrate", "rate" });
/* test custom behavior with a char > 0x7F, because we had to make a larger byte[] */
ts = factoryCustom.create(
Modified: lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java?rev=1579089&r1=1579088&r2=1579089&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java Tue Mar 18 23:41:11 2014
@@ -365,8 +365,8 @@ public class FieldAnalysisRequestHandler
assertEquals(6, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("hi", null, "word", 0, 2, 1, new int[]{1,1}, null, false));
assertToken(tokenList.get(1), new TokenInfo("3456", null, "word", 4, 8, 2, new int[]{2,2}, null, false));
- assertToken(tokenList.get(2), new TokenInfo("12", null, "word", 9, 11, 3, new int[]{2,3}, null, false));
- assertToken(tokenList.get(3), new TokenInfo("345612", null, "word", 4, 11, 3, new int[]{2,3}, null, false));
+ assertToken(tokenList.get(2), new TokenInfo("345612", null, "word", 4, 11, 2, new int[]{2,2}, null, false));
+ assertToken(tokenList.get(3), new TokenInfo("12", null, "word", 9, 11, 3, new int[]{2,3}, null, false));
assertToken(tokenList.get(4), new TokenInfo("a", null, "word", 12, 13, 4, new int[]{3,4}, null, false));
assertToken(tokenList.get(5), new TokenInfo("Test", null, "word", 14, 18, 5, new int[]{4,5}, null, false));
tokenList = indexPart.get("org.apache.lucene.analysis.core.LowerCaseFilter");
@@ -374,8 +374,8 @@ public class FieldAnalysisRequestHandler
assertEquals(6, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("hi", null, "word", 0, 2, 1, new int[]{1,1,1}, null, false));
assertToken(tokenList.get(1), new TokenInfo("3456", null, "word", 4, 8, 2, new int[]{2,2,2}, null, false));
- assertToken(tokenList.get(2), new TokenInfo("12", null, "word", 9, 11, 3, new int[]{2,3,3}, null, false));
- assertToken(tokenList.get(3), new TokenInfo("345612", null, "word", 4, 11, 3, new int[]{2,3,3}, null, false));
+ assertToken(tokenList.get(2), new TokenInfo("345612", null, "word", 4, 11, 2, new int[]{2,2,2}, null, false));
+ assertToken(tokenList.get(3), new TokenInfo("12", null, "word", 9, 11, 3, new int[]{2,3,3}, null, false));
assertToken(tokenList.get(4), new TokenInfo("a", null, "word", 12, 13, 4, new int[]{3,4,4}, null, false));
assertToken(tokenList.get(5), new TokenInfo("test", null, "word", 14, 18, 5, new int[]{4,5,5}, null, false));
}