You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2013/04/26 14:14:43 UTC
svn commit: r1476159 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/analysis/
lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/
lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/
lucene/analysis/common/src/test/...
Author: jpountz
Date: Fri Apr 26 12:14:42 2013
New Revision: 1476159
URL: http://svn.apache.org/r1476159
Log:
LUCENE-4955: Fix NGramTokenizer and NGramTokenFilter, and remove them from TestRandomChains' exclusion list (merged from r1476135).
In addition to the trunk changes, I had to fix SlowSynonymFilterFactory to reset the token stream before consuming it.
Added:
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java
- copied unchanged from r1476135, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SlowSynonymFilterFactory.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMap.java
lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1476159&r1=1476158&r2=1476159&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Fri Apr 26 12:14:42 2013
@@ -5,6 +5,16 @@ http://s.apache.org/luceneversions
======================= Lucene 4.4.0 =======================
+Changes in backwards compatibility policy
+
+* LUCENE-4955: NGramTokenFilter now emits all n-grams for the same token at the
+ same position and preserves the position length and the offsets of the
+ original token. (Simon Willnauer, Adrien Grand)
+
+* LUCENE-4955: NGramTokenizer now emits n-grams in a different order
+ (a, ab, b, bc, c) instead of (a, b, c, ab, bc) and doesn't trim trailing
+ whitespaces. (Adrien Grand)
+
Bug Fixes
* LUCENE-4935: CustomScoreQuery wrongly applied its query boost twice
@@ -14,6 +24,9 @@ Bug Fixes
if you had a 64-bit JVM without compressed OOPS: IBM J9, or Oracle with
large heap/explicitly disabled. (Mike McCandless, Uwe Schindler, Robert Muir)
+* LUCENE-4955: NGramTokenizer now supports inputs larger than 1024 chars.
+ (Adrien Grand)
+
Optimizations
* LUCENE-4938: Don't use an unnecessarily large priority queue in IndexSearcher
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java?rev=1476159&r1=1476158&r2=1476159&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java Fri Apr 26 12:14:42 2013
@@ -47,6 +47,6 @@ public class NGramFilterFactory extends
@Override
public NGramTokenFilter create(TokenStream input) {
- return new NGramTokenFilter(input, minGramSize, maxGramSize);
+ return new NGramTokenFilter(luceneMatchVersion, input, minGramSize, maxGramSize);
}
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java?rev=1476159&r1=1476158&r2=1476159&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java Fri Apr 26 12:14:42 2013
@@ -21,37 +21,60 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.Version;
/**
* Tokenizes the input into n-grams of the given size(s).
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version} compatibility when
+ * creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filter:<ul>
+ * <li>emits all n-grams for the same token at the same position,</li>
+ * <li>does not modify offsets,</li>
+ * <li>sorts n-grams by their offset in the original token first, then
+ * increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
+ * "c").</li></ul>
+ * <p>You can make this filter use the old behavior by providing a version <
+ * {@link Version#LUCENE_44} in the constructor but this is not recommended as
+ * it will lead to broken {@link TokenStream}s that will cause highlighting
+ * bugs.
*/
public final class NGramTokenFilter extends TokenFilter {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
- private int minGram, maxGram;
-
+ private final int minGram, maxGram;
+
private char[] curTermBuffer;
private int curTermLength;
private int curGramSize;
private int curPos;
+ private int curPosInc, curPosLen;
private int tokStart;
- private int tokEnd; // only used if the length changed before this filter
+ private int tokEnd;
private boolean hasIllegalOffsets; // only if the length changed before this filter
-
+
+ private final Version version;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final PositionIncrementAttribute posIncAtt;
+ private final PositionLengthAttribute posLenAtt;
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
/**
* Creates NGramTokenFilter with given min and max n-grams.
+ * @param version Lucene version to enable correct position increments.
+ * See <a href="#version">above</a> for details.
* @param input {@link TokenStream} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
- public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
- super(input);
+ public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
+ super(new LengthFilter(true, input, minGram, Integer.MAX_VALUE));
+ this.version = version;
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@@ -60,14 +83,37 @@ public final class NGramTokenFilter exte
}
this.minGram = minGram;
this.maxGram = maxGram;
+ if (version.onOrAfter(Version.LUCENE_44)) {
+ posIncAtt = addAttribute(PositionIncrementAttribute.class);
+ posLenAtt = addAttribute(PositionLengthAttribute.class);
+ } else {
+ posIncAtt = new PositionIncrementAttribute() {
+ @Override
+ public void setPositionIncrement(int positionIncrement) {}
+ @Override
+ public int getPositionIncrement() {
+ return 0;
+ }
+ };
+ posLenAtt = new PositionLengthAttribute() {
+ @Override
+ public void setPositionLength(int positionLength) {}
+ @Override
+ public int getPositionLength() {
+ return 0;
+ }
+ };
+ }
}
/**
* Creates NGramTokenFilter with default min and max n-grams.
+ * @param version Lucene version to enable correct position increments.
+ * See <a href="#version">above</a> for details.
* @param input {@link TokenStream} holding the input to be tokenized
*/
- public NGramTokenFilter(TokenStream input) {
- this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
+ public NGramTokenFilter(Version version, TokenStream input) {
+ this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
}
/** Returns the next token in the stream, or null at EOS. */
@@ -82,6 +128,8 @@ public final class NGramTokenFilter exte
curTermLength = termAtt.length();
curGramSize = minGram;
curPos = 0;
+ curPosInc = posIncAtt.getPositionIncrement();
+ curPosLen = posLenAtt.getPositionLength();
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
// if length by start + end offsets doesn't match the term text then assume
@@ -89,20 +137,37 @@ public final class NGramTokenFilter exte
hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
}
}
- while (curGramSize <= maxGram) {
- while (curPos+curGramSize <= curTermLength) { // while there is input
+ if (version.onOrAfter(Version.LUCENE_44)) {
+ if (curGramSize > maxGram || curPos + curGramSize > curTermLength) {
+ ++curPos;
+ curGramSize = minGram;
+ }
+ if (curPos + curGramSize <= curTermLength) {
clearAttributes();
termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
- if (hasIllegalOffsets) {
- offsetAtt.setOffset(tokStart, tokEnd);
- } else {
- offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
- }
- curPos++;
+ posIncAtt.setPositionIncrement(curPosInc);
+ curPosInc = 0;
+ posLenAtt.setPositionLength(curPosLen);
+ offsetAtt.setOffset(tokStart, tokEnd);
+ curGramSize++;
return true;
}
- curGramSize++; // increase n-gram size
- curPos = 0;
+ } else {
+ while (curGramSize <= maxGram) {
+ while (curPos+curGramSize <= curTermLength) { // while there is input
+ clearAttributes();
+ termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
+ if (hasIllegalOffsets) {
+ offsetAtt.setOffset(tokStart, tokEnd);
+ } else {
+ offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
+ }
+ curPos++;
+ return true;
+ }
+ curGramSize++; // increase n-gram size
+ curPos = 0;
+ }
}
curTermBuffer = null;
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java?rev=1476159&r1=1476158&r2=1476159&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java Fri Apr 26 12:14:42 2013
@@ -17,63 +17,90 @@ package org.apache.lucene.analysis.ngram
* limitations under the License.
*/
+import java.io.IOException;
+import java.io.Reader;
+
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-
-import java.io.IOException;
-import java.io.Reader;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.Version;
/**
* Tokenizes the input into n-grams of the given size(s).
+ * <p>On the contrary to {@link NGramTokenFilter}, this class sets offsets so
+ * that characters between startOffset and endOffset in the original stream are
+ * the same as the term chars.
+ * <p>For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
+ * <table>
+ * <tr><th>Term</th><td>ab</td><td>abc</td><td>bc</td><td>bcd</td><td>cd</td><td>cde</td><td>de</td></tr>
+ * <tr><th>Position increment</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
+ * <tr><th>Position length</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
+ * <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
+ * </table>
+ * <a name="version"/>
+ * <p>Before Lucene 4.4, this class had a different behavior:<ul>
+ * <li>It didn't support more than 1024 chars of input, the rest was trashed.</li>
+ * <li>The last whitespaces of the 1024 chars block were trimmed.</li>
+ * <li>Tokens were emitted in a different order (by increasing lengths).</li></ul>
+ * <p>Although highly discouraged, it is still possible to use the old behavior
+ * through {@link Lucene43NGramTokenizer}.
*/
public final class NGramTokenizer extends Tokenizer {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
- private int minGram, maxGram;
+ private char[] buffer;
+ private int bufferStart, bufferEnd; // remaining slice of the buffer
+ private int offset;
private int gramSize;
- private int pos;
- private int inLen; // length of the input AFTER trim()
- private int charsRead; // length of the input
- private String inStr;
- private boolean started;
-
+ private int minGram, maxGram;
+ private boolean exhausted;
+
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
/**
* Creates NGramTokenizer with given min and max n-grams.
+ * @param version the lucene compatibility <a href="#version">version</a>
* @param input {@link Reader} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
- public NGramTokenizer(Reader input, int minGram, int maxGram) {
+ public NGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
super(input);
- init(minGram, maxGram);
+ init(version, minGram, maxGram);
}
/**
* Creates NGramTokenizer with given min and max n-grams.
+ * @param version the lucene compatibility <a href="#version">version</a>
* @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
* @param input {@link Reader} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
- public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) {
+ public NGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
super(factory, input);
- init(minGram, maxGram);
+ init(version, minGram, maxGram);
}
/**
* Creates NGramTokenizer with default min and max n-grams.
+ * @param version the lucene compatibility <a href="#version">version</a>
* @param input {@link Reader} holding the input to be tokenized
*/
- public NGramTokenizer(Reader input) {
- this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
+ public NGramTokenizer(Version version, Reader input) {
+ this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
}
-
- private void init(int minGram, int maxGram) {
+
+ private void init(Version version, int minGram, int maxGram) {
+ if (!version.onOrAfter(Version.LUCENE_44)) {
+ throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer");
+ }
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@@ -82,73 +109,66 @@ public final class NGramTokenizer extend
}
this.minGram = minGram;
this.maxGram = maxGram;
+ buffer = new char[maxGram + 1024];
}
/** Returns the next token in the stream, or null at EOS. */
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
- if (!started) {
- started = true;
- gramSize = minGram;
- char[] chars = new char[1024];
- charsRead = 0;
- // TODO: refactor to a shared readFully somewhere:
- while (charsRead < chars.length) {
- int inc = input.read(chars, charsRead, chars.length-charsRead);
- if (inc == -1) {
- break;
- }
- charsRead += inc;
- }
- inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings
- if (charsRead == chars.length) {
- // Read extra throwaway chars so that on end() we
- // report the correct offset:
- char[] throwaway = new char[1024];
- while(true) {
- final int inc = input.read(throwaway, 0, throwaway.length);
- if (inc == -1) {
+ // compact
+ if (bufferStart >= buffer.length - maxGram) {
+ System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
+ bufferEnd -= bufferStart;
+ bufferStart = 0;
+
+ // fill in remaining space
+ if (!exhausted) {
+ // TODO: refactor to a shared readFully
+ while (bufferEnd < buffer.length) {
+ final int read = input.read(buffer, bufferEnd, buffer.length - bufferEnd);
+ if (read == -1) {
+ exhausted = true;
break;
}
- charsRead += inc;
+ bufferEnd += read;
}
}
+ }
- inLen = inStr.length();
- if (inLen == 0) {
- return false;
- }
+ // should we go to the next offset?
+ if (gramSize > maxGram || bufferStart + gramSize > bufferEnd) {
+ bufferStart++;
+ offset++;
+ gramSize = minGram;
}
- if (pos+gramSize > inLen) { // if we hit the end of the string
- pos = 0; // reset to beginning of string
- gramSize++; // increase n-gram size
- if (gramSize > maxGram) // we are done
- return false;
- if (pos+gramSize > inLen)
- return false;
+ // are there enough chars remaining?
+ if (bufferStart + gramSize > bufferEnd) {
+ return false;
}
- int oldPos = pos;
- pos++;
- termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize);
- offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
+ termAtt.copyBuffer(buffer, bufferStart, gramSize);
+ posIncAtt.setPositionIncrement(1);
+ posLenAtt.setPositionLength(1);
+ offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + gramSize));
+ ++gramSize;
return true;
}
-
+
@Override
public void end() {
- // set final offset
- final int finalOffset = correctOffset(charsRead);
- this.offsetAtt.setOffset(finalOffset, finalOffset);
- }
-
+ final int endOffset = correctOffset(offset + bufferEnd - bufferStart);
+ offsetAtt.setOffset(endOffset, endOffset);
+ }
+
@Override
public void reset() throws IOException {
super.reset();
- started = false;
- pos = 0;
+ bufferStart = bufferEnd = buffer.length;
+ offset = 0;
+ gramSize = minGram;
+ exhausted = false;
}
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java?rev=1476159&r1=1476158&r2=1476159&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java Fri Apr 26 12:14:42 2013
@@ -18,8 +18,10 @@ package org.apache.lucene.analysis.ngram
*/
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
+import org.apache.lucene.util.Version;
import java.io.Reader;
import java.util.Map;
@@ -49,7 +51,11 @@ public class NGramTokenizerFactory exten
/** Creates the {@link TokenStream} of n-grams from the given {@link Reader} and {@link AttributeFactory}. */
@Override
- public NGramTokenizer create(AttributeFactory factory, Reader input) {
- return new NGramTokenizer(factory, input, minGramSize, maxGramSize);
+ public Tokenizer create(AttributeFactory factory, Reader input) {
+ if (luceneMatchVersion.onOrAfter(Version.LUCENE_44)) {
+ return new NGramTokenizer(luceneMatchVersion, factory, input, minGramSize, maxGramSize);
+ } else {
+ return new Lucene43NGramTokenizer(factory, input, minGramSize, maxGramSize);
+ }
}
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SlowSynonymFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SlowSynonymFilterFactory.java?rev=1476159&r1=1476158&r2=1476159&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SlowSynonymFilterFactory.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SlowSynonymFilterFactory.java Fri Apr 26 12:14:42 2013
@@ -158,6 +158,7 @@ final class SlowSynonymFilterFactory ext
List<String> tokList = new ArrayList<String>();
try {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+ ts.reset();
while (ts.incrementToken()){
if( termAtt.length() > 0 )
tokList.add( termAtt.toString() );
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java?rev=1476159&r1=1476158&r2=1476159&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java Fri Apr 26 12:14:42 2013
@@ -54,8 +54,6 @@ import org.apache.lucene.analysis.MockTo
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter;
-import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.lucene.analysis.ValidatingTokenFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
@@ -71,14 +69,14 @@ import org.apache.lucene.analysis.miscel
import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
+import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
-import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
-import org.apache.lucene.analysis.ngram.NGramTokenFilter;
-import org.apache.lucene.analysis.ngram.NGramTokenizer;
+import org.apache.lucene.analysis.ngram.Lucene43NGramTokenizer;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
import org.apache.lucene.analysis.payloads.IdentityEncoder;
@@ -90,8 +88,9 @@ import org.apache.lucene.analysis.synony
import org.apache.lucene.analysis.th.ThaiWordFilter;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.AttributeSource.AttributeFactory;
+import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Rethrow;
import org.apache.lucene.util.Version;
@@ -162,9 +161,7 @@ public class TestRandomChains extends Ba
// startOffset thats > its endOffset
// (see LUCENE-3738 for a list of other offenders here)
// broken!
- NGramTokenizer.class,
- // broken!
- NGramTokenFilter.class,
+ Lucene43NGramTokenizer.class,
// broken!
EdgeNGramTokenizer.class,
// broken!
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java?rev=1476159&r1=1476158&r2=1476159&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java Fri Apr 26 12:14:42 2013
@@ -26,7 +26,9 @@ import org.apache.lucene.analysis.Tokeni
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
+import org.apache.lucene.util.Version;
+import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Random;
@@ -46,7 +48,7 @@ public class NGramTokenFilterTest extend
public void testInvalidInput() throws Exception {
boolean gotException = false;
try {
- new NGramTokenFilter(input, 2, 1);
+ new NGramTokenFilter(TEST_VERSION_CURRENT, input, 2, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
@@ -56,50 +58,64 @@ public class NGramTokenFilterTest extend
public void testInvalidInput2() throws Exception {
boolean gotException = false;
try {
- new NGramTokenFilter(input, 0, 1);
+ new NGramTokenFilter(TEST_VERSION_CURRENT, input, 0, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
assertTrue(gotException);
}
-
+
public void testUnigrams() throws Exception {
- NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
- assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
+ NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 1);
+ assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
}
public void testBigrams() throws Exception {
- NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
- assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5});
+ NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 2, 2);
+ assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,0,0,0}, new int[]{5,5,5,5}, new int[]{1,0,0,0});
}
public void testNgrams() throws Exception {
- NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
+ NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 3);
assertTokenStreamContents(filter,
- new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
- new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
- new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
- null, null, null, null, false
+ new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"},
+ new int[]{0,0,0,0,0,0,0,0,0,0,0,0},
+ new int[]{5,5,5,5,5,5,5,5,5,5,5,5},
+ null,
+ new int[]{1,0,0,0,0,0,0,0,0,0,0,0},
+ null, null, false
);
}
-
+
+ public void testNgramsNoIncrement() throws Exception {
+ NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 3);
+ assertTokenStreamContents(filter,
+ new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"},
+ new int[]{0,0,0,0,0,0,0,0,0,0,0,0},
+ new int[]{5,5,5,5,5,5,5,5,5,5,5,5},
+ null,
+ new int[]{1,0,0,0,0,0,0,0,0,0,0,0},
+ null, null, false
+ );
+ }
+
public void testOversizedNgrams() throws Exception {
- NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
+ NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 6, 7);
assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
}
public void testSmallTokenInStream() throws Exception {
input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
- NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
- assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
+ NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 3, 3);
+ assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2});
}
public void testReset() throws Exception {
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
- NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
- assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
+ NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 1, 1);
+ assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
tokenizer.setReader(new StringReader("abcde"));
- assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
+ assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
}
// LUCENE-3642
@@ -112,14 +128,15 @@ public class NGramTokenFilterTest extend
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
- filters = new NGramTokenFilter(filters, 2, 2);
+ filters = new NGramTokenFilter(TEST_VERSION_CURRENT, filters, 2, 2);
return new TokenStreamComponents(tokenizer, filters);
}
};
assertAnalyzesTo(analyzer, "mosfellsbær",
new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
- new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
+ new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
+ new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });
}
/** blast some random strings through the analyzer */
@@ -129,7 +146,7 @@ public class NGramTokenFilterTest extend
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer,
- new NGramTokenFilter(tokenizer, 2, 4));
+ new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 4));
}
};
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
@@ -142,9 +159,22 @@ public class NGramTokenFilterTest extend
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KeywordTokenizer(reader);
return new TokenStreamComponents(tokenizer,
- new NGramTokenFilter(tokenizer, 2, 15));
+ new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 15));
}
};
checkAnalysisConsistency(random, a, random.nextBoolean(), "");
}
+
+ public void testLucene43() throws IOException {
+ NGramTokenFilter filter = new NGramTokenFilter(Version.LUCENE_43, input, 2, 3);
+ assertTokenStreamContents(filter,
+ new String[]{"ab","bc","cd","de","abc","bcd","cde"},
+ new int[]{0,1,2,3,0,1,2},
+ new int[]{2,3,4,5,3,4,5},
+ null,
+ new int[]{1,1,1,1,1,1,1},
+ null, null, false
+ );
+ }
+
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java?rev=1476159&r1=1476158&r2=1476159&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java Fri Apr 26 12:14:42 2013
@@ -18,13 +18,21 @@ package org.apache.lucene.analysis.ngram
*/
+import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.util.LuceneTestCase.Slow;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util._TestUtil;
+
+import com.carrotsearch.randomizedtesting.generators.RandomStrings;
/**
* Tests {@link NGramTokenizer} for correctness.
@@ -41,7 +49,7 @@ public class NGramTokenizerTest extends
public void testInvalidInput() throws Exception {
boolean gotException = false;
try {
- new NGramTokenizer(input, 2, 1);
+ new NGramTokenizer(TEST_VERSION_CURRENT, input, 2, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
@@ -51,7 +59,7 @@ public class NGramTokenizerTest extends
public void testInvalidInput2() throws Exception {
boolean gotException = false;
try {
- new NGramTokenizer(input, 0, 1);
+ new NGramTokenizer(TEST_VERSION_CURRENT, input, 0, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
@@ -59,21 +67,21 @@ public class NGramTokenizerTest extends
}
public void testUnigrams() throws Exception {
- NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
+ NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 1);
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
}
public void testBigrams() throws Exception {
- NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
+ NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 2, 2);
assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */);
}
public void testNgrams() throws Exception {
- NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
+ NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 3);
assertTokenStreamContents(tokenizer,
- new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
- new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
- new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
+ new String[]{"a","ab", "abc", "b", "bc", "bcd", "c", "cd", "cde", "d", "de", "e"},
+ new int[]{0,0,0,1,1,1,2,2,2,3,3,4},
+ new int[]{1,2,3,2,3,4,3,4,5,4,5,5},
null,
null,
null,
@@ -83,12 +91,12 @@ public class NGramTokenizerTest extends
}
public void testOversizedNgrams() throws Exception {
- NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
+ NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 6, 7);
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
}
public void testReset() throws Exception {
- NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
+ NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 1);
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
tokenizer.setReader(new StringReader("abcde"));
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
@@ -99,11 +107,48 @@ public class NGramTokenizerTest extends
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new NGramTokenizer(reader, 2, 4);
+ Tokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 4);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
checkRandomData(random(), a, 50*RANDOM_MULTIPLIER, 1027, false, false);
}
+
+ private void testNGrams(int minGram, int maxGram, int length) throws IOException {
+ final String s = RandomStrings.randomAsciiOfLength(random(), length);
+ final TokenStream grams = new NGramTokenizer(TEST_VERSION_CURRENT, new StringReader(s), minGram, maxGram);
+ final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
+ final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
+ final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
+ final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
+ grams.reset();
+ for (int start = 0; start < s.length(); ++start) {
+ for (int end = start + minGram; end <= start + maxGram && end <= s.length(); ++end) {
+ assertTrue(grams.incrementToken());
+ assertEquals(s.substring(start, end), termAtt.toString());
+ assertEquals(1, posIncAtt.getPositionIncrement());
+ assertEquals(start, offsetAtt.startOffset());
+ assertEquals(end, offsetAtt.endOffset());
+ }
+ }
+ grams.end();
+ assertEquals(s.length(), offsetAtt.startOffset());
+ assertEquals(s.length(), offsetAtt.endOffset());
+ }
+
+ public void testLargeInput() throws IOException {
+ // test sliding
+ final int minGram = _TestUtil.nextInt(random(), 1, 100);
+ final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
+ testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024));
+ }
+
+ public void testLargeMaxGram() throws IOException {
+ // test sliding with maxGram > 1024
+ final int minGram = _TestUtil.nextInt(random(), 1200, 1300);
+ final int maxGram = _TestUtil.nextInt(random(), minGram, 1300);
+ testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024));
+ }
+
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java?rev=1476159&r1=1476158&r2=1476159&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java Fri Apr 26 12:14:42 2013
@@ -35,7 +35,7 @@ public class TestNGramFilters extends Ba
Reader reader = new StringReader("test");
TokenStream stream = tokenizerFactory("NGram").create(reader);
assertTokenStreamContents(stream,
- new String[] { "t", "e", "s", "t", "te", "es", "st" });
+ new String[] { "t", "te", "e", "es", "s", "st", "t" });
}
/**
@@ -47,7 +47,7 @@ public class TestNGramFilters extends Ba
"minGramSize", "2",
"maxGramSize", "3").create(reader);
assertTokenStreamContents(stream,
- new String[] { "te", "es", "st", "tes", "est" });
+ new String[] { "te", "tes", "es", "est", "st" });
}
/**
@@ -58,7 +58,7 @@ public class TestNGramFilters extends Ba
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
stream = tokenFilterFactory("NGram").create(stream);
assertTokenStreamContents(stream,
- new String[] { "t", "e", "s", "t", "te", "es", "st" });
+ new String[] { "t", "te", "e", "es", "s", "st", "t" });
}
/**
@@ -71,7 +71,7 @@ public class TestNGramFilters extends Ba
"minGramSize", "2",
"maxGramSize", "3").create(stream);
assertTokenStreamContents(stream,
- new String[] { "te", "es", "st", "tes", "est" });
+ new String[] { "te", "tes", "es", "est", "st" });
}
/**
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMap.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMap.java?rev=1476159&r1=1476158&r2=1476159&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMap.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMap.java Fri Apr 26 12:14:42 2013
@@ -27,6 +27,7 @@ import java.util.Map;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.ngram.NGramTokenizerFactory;
+import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.analysis.util.ResourceLoader;
@@ -248,6 +249,7 @@ public class TestSynonymMap extends Luce
// prepare bi-gram tokenizer factory
Map<String, String> args = new HashMap<String, String>();
+ args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, "4.4");
args.put("minGramSize","2");
args.put("maxGramSize","2");
TokenizerFactory tf = new NGramTokenizerFactory(args);
Modified: lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java?rev=1476159&r1=1476158&r2=1476159&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java (original)
+++ lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java Fri Apr 26 12:14:42 2013
@@ -231,7 +231,7 @@ public abstract class LuceneTestCase ext
* Use this constant when creating Analyzers and any other version-dependent stuff.
* <p><b>NOTE:</b> Change this when development starts for new Lucene version:
*/
- public static final Version TEST_VERSION_CURRENT = Version.LUCENE_43;
+ public static final Version TEST_VERSION_CURRENT = Version.LUCENE_44;
/**
* True if and only if tests are run in verbose mode. If this flag is false