You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/02/26 14:10:08 UTC
svn commit: r916666 [5/16] - in /lucene/java/branches/flex_1458: ./ contrib/
contrib/analyzers/common/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/
contrib/analyzers/c...
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java Fri Feb 26 13:09:54 2010
@@ -26,6 +26,7 @@
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.util.Version;
public class ShingleFilterTest extends BaseTokenStreamTestCase {
@@ -288,7 +289,360 @@
};
+ public static final Token[] TRI_GRAM_TOKENS_MIN_TRI_GRAM = new Token[] {
+ createToken("please", 0, 6),
+ createToken("please divide this", 0, 18),
+ createToken("divide", 7, 13),
+ createToken("divide this sentence", 7, 27),
+ createToken("this", 14, 18),
+ createToken("this sentence into", 14, 32),
+ createToken("sentence", 19, 27),
+ createToken("sentence into shingles", 19, 39),
+ createToken("into", 28, 32),
+ createToken("shingles", 33, 39)
+ };
+
+ public static final int[] TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM = new int[] {
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 1
+ };
+
+ public static final String[] TRI_GRAM_TYPES_MIN_TRI_GRAM = new String[] {
+ "word", "shingle",
+ "word", "shingle",
+ "word", "shingle",
+ "word", "shingle",
+ "word",
+ "word"
+ };
+
+ public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new Token[] {
+ createToken("please divide this", 0, 18),
+ createToken("divide this sentence", 7, 27),
+ createToken("this sentence into", 14, 32),
+ createToken("sentence into shingles", 19, 39)
+ };
+
+ public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new int[] {
+ 1, 1, 1, 1
+ };
+
+ public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new String[] {
+ "shingle",
+ "shingle",
+ "shingle",
+ "shingle"
+ };
+
+ public static final Token[] FOUR_GRAM_TOKENS_MIN_TRI_GRAM = new Token[] {
+ createToken("please", 0, 6),
+ createToken("please divide this", 0, 18),
+ createToken("please divide this sentence", 0, 27),
+ createToken("divide", 7, 13),
+ createToken("divide this sentence", 7, 27),
+ createToken("divide this sentence into", 7, 32),
+ createToken("this", 14, 18),
+ createToken("this sentence into", 14, 32),
+ createToken("this sentence into shingles", 14, 39),
+ createToken("sentence", 19, 27),
+ createToken("sentence into shingles", 19, 39),
+ createToken("into", 28, 32),
+ createToken("shingles", 33, 39)
+ };
+
+ public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM = new int[] {
+ 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1
+ };
+
+ public static final String[] FOUR_GRAM_TYPES_MIN_TRI_GRAM = new String[] {
+ "word", "shingle", "shingle",
+ "word", "shingle", "shingle",
+ "word", "shingle", "shingle",
+ "word", "shingle",
+ "word",
+ "word"
+ };
+
+ public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new Token[] {
+ createToken("please divide this", 0, 18),
+ createToken("please divide this sentence", 0, 27),
+ createToken("divide this sentence", 7, 27),
+ createToken("divide this sentence into", 7, 32),
+ createToken("this sentence into", 14, 32),
+ createToken("this sentence into shingles", 14, 39),
+ createToken("sentence into shingles", 19, 39),
+ };
+
+ public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new int[] {
+ 1, 0, 1, 0, 1, 0, 1
+ };
+
+ public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new String[] {
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle"
+ };
+
+ public static final Token[] FOUR_GRAM_TOKENS_MIN_FOUR_GRAM = new Token[] {
+ createToken("please", 0, 6),
+ createToken("please divide this sentence", 0, 27),
+ createToken("divide", 7, 13),
+ createToken("divide this sentence into", 7, 32),
+ createToken("this", 14, 18),
+ createToken("this sentence into shingles", 14, 39),
+ createToken("sentence", 19, 27),
+ createToken("into", 28, 32),
+ createToken("shingles", 33, 39)
+ };
+
+ public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM = new int[] {
+ 1, 0, 1, 0, 1, 0, 1, 1, 1
+ };
+
+ public static final String[] FOUR_GRAM_TYPES_MIN_FOUR_GRAM = new String[] {
+ "word", "shingle",
+ "word", "shingle",
+ "word", "shingle",
+ "word",
+ "word",
+ "word"
+ };
+
+ public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new Token[] {
+ createToken("please divide this sentence", 0, 27),
+ createToken("divide this sentence into", 7, 32),
+ createToken("this sentence into shingles", 14, 39),
+ };
+
+ public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new int[] {
+ 1, 1, 1
+ };
+
+ public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new String[] {
+ "shingle",
+ "shingle",
+ "shingle"
+ };
+
+ public static final Token[] BI_GRAM_TOKENS_NO_SEPARATOR = new Token[] {
+ createToken("please", 0, 6),
+ createToken("pleasedivide", 0, 13),
+ createToken("divide", 7, 13),
+ createToken("dividethis", 7, 18),
+ createToken("this", 14, 18),
+ createToken("thissentence", 14, 27),
+ createToken("sentence", 19, 27),
+ createToken("sentenceinto", 19, 32),
+ createToken("into", 28, 32),
+ createToken("intoshingles", 28, 39),
+ createToken("shingles", 33, 39),
+ };
+
+ public static final int[] BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR = new int[] {
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+ };
+
+ public static final String[] BI_GRAM_TYPES_NO_SEPARATOR = new String[] {
+ "word", "shingle", "word", "shingle", "word", "shingle", "word",
+ "shingle", "word", "shingle", "word"
+ };
+
+ public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new Token[] {
+ createToken("pleasedivide", 0, 13),
+ createToken("dividethis", 7, 18),
+ createToken("thissentence", 14, 27),
+ createToken("sentenceinto", 19, 32),
+ createToken("intoshingles", 28, 39),
+ };
+
+ public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new int[] {
+ 1, 1, 1, 1, 1
+ };
+
+ public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR = new String[] {
+ "shingle", "shingle", "shingle", "shingle", "shingle"
+ };
+
+ public static final Token[] TRI_GRAM_TOKENS_NO_SEPARATOR = new Token[] {
+ createToken("please", 0, 6),
+ createToken("pleasedivide", 0, 13),
+ createToken("pleasedividethis", 0, 18),
+ createToken("divide", 7, 13),
+ createToken("dividethis", 7, 18),
+ createToken("dividethissentence", 7, 27),
+ createToken("this", 14, 18),
+ createToken("thissentence", 14, 27),
+ createToken("thissentenceinto", 14, 32),
+ createToken("sentence", 19, 27),
+ createToken("sentenceinto", 19, 32),
+ createToken("sentenceintoshingles", 19, 39),
+ createToken("into", 28, 32),
+ createToken("intoshingles", 28, 39),
+ createToken("shingles", 33, 39)
+ };
+
+ public static final int[] TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR = new int[] {
+ 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
+ };
+
+ public static final String[] TRI_GRAM_TYPES_NO_SEPARATOR = new String[] {
+ "word", "shingle", "shingle",
+ "word", "shingle", "shingle",
+ "word", "shingle", "shingle",
+ "word", "shingle", "shingle",
+ "word", "shingle",
+ "word"
+ };
+
+ public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new Token[] {
+ createToken("pleasedivide", 0, 13),
+ createToken("pleasedividethis", 0, 18),
+ createToken("dividethis", 7, 18),
+ createToken("dividethissentence", 7, 27),
+ createToken("thissentence", 14, 27),
+ createToken("thissentenceinto", 14, 32),
+ createToken("sentenceinto", 19, 32),
+ createToken("sentenceintoshingles", 19, 39),
+ createToken("intoshingles", 28, 39),
+ };
+
+ public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new int[] {
+ 1, 0, 1, 0, 1, 0, 1, 0, 1
+ };
+
+ public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR = new String[] {
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle",
+ };
+
+ public static final Token[] BI_GRAM_TOKENS_ALT_SEPARATOR = new Token[] {
+ createToken("please", 0, 6),
+ createToken("please<SEP>divide", 0, 13),
+ createToken("divide", 7, 13),
+ createToken("divide<SEP>this", 7, 18),
+ createToken("this", 14, 18),
+ createToken("this<SEP>sentence", 14, 27),
+ createToken("sentence", 19, 27),
+ createToken("sentence<SEP>into", 19, 32),
+ createToken("into", 28, 32),
+ createToken("into<SEP>shingles", 28, 39),
+ createToken("shingles", 33, 39),
+ };
+ public static final int[] BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR = new int[] {
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+ };
+
+ public static final String[] BI_GRAM_TYPES_ALT_SEPARATOR = new String[] {
+ "word", "shingle", "word", "shingle", "word", "shingle", "word",
+ "shingle", "word", "shingle", "word"
+ };
+
+ public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new Token[] {
+ createToken("please<SEP>divide", 0, 13),
+ createToken("divide<SEP>this", 7, 18),
+ createToken("this<SEP>sentence", 14, 27),
+ createToken("sentence<SEP>into", 19, 32),
+ createToken("into<SEP>shingles", 28, 39),
+ };
+
+ public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new int[] {
+ 1, 1, 1, 1, 1
+ };
+
+ public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new String[] {
+ "shingle", "shingle", "shingle", "shingle", "shingle"
+ };
+
+ public static final Token[] TRI_GRAM_TOKENS_ALT_SEPARATOR = new Token[] {
+ createToken("please", 0, 6),
+ createToken("please<SEP>divide", 0, 13),
+ createToken("please<SEP>divide<SEP>this", 0, 18),
+ createToken("divide", 7, 13),
+ createToken("divide<SEP>this", 7, 18),
+ createToken("divide<SEP>this<SEP>sentence", 7, 27),
+ createToken("this", 14, 18),
+ createToken("this<SEP>sentence", 14, 27),
+ createToken("this<SEP>sentence<SEP>into", 14, 32),
+ createToken("sentence", 19, 27),
+ createToken("sentence<SEP>into", 19, 32),
+ createToken("sentence<SEP>into<SEP>shingles", 19, 39),
+ createToken("into", 28, 32),
+ createToken("into<SEP>shingles", 28, 39),
+ createToken("shingles", 33, 39)
+ };
+
+ public static final int[] TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR = new int[] {
+ 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
+ };
+
+ public static final String[] TRI_GRAM_TYPES_ALT_SEPARATOR = new String[] {
+ "word", "shingle", "shingle",
+ "word", "shingle", "shingle",
+ "word", "shingle", "shingle",
+ "word", "shingle", "shingle",
+ "word", "shingle",
+ "word"
+ };
+
+ public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new Token[] {
+ createToken("please<SEP>divide", 0, 13),
+ createToken("please<SEP>divide<SEP>this", 0, 18),
+ createToken("divide<SEP>this", 7, 18),
+ createToken("divide<SEP>this<SEP>sentence", 7, 27),
+ createToken("this<SEP>sentence", 14, 27),
+ createToken("this<SEP>sentence<SEP>into", 14, 32),
+ createToken("sentence<SEP>into", 19, 32),
+ createToken("sentence<SEP>into<SEP>shingles", 19, 39),
+ createToken("into<SEP>shingles", 28, 39),
+ };
+
+ public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new int[] {
+ 1, 0, 1, 0, 1, 0, 1, 0, 1
+ };
+
+ public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new String[] {
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle",
+ };
+
+ public static final Token[] TRI_GRAM_TOKENS_NULL_SEPARATOR = new Token[] {
+ createToken("please", 0, 6),
+ createToken("pleasedivide", 0, 13),
+ createToken("pleasedividethis", 0, 18),
+ createToken("divide", 7, 13),
+ createToken("dividethis", 7, 18),
+ createToken("dividethissentence", 7, 27),
+ createToken("this", 14, 18),
+ createToken("thissentence", 14, 27),
+ createToken("thissentenceinto", 14, 32),
+ createToken("sentence", 19, 27),
+ createToken("sentenceinto", 19, 32),
+ createToken("sentenceintoshingles", 19, 39),
+ createToken("into", 28, 32),
+ createToken("intoshingles", 28, 39),
+ createToken("shingles", 33, 39)
+ };
+
+ public static final int[] TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR = new int[] {
+ 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
+ };
+
+ public static final String[] TRI_GRAM_TYPES_NULL_SEPARATOR = new String[] {
+ "word", "shingle", "shingle",
+ "word", "shingle", "shingle",
+ "word", "shingle", "shingle",
+ "word", "shingle", "shingle",
+ "word", "shingle",
+ "word"
+ };
+
@Override
protected void setUp() throws Exception {
super.setUp();
@@ -379,8 +733,110 @@
}
+ public void testTriGramFilterMinTriGram() throws IOException {
+ this.shingleFilterTest(3, 3, TEST_TOKEN, TRI_GRAM_TOKENS_MIN_TRI_GRAM,
+ TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM,
+ TRI_GRAM_TYPES_MIN_TRI_GRAM,
+ true);
+ }
+
+ public void testTriGramFilterWithoutUnigramsMinTriGram() throws IOException {
+ this.shingleFilterTest(3, 3, TEST_TOKEN,
+ TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
+ TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
+ TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
+ false);
+ }
+
+ public void testFourGramFilterMinTriGram() throws IOException {
+ this.shingleFilterTest(3, 4, TEST_TOKEN, FOUR_GRAM_TOKENS_MIN_TRI_GRAM,
+ FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM,
+ FOUR_GRAM_TYPES_MIN_TRI_GRAM,
+ true);
+ }
+
+ public void testFourGramFilterWithoutUnigramsMinTriGram() throws IOException {
+ this.shingleFilterTest(3, 4, TEST_TOKEN,
+ FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
+ FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
+ FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM, false);
+ }
+
+ public void testFourGramFilterMinFourGram() throws IOException {
+ this.shingleFilterTest(4, 4, TEST_TOKEN, FOUR_GRAM_TOKENS_MIN_FOUR_GRAM,
+ FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM,
+ FOUR_GRAM_TYPES_MIN_FOUR_GRAM,
+ true);
+ }
+
+ public void testFourGramFilterWithoutUnigramsMinFourGram() throws IOException {
+ this.shingleFilterTest(4, 4, TEST_TOKEN,
+ FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM,
+ FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM,
+ FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM, false);
+ }
+
+ public void testBiGramFilterNoSeparator() throws IOException {
+ this.shingleFilterTest("", 2, 2, TEST_TOKEN, BI_GRAM_TOKENS_NO_SEPARATOR,
+ BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR,
+ BI_GRAM_TYPES_NO_SEPARATOR, true);
+ }
+
+ public void testBiGramFilterWithoutUnigramsNoSeparator() throws IOException {
+ this.shingleFilterTest("", 2, 2, TEST_TOKEN,
+ BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
+ BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
+ BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR,
+ false);
+ }
+ public void testTriGramFilterNoSeparator() throws IOException {
+ this.shingleFilterTest("", 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_NO_SEPARATOR,
+ TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR,
+ TRI_GRAM_TYPES_NO_SEPARATOR, true);
+ }
+
+ public void testTriGramFilterWithoutUnigramsNoSeparator() throws IOException {
+ this.shingleFilterTest("", 2, 3, TEST_TOKEN,
+ TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
+ TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
+ TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR, false);
+ }
+
+ public void testBiGramFilterAltSeparator() throws IOException {
+ this.shingleFilterTest("<SEP>", 2, 2, TEST_TOKEN, BI_GRAM_TOKENS_ALT_SEPARATOR,
+ BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR,
+ BI_GRAM_TYPES_ALT_SEPARATOR, true);
+ }
+
+ public void testBiGramFilterWithoutUnigramsAltSeparator() throws IOException {
+ this.shingleFilterTest("<SEP>", 2, 2, TEST_TOKEN,
+ BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
+ BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
+ BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
+ false);
+ }
+ public void testTriGramFilterAltSeparator() throws IOException {
+ this.shingleFilterTest("<SEP>", 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_ALT_SEPARATOR,
+ TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR,
+ TRI_GRAM_TYPES_ALT_SEPARATOR, true);
+ }
+
+ public void testTriGramFilterWithoutUnigramsAltSeparator() throws IOException {
+ this.shingleFilterTest("<SEP>", 2, 3, TEST_TOKEN,
+ TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
+ TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
+ TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR, false);
+ }
+
+ public void testTriGramFilterNullSeparator() throws IOException {
+ this.shingleFilterTest(null, 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_NULL_SEPARATOR,
+ TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR,
+ TRI_GRAM_TYPES_NULL_SEPARATOR, true);
+ }
+
+
public void testReset() throws Exception {
- Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
+ Tokenizer wsTokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("please divide this sentence"));
TokenStream filter = new ShingleFilter(wsTokenizer, 2);
assertTokenStreamContents(filter,
new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
@@ -403,30 +859,50 @@
throws IOException {
ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+ shingleFilterTestCommon
+ (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+ }
+
+ protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
+ Token[] tokensToCompare, int[] positionIncrements,
+ String[] types, boolean outputUnigrams)
+ throws IOException {
+ ShingleFilter filter
+ = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+ shingleFilterTestCommon
+ (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+ }
+
+ protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle,
+ Token[] tokensToCompare, int[] positionIncrements,
+ String[] types, boolean outputUnigrams)
+ throws IOException {
+ ShingleFilter filter
+ = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+ filter.setTokenSeparator(tokenSeparator);
+ shingleFilterTestCommon
+ (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+ }
+
+ protected void shingleFilterTestCommon(ShingleFilter filter,
+ Token[] tokensToCompare,
+ int[] positionIncrements,
+ String[] types, boolean outputUnigrams)
+ throws IOException {
+
filter.setOutputUnigrams(outputUnigrams);
- TermAttribute termAtt = filter.addAttribute(TermAttribute.class);
- OffsetAttribute offsetAtt = filter.addAttribute(OffsetAttribute.class);
- PositionIncrementAttribute posIncrAtt = filter.addAttribute(PositionIncrementAttribute.class);
- TypeAttribute typeAtt = filter.addAttribute(TypeAttribute.class);
-
- int i = 0;
- while (filter.incrementToken()) {
- assertTrue("ShingleFilter outputted more tokens than expected", i < tokensToCompare.length);
- String termText = termAtt.term();
- String goldText = tokensToCompare[i].term();
- assertEquals("Wrong termText", goldText, termText);
- assertEquals("Wrong startOffset for token \"" + termText + "\"",
- tokensToCompare[i].startOffset(), offsetAtt.startOffset());
- assertEquals("Wrong endOffset for token \"" + termText + "\"",
- tokensToCompare[i].endOffset(), offsetAtt.endOffset());
- assertEquals("Wrong positionIncrement for token \"" + termText + "\"",
- positionIncrements[i], posIncrAtt.getPositionIncrement());
- assertEquals("Wrong type for token \"" + termText + "\"", types[i], typeAtt.type());
- i++;
+ String text[] = new String[tokensToCompare.length];
+ int startOffsets[] = new int[tokensToCompare.length];
+ int endOffsets[] = new int[tokensToCompare.length];
+
+ for (int i = 0; i < tokensToCompare.length; i++) {
+ text[i] = tokensToCompare[i].term();
+ startOffsets[i] = tokensToCompare[i].startOffset();
+ endOffsets[i] = tokensToCompare[i].endOffset();
}
- assertEquals("ShingleFilter outputted wrong # of tokens. (# output = " + i + "; # expected =" + tokensToCompare.length + ")",
- tokensToCompare.length, i);
+
+ assertTokenStreamContents(filter, text, startOffsets, endOffsets, types, positionIncrements);
}
private static Token createToken(String term, int start, int offset)
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java Fri Feb 26 13:09:54 2010
@@ -22,11 +22,8 @@
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
-import java.util.HashSet;
-import java.util.Arrays;
import org.apache.lucene.analysis.*;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
@@ -34,6 +31,7 @@
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix;
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column;
import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.util.Version;
public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
@@ -43,7 +41,7 @@
public void testIterator() throws IOException {
- WhitespaceTokenizer wst = new WhitespaceTokenizer(new StringReader("one two three four five"));
+ WhitespaceTokenizer wst = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("one two three four five"));
ShingleMatrixFilter smf = new ShingleMatrixFilter(wst, 2, 2, '_', false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
int i;
@@ -85,22 +83,12 @@
ts = new ShingleMatrixFilter(tls, 1, 2, new Character(' '), false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
-
- assertNext(ts, "please", 0, 6);
- assertNext(ts, "please divide", 0, 13);
- assertNext(ts, "divide", 7, 13);
- assertNext(ts, "divide this", 7, 18);
- assertNext(ts, "this", 14, 18);
- assertNext(ts, "this sentence", 14, 27);
- assertNext(ts, "sentence", 19, 27);
- assertNext(ts, "sentence into", 19, 32);
- assertNext(ts, "into", 28, 32);
- assertNext(ts, "into shingles", 28, 39);
- assertNext(ts, "shingles", 33, 39);
-
-
- assertFalse(ts.incrementToken());
-
+ assertTokenStreamContents(ts,
+ new String[] { "please", "please divide", "divide", "divide this",
+ "this", "this sentence", "sentence", "sentence into", "into",
+ "into shingles", "shingles" },
+ new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 28, 33 },
+ new int[] { 6, 13, 13, 18, 18, 27, 27, 32, 32, 39, 39 });
}
/**
@@ -546,6 +534,7 @@
return false;
}
Token prototype = (Token) iterator.next();
+ clearAttributes();
termAtt.setTermBuffer(prototype.termBuffer(), 0, prototype.termLength());
posIncrAtt.setPositionIncrement(prototype.getPositionIncrement());
flagsAtt.setFlags(prototype.getFlags());
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizerTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizerTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizerTest.java Fri Feb 26 13:09:54 2010
@@ -25,6 +25,7 @@
import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.TeeSinkTokenFilter.SinkTokenStream;
+import org.apache.lucene.util.Version;
public class DateRecognizerSinkTokenizerTest extends BaseTokenStreamTestCase {
@@ -36,7 +37,7 @@
public void test() throws IOException {
DateRecognizerSinkFilter sinkFilter = new DateRecognizerSinkFilter(new SimpleDateFormat("MM/dd/yyyy", Locale.US));
String test = "The quick red fox jumped over the lazy brown dogs on 7/11/2006 The dogs finally reacted on 7/12/2006";
- TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(new StringReader(test)));
+ TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)));
SinkTokenStream sink = tee.newSinkTokenStream(sinkFilter);
int count = 0;
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java Fri Feb 26 13:09:54 2010
@@ -23,6 +23,7 @@
import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.TeeSinkTokenFilter.SinkTokenStream;
+import org.apache.lucene.util.Version;
public class TokenRangeSinkTokenizerTest extends BaseTokenStreamTestCase {
@@ -34,7 +35,7 @@
public void test() throws IOException {
TokenRangeSinkFilter sinkFilter = new TokenRangeSinkFilter(2, 4);
String test = "The quick red fox jumped over the lazy brown dogs";
- TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(new StringReader(test)));
+ TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)));
SinkTokenStream rangeToks = tee.newSinkTokenStream(sinkFilter);
int count = 0;
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java Fri Feb 26 13:09:54 2010
@@ -27,6 +27,7 @@
import org.apache.lucene.analysis.TeeSinkTokenFilter.SinkTokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Version;
public class TokenTypeSinkTokenizerTest extends BaseTokenStreamTestCase {
@@ -39,7 +40,7 @@
TokenTypeSinkFilter sinkFilter = new TokenTypeSinkFilter("D");
String test = "The quick red fox jumped over the lazy brown dogs";
- TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))));
+ TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test))));
SinkTokenStream sink = ttf.newSinkTokenStream(sinkFilter);
boolean seenDogs = false;
Propchange: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/snowball/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Fri Feb 26 13:09:54 2010
@@ -0,0 +1 @@
+data
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java Fri Feb 26 13:09:54 2010
@@ -22,6 +22,7 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.util.Version;
/**
* Test the Turkish lowercase filter.
@@ -32,7 +33,7 @@
* Test composed forms
*/
public void testTurkishLowerCaseFilter() throws Exception {
- TokenStream stream = new WhitespaceTokenizer(new StringReader(
+ TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
"\u0130STANBUL \u0130ZM\u0130R ISPARTA"));
TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
assertTokenStreamContents(filter, new String[] {"istanbul", "izmir",
@@ -43,7 +44,7 @@
* Test decomposed forms
*/
public void testDecomposed() throws Exception {
- TokenStream stream = new WhitespaceTokenizer(new StringReader(
+ TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
"\u0049\u0307STANBUL \u0049\u0307ZM\u0049\u0307R ISPARTA"));
TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
assertTokenStreamContents(filter, new String[] {"istanbul", "izmir",
@@ -56,7 +57,7 @@
* to U+0130 + U+0316, and is lowercased the same way.
*/
public void testDecomposed2() throws Exception {
- TokenStream stream = new WhitespaceTokenizer(new StringReader(
+ TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
"\u0049\u0316\u0307STANBUL \u0049\u0307ZM\u0049\u0307R I\u0316SPARTA"));
TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
assertTokenStreamContents(filter, new String[] {"i\u0316stanbul", "izmir",
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java Fri Feb 26 13:09:54 2010
@@ -27,12 +27,7 @@
* <p>
* SmartChineseAnalyzer has a built-in dictionary and stopword list out-of-box.
* </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
- *
+ * @lucene.experimental
*/
public class AnalyzerProfile {
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java Fri Feb 26 13:09:54 2010
@@ -19,11 +19,7 @@
/**
* Internal SmartChineseAnalyzer character type constants.
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
*/
public class CharType {
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java Fri Feb 26 13:09:54 2010
@@ -31,11 +31,7 @@
* <p>
* The output tokens can then be broken into words with {@link WordTokenFilter}
* </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
*/
public final class SentenceTokenizer extends Tokenizer {
@@ -134,4 +130,11 @@
super.reset(input);
reset();
}
+
+ @Override
+ public void end() throws IOException {
+ // set final offset
+ final int finalOffset = correctOffset(tokenEnd);
+ offsetAtt.setOffset(finalOffset, finalOffset);
+ }
}
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java Fri Feb 26 13:09:54 2010
@@ -52,11 +52,7 @@
* The included dictionary data is from <a href="http://www.ictclas.org">ICTCLAS1.0</a>.
* Thanks to ICTCLAS for their hard work, and for contributing the data under the Apache 2 License!
* </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
*/
public final class SmartChineseAnalyzer extends Analyzer {
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java Fri Feb 26 13:09:54 2010
@@ -21,11 +21,7 @@
/**
* SmartChineseAnalyzer utility constants and methods
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
*/
public class Utility {
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java Fri Feb 26 13:09:54 2010
@@ -26,11 +26,7 @@
/**
* Segment a sentence of Chinese text into words.
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
*/
class WordSegmenter {
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java Fri Feb 26 13:09:54 2010
@@ -30,11 +30,7 @@
/**
* A {@link TokenFilter} that breaks sentences into words.
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
*/
public final class WordTokenFilter extends TokenFilter {
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordType.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordType.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordType.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordType.java Fri Feb 26 13:09:54 2010
@@ -19,11 +19,7 @@
/**
* Internal SmartChineseAnalyzer token type constants
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
*/
public class WordType {
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java Fri Feb 26 13:09:54 2010
@@ -26,11 +26,7 @@
* <p>
* Contains methods for dealing with GB2312 encoding.
* </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
*/
abstract class AbstractDictionary {
/**
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java Fri Feb 26 13:09:54 2010
@@ -30,11 +30,7 @@
* <p>
* For each start offset, a list of possible token pairs is stored.
* </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
*/
class BiSegGraph {
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java Fri Feb 26 13:09:54 2010
@@ -34,11 +34,7 @@
/**
* SmartChineseAnalyzer Bigram dictionary.
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
*/
class BigramDictionary extends AbstractDictionary {
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java Fri Feb 26 13:09:54 2010
@@ -26,11 +26,7 @@
/**
* Finds the optimal segmentation of a sentence into Chinese words
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
*/
public class HHMMSegmenter {
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java Fri Feb 26 13:09:54 2010
@@ -22,11 +22,7 @@
* <p>
* Used by {@link BiSegGraph} to maximize the segmentation with the Viterbi algorithm.
* </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
*/
class PathNode implements Comparable<PathNode> {
public double weight;
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java Fri Feb 26 13:09:54 2010
@@ -27,11 +27,7 @@
* <p>
* For each start offset, a list of possible tokens is stored.
* </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
*/
class SegGraph {
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java Fri Feb 26 13:09:54 2010
@@ -23,11 +23,7 @@
/**
* SmartChineseAnalyzer internal token
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
*/
public class SegToken {
/**
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java Fri Feb 26 13:09:54 2010
@@ -25,11 +25,7 @@
* Filters a {@link SegToken} by converting full-width latin to half-width, then lowercasing latin.
* Additionally, all punctuation is converted into {@link Utility#COMMON_DELIMITER}
* </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
*/
public class SegTokenFilter {
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java Fri Feb 26 13:09:54 2010
@@ -21,11 +21,7 @@
/**
* A pair of tokens in {@link SegGraph}
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
*/
class SegTokenPair {
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java Fri Feb 26 13:09:54 2010
@@ -35,12 +35,7 @@
/**
* SmartChineseAnalyzer Word Dictionary
- *
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
- * The APIs and file formats introduced here might change in the future and will not be
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
*/
class WordDictionary extends AbstractDictionary {
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/package.html
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/package.html?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/package.html (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/package.html Fri Feb 26 13:09:54 2010
@@ -23,11 +23,7 @@
SmartChineseAnalyzer Hidden Markov Model package.
</div>
<div>
-<font color="#FF0000">
-WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. The APIs
-and file formats introduced here might change in the future and will not be supported anymore
-in such a case.
-</font>
+@lucene.experimental
</div>
</body>
</html>
Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/package.html
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/package.html?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/package.html (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/package.html Fri Feb 26 13:09:54 2010
@@ -24,23 +24,19 @@
Analyzer for Simplified Chinese, which indexes words.
</div>
<div>
-<font color="#FF0000">
-WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. The APIs
-and file formats introduced here might change in the future and will not be supported anymore
-in such a case.
-</font>
+@lucene.experimental
</div>
<div>
Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
<ul>
- <li>ChineseAnalyzer (in the analyzers/cn package): Index unigrams (individual Chinese characters) as a token.
+ <li>StandardAnalyzer: Index unigrams (individual Chinese characters) as a token.
<li>CJKAnalyzer (in the analyzers/cjk package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
<li>SmartChineseAnalyzer (in this package): Index words (attempt to segment Chinese text into words) as tokens.
</ul>
Example phrase: "我是中国人"
<ol>
- <li>ChineseAnalyzer: 我，是，中，国，人</li>
+ <li>StandardAnalyzer: 我，是，中，国，人</li>
<li>CJKAnalyzer: 我是，是中，中国，国人</li>
<li>SmartChineseAnalyzer: 我，是，中国，人</li>
</ol>
Modified: lucene/java/branches/flex_1458/contrib/ant/src/java/org/apache/lucene/ant/TextDocument.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/ant/src/java/org/apache/lucene/ant/TextDocument.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/ant/src/java/org/apache/lucene/ant/TextDocument.java (original)
+++ lucene/java/branches/flex_1458/contrib/ant/src/java/org/apache/lucene/ant/TextDocument.java Fri Feb 26 13:09:54 2010
@@ -89,7 +89,7 @@
/**
*@return The contents value
- *@todo finish this method
+ *TODO: finish this method
*/
public String getContents() {
return contents;
Modified: lucene/java/branches/flex_1458/contrib/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/CHANGES.txt?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/CHANGES.txt (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/CHANGES.txt Fri Feb 26 13:09:54 2010
@@ -2,7 +2,19 @@
The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
-$Id:$
+2/21/2010
+ LUCENE-2254: Add support to the quality package for running
+ experiments with any combination of Title, Description, and Narrative.
+ (Robert Muir)
+
+1/28/2010
+ LUCENE-2223: Add a benchmark for ShingleFilter. You can wrap any
+ analyzer with ShingleAnalyzerWrapper and specify shingle parameters
+ with the NewShingleAnalyzer task. (Steven Rowe via Robert Muir)
+
+1/14/2010
+ LUCENE-2210: TrecTopicsReader now properly reads descriptions and
+ narratives from trec topics files. (Robert Muir)
1/11/2010
LUCENE-2181: Add a benchmark for collation. This adds NewLocaleTask,
Modified: lucene/java/branches/flex_1458/contrib/benchmark/build.xml
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/build.xml?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/build.xml (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/build.xml Fri Feb 26 13:09:54 2010
@@ -128,10 +128,10 @@
<path id="classpath">
<pathelement path="${common.dir}/build/classes/java"/>
<pathelement path="${common.dir}/build/classes/demo"/>
- <pathelement path="${common.dir}/build/classes/test"/>
<pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
<pathelement path="${common.dir}/build/contrib/memory/classes/java"/>
<pathelement path="${common.dir}/build/contrib/fast-vector-highlighter/classes/java"/>
+ <pathelement path="${common.dir}/build/contrib/analyzers/common/classes/java"/>
<fileset dir="lib">
<include name="**/*.jar"/>
</fileset>
@@ -145,7 +145,7 @@
<property name="task.alg" location="conf/micro-standard.alg"/>
<property name="task.mem" value="140M"/>
- <target name="run-task" depends="compile-test,check-files,get-files"
+ <target name="run-task" depends="compile,check-files,get-files"
description="Run compound penalty perf test (optional: -Dtask.alg=your-algorithm-file -Dtask.mem=java-max-mem)">
<echo>Working Directory: ${working.dir}</echo>
<java classname="org.apache.lucene.benchmark.byTask.Benchmark" maxmemory="${task.mem}" fork="true">
@@ -193,6 +193,32 @@
<echo>Benchmark output in JIRA table format is in file: ${collation.jira.output.file}</echo>
</target>
+ <property name="shingle.alg.file" location="conf/shingle.alg"/>
+ <property name="shingle.output.file"
+ value="${working.dir}/shingle.benchmark.output.txt"/>
+ <property name="shingle.jira.output.file"
+ value="${working.dir}/shingle.bm2jira.output.txt"/>
+
+ <path id="shingle.runtime.classpath">
+ <path refid="run.classpath"/>
+ </path>
+
+ <target name="shingle" depends="compile,compile-analyzers-common,get-files">
+ <echo>Running contrib/benchmark with alg file: ${shingle.alg.file}</echo>
+ <java fork="true" classname="org.apache.lucene.benchmark.byTask.Benchmark"
+ maxmemory="${task.mem}" output="${shingle.output.file}">
+ <classpath refid="run.classpath"/>
+ <arg file="${shingle.alg.file}"/>
+ </java>
+ <echo>Benchmark output is in file: ${shingle.output.file}</echo>
+ <echo>Converting to JIRA table format...</echo>
+ <exec executable="perl" output="${shingle.jira.output.file}" failonerror="true">
+ <arg value="scripts/shingle.bm2jira.pl"/>
+ <arg value="${shingle.output.file}"/>
+ </exec>
+ <echo>Benchmark output in JIRA table format is in file: ${shingle.jira.output.file}</echo>
+ </target>
+
<target name="compile-demo">
<subant target="compile-demo">
<fileset dir="${common.dir}" includes="build.xml"/>
@@ -208,6 +234,11 @@
<fileset dir="${common.dir}/contrib/icu" includes="build.xml"/>
</subant>
</target>
+ <target name="compile-analyzers-common">
+ <subant target="compile">
+ <fileset dir="${common.dir}/contrib/analyzers/common" includes="build.xml"/>
+ </subant>
+ </target>
<target name="compile-memory">
<subant target="compile">
<fileset dir="${common.dir}/contrib/memory" includes="build.xml"/>
@@ -219,11 +250,6 @@
</subant>
</target>
- <target name="init" depends="common.init,compile-demo,compile-memory,compile-highlighter,compile-vector-highlighter,check-files"/>
-
- <!-- make sure online collections (reuters) are first downloaded -->
- <target name="test" depends="init,get-files">
- <antcall target="common.test" inheritRefs="true" />
- </target>
+ <target name="init" depends="common.init,compile-demo,compile-memory,compile-highlighter,compile-vector-highlighter"/>
</project>
Added: lucene/java/branches/flex_1458/contrib/benchmark/conf/basicNRT.alg
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/conf/basicNRT.alg?rev=916666&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/conf/basicNRT.alg (added)
+++ lucene/java/branches/flex_1458/contrib/benchmark/conf/basicNRT.alg Fri Feb 26 13:09:54 2010
@@ -0,0 +1,80 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements. See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+# multi val params are iterated by NewRound's, added to reports, start with column name.
+#
+# based on micro-standard
+#
+# modified to use wikipedia sources and index entire docs
+# currently just used to measure ingest rate
+
+analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
+directory=FSDirectory
+
+work.dir = /x/lucene/wiki.5M
+
+doc.stored=true
+doc.body.stored=false
+doc.tokenized=false
+doc.body.tokenized=true
+doc.term.vector=false
+log.step.AddDoc = 10000
+log.step.Search = 10000
+compound = false
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource
+content.source.forever = false
+file.query.maker.file = queries.txt
+
+query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker
+docs.file = /x/lucene/enwiki-20090306-lines-1k-fixed.txt
+
+# task at this depth or less would print when they start
+task.max.depth.log=2
+
+log.queries=true
+
+# -------------------------------------------------------------------------------------
+
+# Open a writer
+OpenIndex
+{
+ # Get a new near-real-time reader, once per second:
+ NearRealtimeReader(1.0) &
+
+ # Warm
+ Search
+
+ # Index with 2 threads, each adding 100 docs per sec
+ [ "Indexing" { AddDoc > : * : 100/sec ] : 2 &
+
+ # Redline search (from queries.txt) with 4 threads
+ [ "Searching" { Search > : * ] : 4 &
+
+ # Wait 5 sec, then wrap up
+ Wait(5.0)
+}
+CloseReader
+
+# Don't keep any changes, so we can re-test on the same index again
+RollbackIndex
+
+RepSumByPref Indexing
+RepSumByPref Searching
+RepSumByPref NearRealtimeReader
+
+
Propchange: lucene/java/branches/flex_1458/contrib/benchmark/conf/basicNRT.alg
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/branches/flex_1458/contrib/benchmark/conf/collation.alg
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/conf/collation.alg?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/conf/collation.alg (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/conf/collation.alg Fri Feb 26 13:09:54 2010
@@ -1,97 +1,97 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource
-content.source.encoding=UTF-8
-doc.tokenized=false
-doc.body.tokenized=true
-docs.file=work/top100k-out/top.fr.wikipedia.words.txt
-content.source.forever=false
-log.step=100000
-
-{ "Rounds"
- -NewAnalyzer(KeywordAnalyzer)
- -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
- ResetInputs
- { "FrenchKeyword" { ReadTokens > : * ResetInputs } : 10
-
- -NewAnalyzer(KeywordAnalyzer)
- -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
- ResetInputs
- { "GermanKeyword" { ReadTokens > : * ResetInputs } : 10
-
- -NewAnalyzer(KeywordAnalyzer)
- -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
- ResetInputs
- { "UkrainianKeyword" { ReadTokens > : * ResetInputs } : 10
-
- -NewAnalyzer(KeywordAnalyzer)
- -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
- ResetInputs
- { "EnglishKeyword" { ReadTokens > : * ResetInputs } : 10
-
- -NewLocale(fr)
- -NewCollationAnalyzer
- -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
- ResetInputs
- { "FrenchJDK" { ReadTokens > : * ResetInputs } : 10
-
- -NewLocale(de)
- -NewCollationAnalyzer
- -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
- ResetInputs
- { "GermanJDK" { ReadTokens > : * ResetInputs } : 10
-
- -NewLocale(uk)
- -NewCollationAnalyzer
- -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
- ResetInputs
- { "UkrainianJDK" { ReadTokens > : * ResetInputs } : 10
-
- -NewLocale(en)
- -NewCollationAnalyzer
- -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
- ResetInputs
- { "EnglishJDK" { ReadTokens > : * ResetInputs } : 10
-
- -NewLocale(fr)
- -NewCollationAnalyzer(impl:icu)
- -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
- ResetInputs
- { "FrenchICU" { ReadTokens > : * ResetInputs } : 10
-
- -NewLocale(de)
- -NewCollationAnalyzer(impl:icu)
- -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
- ResetInputs
- { "GermanICU" { ReadTokens > : * ResetInputs } : 10
-
- -NewLocale(uk)
- -NewCollationAnalyzer(impl:icu)
- -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
- ResetInputs
- { "UkrainianICU" { ReadTokens > : * ResetInputs } : 10
-
- -NewLocale(en)
- -NewCollationAnalyzer(impl:icu)
- -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
- ResetInputs
- { "EnglishICU" { ReadTokens > : * ResetInputs } : 10
-
- NewRound
-
-} : 5
-
-RepSumByNameRound
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource
+content.source.encoding=UTF-8
+doc.tokenized=false
+doc.body.tokenized=true
+docs.file=work/top100k-out/top.fr.wikipedia.words.txt
+content.source.forever=false
+log.step=100000
+
+{ "Rounds"
+ -NewAnalyzer(KeywordAnalyzer)
+ -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
+ ResetInputs
+ { "FrenchKeyword" { ReadTokens > : * ResetInputs } : 10
+
+ -NewAnalyzer(KeywordAnalyzer)
+ -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
+ ResetInputs
+ { "GermanKeyword" { ReadTokens > : * ResetInputs } : 10
+
+ -NewAnalyzer(KeywordAnalyzer)
+ -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
+ ResetInputs
+ { "UkrainianKeyword" { ReadTokens > : * ResetInputs } : 10
+
+ -NewAnalyzer(KeywordAnalyzer)
+ -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
+ ResetInputs
+ { "EnglishKeyword" { ReadTokens > : * ResetInputs } : 10
+
+ -NewLocale(fr)
+ -NewCollationAnalyzer
+ -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
+ ResetInputs
+ { "FrenchJDK" { ReadTokens > : * ResetInputs } : 10
+
+ -NewLocale(de)
+ -NewCollationAnalyzer
+ -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
+ ResetInputs
+ { "GermanJDK" { ReadTokens > : * ResetInputs } : 10
+
+ -NewLocale(uk)
+ -NewCollationAnalyzer
+ -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
+ ResetInputs
+ { "UkrainianJDK" { ReadTokens > : * ResetInputs } : 10
+
+ -NewLocale(en)
+ -NewCollationAnalyzer
+ -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
+ ResetInputs
+ { "EnglishJDK" { ReadTokens > : * ResetInputs } : 10
+
+ -NewLocale(fr)
+ -NewCollationAnalyzer(impl:icu)
+ -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
+ ResetInputs
+ { "FrenchICU" { ReadTokens > : * ResetInputs } : 10
+
+ -NewLocale(de)
+ -NewCollationAnalyzer(impl:icu)
+ -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
+ ResetInputs
+ { "GermanICU" { ReadTokens > : * ResetInputs } : 10
+
+ -NewLocale(uk)
+ -NewCollationAnalyzer(impl:icu)
+ -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
+ ResetInputs
+ { "UkrainianICU" { ReadTokens > : * ResetInputs } : 10
+
+ -NewLocale(en)
+ -NewCollationAnalyzer(impl:icu)
+ -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
+ ResetInputs
+ { "EnglishICU" { ReadTokens > : * ResetInputs } : 10
+
+ NewRound
+
+} : 5
+
+RepSumByNameRound
Propchange: lucene/java/branches/flex_1458/contrib/benchmark/conf/collation.alg
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/java/branches/flex_1458/contrib/benchmark/conf/highlight-vs-vector-highlight.alg
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/java/branches/flex_1458/contrib/benchmark/conf/vector-highlight-profile.alg
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NearRealtimeReaderTask.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NearRealtimeReaderTask.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NearRealtimeReaderTask.java (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NearRealtimeReaderTask.java Fri Feb 26 13:09:54 2010
@@ -29,8 +29,7 @@
* single query (body: 1) sorted by docdate, and prints
* time to reopen and time to run the search.
*
- * <b>NOTE</b>: this is very experimental at this point, and
- * subject to change. It's also not generally usable, eg
+ * @lucene.experimental It's also not generally usable, eg
* you cannot change which query is executed.
*/
public class NearRealtimeReaderTask extends PerfTask {
@@ -92,6 +91,7 @@
r = newReader;
}
}
+ stopNow = false;
return reopenCount;
}
Modified: lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java Fri Feb 26 13:09:54 2010
@@ -134,7 +134,6 @@
* @return number of work items done by this task.
*/
public final int runAndMaybeStats(boolean reportStats) throws Exception {
- stopNow = false;
if (!reportStats || shouldNotRecordStats()) {
setup();
int count = doLogic();
Modified: lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java Fri Feb 26 13:09:54 2010
@@ -230,6 +230,8 @@
getRunData().getPoints().getCurrentStats().setCountsByTime(countsByTime, logByTimeMsec);
}
+ stopNow = false;
+
return count;
}
@@ -276,6 +278,7 @@
}
}
}
+ stopNow = false;
return count;
}
Modified: lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/QueryDriver.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/QueryDriver.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/QueryDriver.java (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/QueryDriver.java Fri Feb 26 13:09:54 2010
@@ -30,6 +30,8 @@
import java.io.File;
import java.io.FileReader;
import java.io.PrintWriter;
+import java.util.HashSet;
+import java.util.Set;
/**
@@ -38,12 +40,14 @@
**/
public class QueryDriver {
public static void main(String[] args) throws Exception {
- if (args.length != 4) {
- System.err.println("Usage: QueryDriver <topicsFile> <qrelsFile> <submissionFile> <indexDir>");
+ if (args.length < 4 || args.length > 5) {
+ System.err.println("Usage: QueryDriver <topicsFile> <qrelsFile> <submissionFile> <indexDir> [querySpec]");
System.err.println("topicsFile: input file containing queries");
System.err.println("qrelsFile: input file containing relevance judgements");
System.err.println("submissionFile: output submission file for trec_eval");
System.err.println("indexDir: index directory");
+ System.err.println("querySpec: string composed of fields to use in query consisting of T=title,D=description,N=narrative:");
+ System.err.println("\texample: TD (query on Title + Description). The default is T (title only)");
System.exit(1);
}
@@ -51,6 +55,7 @@
File qrelsFile = new File(args[1]);
SubmissionReport submitLog = new SubmissionReport(new PrintWriter(args[2]), "lucene");
FSDirectory dir = FSDirectory.open(new File(args[3]));
+ String fieldSpec = args.length == 5 ? args[4] : "T"; // default to Title-only if not specified.
Searcher searcher = new IndexSearcher(dir, true);
int maxResults = 1000;
@@ -68,8 +73,13 @@
// validate topics & judgments match each other
judge.validateData(qqs, logger);
+ Set<String> fieldSet = new HashSet<String>();
+ if (fieldSpec.indexOf('T') >= 0) fieldSet.add("title");
+ if (fieldSpec.indexOf('D') >= 0) fieldSet.add("description");
+ if (fieldSpec.indexOf('N') >= 0) fieldSet.add("narrative");
+
// set the parsing of quality queries into Lucene queries.
- QualityQueryParser qqParser = new SimpleQQParser("title", "body");
+ QualityQueryParser qqParser = new SimpleQQParser(fieldSet.toArray(new String[0]), "body");
// run the benchmark
QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, docNameField);
Modified: lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecTopicsReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecTopicsReader.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecTopicsReader.java (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecTopicsReader.java Fri Feb 26 13:09:54 2010
@@ -76,16 +76,31 @@
k = sb.indexOf(">");
String title = sb.substring(k+1).trim();
// description
- sb = read(reader,"<desc>",null,false,false);
- sb = read(reader,"<narr>",null,false,true);
- String descripion = sb.toString().trim();
+ read(reader,"<desc>",null,false,false);
+ sb.setLength(0);
+ String line = null;
+ while ((line = reader.readLine()) != null) {
+ if (line.startsWith("<narr>"))
+ break;
+ if (sb.length() > 0) sb.append(' ');
+ sb.append(line);
+ }
+ String description = sb.toString().trim();
+ // narrative
+ sb.setLength(0);
+ while ((line = reader.readLine()) != null) {
+ if (line.startsWith("</top>"))
+ break;
+ if (sb.length() > 0) sb.append(' ');
+ sb.append(line);
+ }
+ String narrative = sb.toString().trim();
// we got a topic!
fields.put("title",title);
- fields.put("description",descripion);
+ fields.put("description",description);
+ fields.put("narrative", narrative);
QualityQuery topic = new QualityQuery(id,fields);
res.add(topic);
- // skip narrative, get to end of doc
- read(reader,"</top>",null,false,false);
}
} finally {
reader.close();
Modified: lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java Fri Feb 26 13:09:54 2010
@@ -21,27 +21,38 @@
import org.apache.lucene.benchmark.quality.QualityQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
/**
* Simplistic quality query parser. A Lucene query is created by passing
- * the value of the specified QualityQuery name-value pair into
+ * the value of the specified QualityQuery name-value pair(s) into
* a Lucene's QueryParser using StandardAnalyzer. */
public class SimpleQQParser implements QualityQueryParser {
- private String qqName;
+ private String qqNames[];
private String indexField;
ThreadLocal<QueryParser> queryParser = new ThreadLocal<QueryParser>();
/**
* Constructor of a simple qq parser.
+ * @param qqNames name-value pairs of quality query to use for creating the query
+ * @param indexField corresponding index field
+ */
+ public SimpleQQParser(String qqNames[], String indexField) {
+ this.qqNames = qqNames;
+ this.indexField = indexField;
+ }
+
+ /**
+ * Constructor of a simple qq parser.
* @param qqName name-value pair of quality query to use for creating the query
* @param indexField corresponding index field
*/
public SimpleQQParser(String qqName, String indexField) {
- this.qqName = qqName;
- this.indexField = indexField;
+ this(new String[] { qqName }, indexField);
}
/* (non-Javadoc)
@@ -53,7 +64,11 @@
qp = new QueryParser(Version.LUCENE_CURRENT, indexField, new StandardAnalyzer(Version.LUCENE_CURRENT));
queryParser.set(qp);
}
- return qp.parse(qq.getValue(qqName));
+ BooleanQuery bq = new BooleanQuery();
+ for (int i = 0; i < qqNames.length; i++)
+ bq.add(qp.parse(QueryParser.escape(qq.getValue(qqNames[i]))), BooleanClause.Occur.SHOULD);
+
+ return bq;
}
}