You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/05/14 15:51:59 UTC
svn commit: r1103112 [18/24] - in /lucene/dev/branches/flexscoring: ./
dev-tools/eclipse/ dev-tools/idea/.idea/ dev-tools/idea/lucene/contrib/ant/
dev-tools/idea/lucene/contrib/db/bdb-je/
dev-tools/idea/lucene/contrib/db/bdb/ dev-tools/idea/lucene/cont...
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java Sat May 14 13:51:35 2011
@@ -50,4 +50,9 @@ public class TestIndonesianAnalyzer exte
checkOneTermReuse(a, "peledakan", "peledakan");
checkOneTermReuse(a, "pembunuhan", "bunuh");
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, new IndonesianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java Sat May 14 13:51:35 2011
@@ -23,6 +23,7 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
@@ -50,4 +51,23 @@ public class TestItalianAnalyzer extends
checkOneTermReuse(a, "abbandonata", "abbandonata");
checkOneTermReuse(a, "abbandonati", "abbandon");
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, new ItalianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+ }
+
+ /** test that the elisionfilter is working */
+ public void testContractions() throws IOException {
+ Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT);
+ assertAnalyzesTo(a, "dell'Italia", new String[] { "ital" });
+ assertAnalyzesTo(a, "l'Italiano", new String[] { "ital" });
+ }
+
+ /** test that we don't enable this before 3.2*/
+ public void testContractionsBackwards() throws IOException {
+ Analyzer a = new ItalianAnalyzer(Version.LUCENE_31);
+ assertAnalyzesTo(a, "dell'Italia", new String[] { "dell'ital" });
+ assertAnalyzesTo(a, "l'Italiano", new String[] { "l'ital" });
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java Sat May 14 13:51:35 2011
@@ -45,4 +45,9 @@ public class TestItalianLightStemFilter
public void testVocabulary() throws IOException {
assertVocabulary(analyzer, getDataFile("itlighttestdata.zip"), "itlight.txt");
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenCountAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenCountAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenCountAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenCountAnalyzer.java Sat May 14 13:51:35 2011
@@ -51,7 +51,7 @@ public class TestLimitTokenCountAnalyzer
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
- TEST_VERSION_CURRENT, new LimitTokenCountAnalyzer(new MockAnalyzer(), 100000)));
+ TEST_VERSION_CURRENT, new LimitTokenCountAnalyzer(new MockAnalyzer(random), 100000)));
Document doc = new Document();
StringBuilder b = new StringBuilder();
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java Sat May 14 13:51:35 2011
@@ -185,4 +185,9 @@ public class TestDutchStemmer extends Ba
checkOneTerm(new DutchAnalyzer(TEST_VERSION_CURRENT), input, expected);
}
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, new DutchAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+ }
+
}
\ No newline at end of file
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java Sat May 14 13:51:35 2011
@@ -50,4 +50,9 @@ public class TestNorwegianAnalyzer exten
checkOneTermReuse(a, "havnedistriktene", "havnedistriktene");
checkOneTermReuse(a, "havnedistrikter", "havnedistrikt");
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, new NorwegianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java Sat May 14 13:51:35 2011
@@ -127,4 +127,70 @@ public class TestPathHierarchyTokenizer
new int[]{1, 0, 0, 0},
path.length());
}
+
+ public void testBasicSkip() throws Exception {
+ String path = "/a/b/c";
+ PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
+ assertTokenStreamContents(t,
+ new String[]{"/b", "/b/c"},
+ new int[]{2, 2},
+ new int[]{4, 6},
+ new int[]{1, 0},
+ path.length());
+ }
+
+ public void testEndOfDelimiterSkip() throws Exception {
+ String path = "/a/b/c/";
+ PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
+ assertTokenStreamContents(t,
+ new String[]{"/b", "/b/c", "/b/c/"},
+ new int[]{2, 2, 2},
+ new int[]{4, 6, 7},
+ new int[]{1, 0, 0},
+ path.length());
+ }
+
+ public void testStartOfCharSkip() throws Exception {
+ String path = "a/b/c";
+ PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
+ assertTokenStreamContents(t,
+ new String[]{"/b", "/b/c"},
+ new int[]{1, 1},
+ new int[]{3, 5},
+ new int[]{1, 0},
+ path.length());
+ }
+
+ public void testStartOfCharEndOfDelimiterSkip() throws Exception {
+ String path = "a/b/c/";
+ PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
+ assertTokenStreamContents(t,
+ new String[]{"/b", "/b/c", "/b/c/"},
+ new int[]{1, 1, 1},
+ new int[]{3, 5, 6},
+ new int[]{1, 0, 0},
+ path.length());
+ }
+
+ public void testOnlyDelimiterSkip() throws Exception {
+ String path = "/";
+ PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
+ assertTokenStreamContents(t,
+ new String[]{},
+ new int[]{},
+ new int[]{},
+ new int[]{},
+ path.length());
+ }
+
+ public void testOnlyDelimitersSkip() throws Exception {
+ String path = "//";
+ PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
+ assertTokenStreamContents(t,
+ new String[]{"/"},
+ new int[]{1},
+ new int[]{2},
+ new int[]{1},
+ path.length());
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java Sat May 14 13:51:35 2011
@@ -50,4 +50,9 @@ public class TestPortugueseAnalyzer exte
checkOneTermReuse(a, "quilométricas", "quilométricas");
checkOneTermReuse(a, "quilométricos", "quilométr");
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, new PortugueseAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java Sat May 14 13:51:35 2011
@@ -92,4 +92,9 @@ public class TestPortugueseLightStemFilt
public void testVocabulary() throws IOException {
assertVocabulary(analyzer, getDataFile("ptlighttestdata.zip"), "ptlight.txt");
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java Sat May 14 13:51:35 2011
@@ -66,4 +66,9 @@ public class TestPortugueseMinimalStemFi
public void testVocabulary() throws IOException {
assertVocabulary(analyzer, getDataFile("ptminimaltestdata.zip"), "ptminimal.txt");
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java Sat May 14 13:51:35 2011
@@ -66,4 +66,9 @@ public class TestPortugueseStemFilter ex
public void testVocabulary() throws IOException {
assertVocabulary(analyzer, getDataFile("ptrslptestdata.zip"), "ptrslp.txt");
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java Sat May 14 13:51:35 2011
@@ -50,4 +50,9 @@ public class TestRomanianAnalyzer extend
checkOneTermReuse(a, "absenţa", "absenţa");
checkOneTermReuse(a, "absenţi", "absenţ");
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, new RomanianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java Sat May 14 13:51:35 2011
@@ -64,4 +64,9 @@ public class TestRussianAnalyzer extends
new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представление" });
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, new RussianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java Sat May 14 13:51:35 2011
@@ -45,4 +45,9 @@ public class TestRussianLightStemFilter
public void testVocabulary() throws IOException {
assertVocabulary(analyzer, getDataFile("rulighttestdata.zip"), "rulight.txt");
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java Sat May 14 13:51:35 2011
@@ -23,6 +23,7 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.hu.HungarianAnalyzer;
public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
@@ -50,4 +51,9 @@ public class TestSwedishAnalyzer extends
checkOneTermReuse(a, "jaktkarlarne", "jaktkarlarne");
checkOneTermReuse(a, "jaktkarlens", "jaktkarl");
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, new SwedishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java Sat May 14 13:51:35 2011
@@ -45,4 +45,9 @@ public class TestSwedishLightStemFilter
public void testVocabulary() throws IOException {
assertVocabulary(analyzer, getDataFile("svlighttestdata.zip"), "svlight.txt");
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java Sat May 14 13:51:35 2011
@@ -17,7 +17,11 @@ package org.apache.lucene.analysis.th;
* limitations under the License.
*/
+import java.io.StringReader;
+
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.util.Version;
/**
@@ -142,5 +146,23 @@ public class TestThaiAnalyzer extends Ba
analyzer,
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
- }
+ }
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, new ThaiAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+ }
+
+ // LUCENE-3044
+ public void testAttributeReuse() throws Exception {
+ assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
+ ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
+ // just consume
+ TokenStream ts = analyzer.reusableTokenStream("dummy", new StringReader("ภาษาไทย"));
+ assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
+ // this consumer adds flagsAtt, which this analyzer does not use.
+ ts = analyzer.reusableTokenStream("dummy", new StringReader("ภาษาไทย"));
+ ts.addAttribute(FlagsAttribute.class);
+ assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java Sat May 14 13:51:35 2011
@@ -50,4 +50,9 @@ public class TestTurkishAnalyzer extends
checkOneTermReuse(a, "ağacı", "ağacı");
checkOneTermReuse(a, "ağaç", "ağaç");
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, new TurkishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java Sat May 14 13:51:35 2011
@@ -186,7 +186,7 @@ public abstract class CollationTestBase
String dkResult) throws Exception {
RAMDirectory indexStore = new RAMDirectory();
IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false)));
+ TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)));
// document data:
// the tracer field is used to determine which document was hit
@@ -257,27 +257,6 @@ public abstract class CollationTestBase
}
assertEquals(expectedResult, buff.toString());
}
-
- private String randomString() {
- // ideally we could do this!
- // return _TestUtil.randomUnicodeString(random);
- //
- // http://bugs.icu-project.org/trac/ticket/8060
- // http://bugs.icu-project.org/trac/ticket/7732
- // ...
- //
- // as a workaround, just test the BMP for now (and avoid 0xFFFF etc)
- int length = _TestUtil.nextInt(random, 0, 10);
- char chars[] = new char[length];
- for (int i = 0; i < length; i++) {
- if (random.nextBoolean()) {
- chars[i] = (char) _TestUtil.nextInt(random, 0, 0xD7FF);
- } else {
- chars[i] = (char) _TestUtil.nextInt(random, 0xE000, 0xFFFD);
- }
- }
- return new String(chars, 0, length);
- }
public void assertThreadSafe(final Analyzer analyzer) throws Exception {
int numTestPoints = 100;
@@ -289,7 +268,7 @@ public abstract class CollationTestBase
// and ensure they are the same as the ones we produced in serial fashion.
for (int i = 0; i < numTestPoints; i++) {
- String term = randomString();
+ String term = _TestUtil.randomSimpleString(random);
TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
BytesRef bytes = termAtt.getBytesRef();
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java Sat May 14 13:51:35 2011
@@ -87,8 +87,7 @@ public class TestCollationKeyAnalyzer ex
public void testThreadSafe() throws Exception {
int iters = 20 * RANDOM_MULTIPLIER;
for (int i = 0; i < iters; i++) {
- Locale locale = randomLocale(random);
- Collator collator = Collator.getInstance(locale);
+ Collator collator = Collator.getInstance(Locale.GERMAN);
collator.setStrength(Collator.PRIMARY);
assertThreadSafe(new CollationKeyAnalyzer(TEST_VERSION_CURRENT, collator));
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/icu/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/icu/build.xml?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/icu/build.xml (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/icu/build.xml Sat May 14 13:51:35 2011
@@ -137,4 +137,20 @@ are part of the ICU4C package. See http:
<m2-deploy-with-pom-template pom.xml="lib/lucene-icu4j-pom.xml.template"
jar.file="lib/icu4j-4_6.jar" />
</target>
+
+ <target name="javadocs" depends="compile-core">
+ <sequential>
+ <mkdir dir="${javadoc.dir}/contrib-${name}"/>
+ <invoke-javadoc
+ destdir="${javadoc.dir}/contrib-${name}"
+ title="${Name} ${version} contrib-${name} API">
+ <sources>
+ <link href="../contrib-analyzers-common"/>
+ <link href=""/>
+ <packageset dir="${src.dir}"/>
+ </sources>
+ </invoke-javadoc>
+ <jarify basedir="${javadoc.dir}/contrib-${name}" destfile="${build.dir}/${final.name}-javadoc.jar"/>
+ </sequential>
+ </target>
</project>
Modified: lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilter.java Sat May 14 13:51:35 2011
@@ -29,15 +29,14 @@ import org.apache.lucene.analysis.core.W
* Tests ICUFoldingFilter
*/
public class TestICUFoldingFilter extends BaseTokenStreamTestCase {
+ Analyzer a = new Analyzer() {
+ @Override
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ return new ICUFoldingFilter(
+ new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader));
+ }
+ };
public void testDefaults() throws IOException {
- Analyzer a = new Analyzer() {
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- return new ICUFoldingFilter(
- new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader));
- }
- };
-
// case folding
assertAnalyzesTo(a, "This is a test", new String[] { "this", "is", "a", "test" });
@@ -76,4 +75,9 @@ public class TestICUFoldingFilter extend
// handling of decomposed combining-dot-above
assertAnalyzesTo(a, "eli\u0307f", new String[] { "elif" });
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2Filter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2Filter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2Filter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2Filter.java Sat May 14 13:51:35 2011
@@ -31,16 +31,15 @@ import com.ibm.icu.text.Normalizer2;
* Tests the ICUNormalizer2Filter
*/
public class TestICUNormalizer2Filter extends BaseTokenStreamTestCase {
+ Analyzer a = new Analyzer() {
+ @Override
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ return new ICUNormalizer2Filter(
+ new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader));
+ }
+ };
public void testDefaults() throws IOException {
- Analyzer a = new Analyzer() {
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- return new ICUNormalizer2Filter(
- new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader));
- }
- };
-
// case folding
assertAnalyzesTo(a, "This is a test", new String[] { "this", "is", "a", "test" });
@@ -75,4 +74,9 @@ public class TestICUNormalizer2Filter ex
// decompose EAcute into E + combining Acute
assertAnalyzesTo(a, "\u00E9", new String[] { "\u0065\u0301" });
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java Sat May 14 13:51:35 2011
@@ -18,10 +18,15 @@ package org.apache.lucene.analysis.icu;
*/
import java.io.IOException;
+import java.io.Reader;
import java.io.StringReader;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import com.ibm.icu.text.Transliterator;
@@ -83,4 +88,17 @@ public class TestICUTransformFilter exte
TokenStream ts = new ICUTransformFilter(new KeywordTokenizer((new StringReader(input))), transform);
assertTokenStreamContents(ts, new String[] { expected });
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ final Transliterator transform = Transliterator.getInstance("Any-Latin");
+ Analyzer a = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(tokenizer, new ICUTransformFilter(tokenizer, transform));
+ }
+ };
+ checkRandomData(random, a, 1000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java Sat May 14 13:51:35 2011
@@ -232,4 +232,9 @@ public class TestICUTokenizer extends Ba
new String[] { "仮", "名", "遣", "い", "カタカナ" },
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java Sat May 14 13:51:35 2011
@@ -19,6 +19,7 @@ package org.apache.lucene.collation;
import com.ibm.icu.text.Collator;
+
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.util.BytesRef;
@@ -88,7 +89,7 @@ public class TestICUCollationKeyAnalyzer
public void testThreadSafe() throws Exception {
int iters = 20 * RANDOM_MULTIPLIER;
for (int i = 0; i < iters; i++) {
- Locale locale = randomLocale(random);
+ Locale locale = Locale.GERMAN;
Collator collator = Collator.getInstance(locale);
collator.setStrength(Collator.IDENTICAL);
assertThreadSafe(new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, collator));
Modified: lucene/dev/branches/flexscoring/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java Sat May 14 13:51:35 2011
@@ -75,7 +75,7 @@ class SegGraph {
List<SegToken> result = new ArrayList<SegToken>();
int s = -1, count = 0, size = tokenListTable.size();
List<SegToken> tokenList;
- short index = 0;
+ int index = 0;
while (count < size) {
if (isStartExist(s)) {
tokenList = tokenListTable.get(s);
Modified: lucene/dev/branches/flexscoring/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java Sat May 14 13:51:35 2011
@@ -17,8 +17,11 @@
package org.apache.lucene.analysis.cn.smart;
+import java.io.StringReader;
+
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
@@ -166,4 +169,35 @@ public class TestSmartChineseAnalyzer ex
new int[] { 0, 1, 3, 4, 6, 7 },
new int[] { 1, 3, 4, 6, 7, 9 });
}
+
+ // LUCENE-3026
+ public void testLargeDocument() throws Exception {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < 5000; i++) {
+ sb.append("我购买了道具和服装。");
+ }
+ Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
+ TokenStream stream = analyzer.reusableTokenStream("", new StringReader(sb.toString()));
+ stream.reset();
+ while (stream.incrementToken()) {
+ }
+ }
+
+ // LUCENE-3026
+ public void testLargeSentence() throws Exception {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < 5000; i++) {
+ sb.append("我购买了道具和服装");
+ }
+ Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
+ TokenStream stream = analyzer.reusableTokenStream("", new StringReader(sb.toString()));
+ stream.reset();
+ while (stream.incrementToken()) {
+ }
+ }
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java Sat May 14 13:51:35 2011
@@ -50,4 +50,9 @@ public class TestPolishAnalyzer extends
checkOneTermReuse(a, "studenta", "studenta");
checkOneTermReuse(a, "studenci", "student");
}
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, new PolishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+ }
}
Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java Sat May 14 13:51:35 2011
@@ -1,136 +1,136 @@
-package org.apache.lucene.benchmark.byTask.feeds;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.File;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Locale;
-import java.util.Map;
-
-/**
- * Parser for trec doc content, invoked on doc text excluding <DOC> and <DOCNO>
- * which are handled in TrecContentSource. Required to be stateless and hence thread safe.
- */
-public abstract class TrecDocParser {
-
- /** Types of trec parse paths, */
- public enum ParsePathType { GOV2, FBIS, FT, FR94, LATIMES }
-
- /** trec parser type used for unknown extensions */
- public static final ParsePathType DEFAULT_PATH_TYPE = ParsePathType.GOV2;
-
- static final Map<ParsePathType,TrecDocParser> pathType2parser = new HashMap<ParsePathType,TrecDocParser>();
- static {
- pathType2parser.put(ParsePathType.GOV2, new TrecGov2Parser());
- pathType2parser.put(ParsePathType.FBIS, new TrecFBISParser());
- pathType2parser.put(ParsePathType.FR94, new TrecFR94Parser());
- pathType2parser.put(ParsePathType.FT, new TrecFTParser());
- pathType2parser.put(ParsePathType.LATIMES, new TrecLATimesParser());
- }
-
- static final Map<String,ParsePathType> pathName2Type = new HashMap<String,ParsePathType>();
- static {
- for (ParsePathType ppt : ParsePathType.values()) {
- pathName2Type.put(ppt.name().toUpperCase(Locale.ENGLISH),ppt);
- }
- }
-
- /** max length of walk up from file to its ancestors when looking for a known path type */
- private static final int MAX_PATH_LENGTH = 10;
-
- /**
- * Compute the path type of a file by inspecting name of file and its parents
- */
- public static ParsePathType pathType(File f) {
- int pathLength = 0;
- while (f != null && ++pathLength < MAX_PATH_LENGTH) {
- ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase(Locale.ENGLISH));
- if (ppt!=null) {
- return ppt;
- }
- f = f.getParentFile();
- }
- return DEFAULT_PATH_TYPE;
- }
-
- /**
- * parse the text prepared in docBuf into a result DocData,
- * no synchronization is required.
- * @param docData reusable result
- * @param name name that should be set to the result
- * @param trecSrc calling trec content source
- * @param docBuf text to parse
- * @param pathType type of parsed file, or null if unknown - may be used by
- * parsers to alter their behavior according to the file path type.
- */
- public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException;
-
- /**
- * strip tags from <code>buf</code>: each tag is replaced by a single blank.
- * @return text obtained when stripping all tags from <code>buf</code> (Input StringBuilder is unmodified).
- */
- public static String stripTags(StringBuilder buf, int start) {
- return stripTags(buf.substring(start),0);
- }
-
- /**
- * strip tags from input.
- * @see #stripTags(StringBuilder, int)
- */
- public static String stripTags(String buf, int start) {
- if (start>0) {
- buf = buf.substring(0);
- }
- return buf.replaceAll("<[^>]*>", " ");
- }
-
- /**
- * Extract from <code>buf</code> the text of interest within specified tags
- * @param buf entire input text
- * @param startTag tag marking start of text of interest
- * @param endTag tag marking end of text of interest
- * @param maxPos if ≥ 0 sets a limit on start of text of interest
- * @return text of interest or null if not found
- */
- public static String extract(StringBuilder buf, String startTag, String endTag, int maxPos, String noisePrefixes[]) {
- int k1 = buf.indexOf(startTag);
- if (k1>=0 && (maxPos<0 || k1<maxPos)) {
- k1 += startTag.length();
- int k2 = buf.indexOf(endTag,k1);
- if (k2>=0 && (maxPos<0 || k2<maxPos)) { // found end tag with allowed range
- if (noisePrefixes != null) {
- for (String noise : noisePrefixes) {
- int k1a = buf.indexOf(noise,k1);
- if (k1a>=0 && k1a<k2) {
- k1 = k1a + noise.length();
- }
- }
- }
- return buf.substring(k1,k2).trim();
- }
- }
- return null;
- }
-
- //public static void main(String[] args) {
- // System.out.println(stripTags("is it true that<space>2<<second space>><almost last space>1<one more space>?",0));
- //}
-
-}
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+
+/**
+ * Parser for trec doc content, invoked on doc text excluding <DOC> and <DOCNO>
+ * which are handled in TrecContentSource. Required to be stateless and hence thread safe.
+ */
+public abstract class TrecDocParser {
+
+ /** Types of trec parse paths, */
+ public enum ParsePathType { GOV2, FBIS, FT, FR94, LATIMES }
+
+ /** trec parser type used for unknown extensions */
+ public static final ParsePathType DEFAULT_PATH_TYPE = ParsePathType.GOV2;
+
+ static final Map<ParsePathType,TrecDocParser> pathType2parser = new HashMap<ParsePathType,TrecDocParser>();
+ static {
+ pathType2parser.put(ParsePathType.GOV2, new TrecGov2Parser());
+ pathType2parser.put(ParsePathType.FBIS, new TrecFBISParser());
+ pathType2parser.put(ParsePathType.FR94, new TrecFR94Parser());
+ pathType2parser.put(ParsePathType.FT, new TrecFTParser());
+ pathType2parser.put(ParsePathType.LATIMES, new TrecLATimesParser());
+ }
+
+ static final Map<String,ParsePathType> pathName2Type = new HashMap<String,ParsePathType>();
+ static {
+ for (ParsePathType ppt : ParsePathType.values()) {
+ pathName2Type.put(ppt.name().toUpperCase(Locale.ENGLISH),ppt);
+ }
+ }
+
+ /** max length of walk up from file to its ancestors when looking for a known path type */
+ private static final int MAX_PATH_LENGTH = 10;
+
+ /**
+ * Compute the path type of a file by inspecting name of file and its parents
+ */
+ public static ParsePathType pathType(File f) {
+ int pathLength = 0;
+ while (f != null && ++pathLength < MAX_PATH_LENGTH) {
+ ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase(Locale.ENGLISH));
+ if (ppt!=null) {
+ return ppt;
+ }
+ f = f.getParentFile();
+ }
+ return DEFAULT_PATH_TYPE;
+ }
+
+ /**
+ * parse the text prepared in docBuf into a result DocData,
+ * no synchronization is required.
+ * @param docData reusable result
+ * @param name name that should be set to the result
+ * @param trecSrc calling trec content source
+ * @param docBuf text to parse
+ * @param pathType type of parsed file, or null if unknown - may be used by
+ * parsers to alter their behavior according to the file path type.
+ */
+ public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc,
+ StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException;
+
+ /**
+ * strip tags from <code>buf</code>: each tag is replaced by a single blank.
+ * @return text obtained when stripping all tags from <code>buf</code> (Input StringBuilder is unmodified).
+ */
+ public static String stripTags(StringBuilder buf, int start) {
+ return stripTags(buf.substring(start),0);
+ }
+
+ /**
+ * strip tags from input.
+ * @see #stripTags(StringBuilder, int)
+ */
+ public static String stripTags(String buf, int start) {
+ if (start>0) {
+ buf = buf.substring(0);
+ }
+ return buf.replaceAll("<[^>]*>", " ");
+ }
+
+ /**
+ * Extract from <code>buf</code> the text of interest within specified tags
+ * @param buf entire input text
+ * @param startTag tag marking start of text of interest
+ * @param endTag tag marking end of text of interest
+ * @param maxPos if ≥ 0 sets a limit on start of text of interest
+ * @return text of interest or null if not found
+ */
+ public static String extract(StringBuilder buf, String startTag, String endTag, int maxPos, String noisePrefixes[]) {
+ int k1 = buf.indexOf(startTag);
+ if (k1>=0 && (maxPos<0 || k1<maxPos)) {
+ k1 += startTag.length();
+ int k2 = buf.indexOf(endTag,k1);
+ if (k2>=0 && (maxPos<0 || k2<maxPos)) { // found end tag with allowed range
+ if (noisePrefixes != null) {
+ for (String noise : noisePrefixes) {
+ int k1a = buf.indexOf(noise,k1);
+ if (k1a>=0 && k1a<k2) {
+ k1 = k1a + noise.length();
+ }
+ }
+ }
+ return buf.substring(k1,k2).trim();
+ }
+ }
+ return null;
+ }
+
+ //public static void main(String[] args) {
+ // System.out.println(stripTags("is it true that<space>2<<second space>><almost last space>1<one more space>?",0));
+ //}
+
+}
Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java Sat May 14 13:51:35 2011
@@ -1,65 +1,65 @@
-package org.apache.lucene.benchmark.byTask.feeds;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.Date;
-
-/**
- * Parser for the FBIS docs in trec disks 4+5 collection format
- */
-public class TrecFBISParser extends TrecDocParser {
-
- private static final String HEADER = "<HEADER>";
- private static final String HEADER_END = "</HEADER>";
- private static final int HEADER_END_LENGTH = HEADER_END.length();
-
- private static final String DATE1 = "<DATE1>";
- private static final String DATE1_END = "</DATE1>";
-
- private static final String TI = "<TI>";
- private static final String TI_END = "</TI>";
-
- @Override
- public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
- int mark = 0; // that much is skipped
- // optionally skip some of the text, set date, title
- Date date = null;
- String title = null;
- int h1 = docBuf.indexOf(HEADER);
- if (h1>=0) {
- int h2 = docBuf.indexOf(HEADER_END,h1);
- mark = h2+HEADER_END_LENGTH;
- // date...
- String dateStr = extract(docBuf, DATE1, DATE1_END, h2, null);
- if (dateStr != null) {
- date = trecSrc.parseDate(dateStr);
- }
- // title...
- title = extract(docBuf, TI, TI_END, h2, null);
- }
- docData.clear();
- docData.setName(name);
- docData.setDate(date);
- docData.setTitle(title);
- docData.setBody(stripTags(docBuf, mark).toString());
- return docData;
- }
-
-}
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Date;
+
+/**
+ * Parser for the FBIS docs in trec disks 4+5 collection format
+ */
+public class TrecFBISParser extends TrecDocParser {
+
+ private static final String HEADER = "<HEADER>";
+ private static final String HEADER_END = "</HEADER>";
+ private static final int HEADER_END_LENGTH = HEADER_END.length();
+
+ private static final String DATE1 = "<DATE1>";
+ private static final String DATE1_END = "</DATE1>";
+
+ private static final String TI = "<TI>";
+ private static final String TI_END = "</TI>";
+
+ @Override
+ public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
+ StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+ int mark = 0; // that much is skipped
+ // optionally skip some of the text, set date, title
+ Date date = null;
+ String title = null;
+ int h1 = docBuf.indexOf(HEADER);
+ if (h1>=0) {
+ int h2 = docBuf.indexOf(HEADER_END,h1);
+ mark = h2+HEADER_END_LENGTH;
+ // date...
+ String dateStr = extract(docBuf, DATE1, DATE1_END, h2, null);
+ if (dateStr != null) {
+ date = trecSrc.parseDate(dateStr);
+ }
+ // title...
+ title = extract(docBuf, TI, TI_END, h2, null);
+ }
+ docData.clear();
+ docData.setName(name);
+ docData.setDate(date);
+ docData.setTitle(title);
+ docData.setBody(stripTags(docBuf, mark).toString());
+ return docData;
+ }
+
+}
Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java Sat May 14 13:51:35 2011
@@ -1,66 +1,66 @@
-package org.apache.lucene.benchmark.byTask.feeds;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.Date;
-
-/**
- * Parser for the FR94 docs in trec disks 4+5 collection format
- */
-public class TrecFR94Parser extends TrecDocParser {
-
- private static final String TEXT = "<TEXT>";
- private static final int TEXT_LENGTH = TEXT.length();
- private static final String TEXT_END = "</TEXT>";
-
- private static final String DATE = "<DATE>";
- private static final String[] DATE_NOISE_PREFIXES = {
- "DATE:",
- "date:", //TODO improve date extraction for this format
- "t.c.",
- };
- private static final String DATE_END = "</DATE>";
-
- //TODO can we also extract title for this format?
-
- @Override
- public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
- int mark = 0; // that much is skipped
- // optionally skip some of the text, set date (no title?)
- Date date = null;
- int h1 = docBuf.indexOf(TEXT);
- if (h1>=0) {
- int h2 = docBuf.indexOf(TEXT_END,h1);
- mark = h1+TEXT_LENGTH;
- // date...
- String dateStr = extract(docBuf, DATE, DATE_END, h2, DATE_NOISE_PREFIXES);
- if (dateStr != null) {
- dateStr = stripTags(dateStr,0).toString();
- date = trecSrc.parseDate(dateStr.trim());
- }
- }
- docData.clear();
- docData.setName(name);
- docData.setDate(date);
- docData.setBody(stripTags(docBuf, mark).toString());
- return docData;
- }
-
-}
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Date;
+
+/**
+ * Parser for the FR94 docs in trec disks 4+5 collection format
+ */
+public class TrecFR94Parser extends TrecDocParser {
+
+ private static final String TEXT = "<TEXT>";
+ private static final int TEXT_LENGTH = TEXT.length();
+ private static final String TEXT_END = "</TEXT>";
+
+ private static final String DATE = "<DATE>";
+ private static final String[] DATE_NOISE_PREFIXES = {
+ "DATE:",
+ "date:", //TODO improve date extraction for this format
+ "t.c.",
+ };
+ private static final String DATE_END = "</DATE>";
+
+ //TODO can we also extract title for this format?
+
+ @Override
+ public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
+ StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+ int mark = 0; // that much is skipped
+ // optionally skip some of the text, set date (no title?)
+ Date date = null;
+ int h1 = docBuf.indexOf(TEXT);
+ if (h1>=0) {
+ int h2 = docBuf.indexOf(TEXT_END,h1);
+ mark = h1+TEXT_LENGTH;
+ // date...
+ String dateStr = extract(docBuf, DATE, DATE_END, h2, DATE_NOISE_PREFIXES);
+ if (dateStr != null) {
+ dateStr = stripTags(dateStr,0).toString();
+ date = trecSrc.parseDate(dateStr.trim());
+ }
+ }
+ docData.clear();
+ docData.setName(name);
+ docData.setDate(date);
+ docData.setBody(stripTags(docBuf, mark).toString());
+ return docData;
+ }
+
+}
Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java Sat May 14 13:51:35 2011
@@ -1,57 +1,57 @@
-package org.apache.lucene.benchmark.byTask.feeds;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.Date;
-
-/**
- * Parser for the FT docs in trec disks 4+5 collection format
- */
-public class TrecFTParser extends TrecDocParser {
-
- private static final String DATE = "<DATE>";
- private static final String DATE_END = "</DATE>";
-
- private static final String HEADLINE = "<HEADLINE>";
- private static final String HEADLINE_END = "</HEADLINE>";
-
- @Override
- public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
- int mark = 0; // that much is skipped
-
- // date...
- Date date = null;
- String dateStr = extract(docBuf, DATE, DATE_END, -1, null);
- if (dateStr != null) {
- date = trecSrc.parseDate(dateStr);
- }
-
- // title...
- String title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
-
- docData.clear();
- docData.setName(name);
- docData.setDate(date);
- docData.setTitle(title);
- docData.setBody(stripTags(docBuf, mark).toString());
- return docData;
- }
-
-}
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Date;
+
+/**
+ * Parser for the FT docs in trec disks 4+5 collection format
+ */
+public class TrecFTParser extends TrecDocParser {
+
+ private static final String DATE = "<DATE>";
+ private static final String DATE_END = "</DATE>";
+
+ private static final String HEADLINE = "<HEADLINE>";
+ private static final String HEADLINE_END = "</HEADLINE>";
+
+ @Override
+ public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
+ StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+ int mark = 0; // that much is skipped
+
+ // date...
+ Date date = null;
+ String dateStr = extract(docBuf, DATE, DATE_END, -1, null);
+ if (dateStr != null) {
+ date = trecSrc.parseDate(dateStr);
+ }
+
+ // title...
+ String title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
+
+ docData.clear();
+ docData.setName(name);
+ docData.setDate(date);
+ docData.setTitle(title);
+ docData.setBody(stripTags(docBuf, mark).toString());
+ return docData;
+ }
+
+}
Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java Sat May 14 13:51:35 2011
@@ -1,71 +1,71 @@
-package org.apache.lucene.benchmark.byTask.feeds;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.Date;
-
-/**
- * Parser for the FT docs in trec disks 4+5 collection format
- */
-public class TrecLATimesParser extends TrecDocParser {
-
- private static final String DATE = "<DATE>";
- private static final String DATE_END = "</DATE>";
- private static final String DATE_NOISE = "day,"; // anything aftre the ','
-
- private static final String SUBJECT = "<SUBJECT>";
- private static final String SUBJECT_END = "</SUBJECT>";
- private static final String HEADLINE = "<HEADLINE>";
- private static final String HEADLINE_END = "</HEADLINE>";
-
- @Override
- public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
- int mark = 0; // that much is skipped
-
- // date...
- Date date = null;
- String dateStr = extract(docBuf, DATE, DATE_END, -1, null);
- if (dateStr != null) {
- int d2a = dateStr.indexOf(DATE_NOISE);
- if (d2a > 0) {
- dateStr = dateStr.substring(0,d2a+3); // we need the "day" part
- }
- dateStr = stripTags(dateStr,0).toString();
- date = trecSrc.parseDate(dateStr.trim());
- }
-
- // title... first try with SUBJECT, them with HEADLINE
- String title = extract(docBuf, SUBJECT, SUBJECT_END, -1, null);
- if (title==null) {
- title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
- }
- if (title!=null) {
- title = stripTags(title,0).toString().trim();
- }
-
- docData.clear();
- docData.setName(name);
- docData.setDate(date);
- docData.setTitle(title);
- docData.setBody(stripTags(docBuf, mark).toString());
- return docData;
- }
-
-}
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Date;
+
+/**
+ * Parser for the LA Times docs in trec disks 4+5 collection format
+ */
+public class TrecLATimesParser extends TrecDocParser {
+
+ private static final String DATE = "<DATE>";
+ private static final String DATE_END = "</DATE>";
+ private static final String DATE_NOISE = "day,"; // anything after the ','
+
+ private static final String SUBJECT = "<SUBJECT>";
+ private static final String SUBJECT_END = "</SUBJECT>";
+ private static final String HEADLINE = "<HEADLINE>";
+ private static final String HEADLINE_END = "</HEADLINE>";
+
+ @Override
+ public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
+ StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+ int mark = 0; // that much is skipped
+
+ // date...
+ Date date = null;
+ String dateStr = extract(docBuf, DATE, DATE_END, -1, null);
+ if (dateStr != null) {
+ int d2a = dateStr.indexOf(DATE_NOISE);
+ if (d2a > 0) {
+ dateStr = dateStr.substring(0,d2a+3); // we need the "day" part
+ }
+ dateStr = stripTags(dateStr,0).toString();
+ date = trecSrc.parseDate(dateStr.trim());
+ }
+
+ // title... first try with SUBJECT, then with HEADLINE
+ String title = extract(docBuf, SUBJECT, SUBJECT_END, -1, null);
+ if (title==null) {
+ title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
+ }
+ if (title!=null) {
+ title = stripTags(title,0).toString().trim();
+ }
+
+ docData.clear();
+ docData.setName(name);
+ docData.setDate(date);
+ docData.setTitle(title);
+ docData.setBody(stripTags(docBuf, mark).toString());
+ return docData;
+ }
+
+}
Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java Sat May 14 13:51:35 2011
@@ -1,33 +1,33 @@
-package org.apache.lucene.benchmark.byTask.feeds;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-/**
- * Parser for trec docs which selects the parser to apply according
- * to the source files path, defaulting to {@link TrecGov2Parser}.
- */
-public class TrecParserByPath extends TrecDocParser {
-
- @Override
- public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
- return pathType2parser.get(pathType).parse(docData, name, trecSrc, docBuf, pathType);
- }
-
-}
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+/**
+ * Parser for trec docs which selects the parser to apply according
+ * to the source files path, defaulting to {@link TrecGov2Parser}.
+ */
+public class TrecParserByPath extends TrecDocParser {
+
+ @Override
+ public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
+ StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+ return pathType2parser.get(pathType).parse(docData, name, trecSrc, docBuf, pathType);
+ }
+
+}
Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java Sat May 14 13:51:35 2011
@@ -25,6 +25,7 @@ import org.apache.lucene.index.IndexDele
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogMergePolicy;
+import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.index.MergeScheduler;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.MergePolicy;
@@ -150,6 +151,9 @@ public class CreateIndexTask extends Per
LogMergePolicy logMergePolicy = (LogMergePolicy) iwConf.getMergePolicy();
logMergePolicy.setUseCompoundFile(isCompound);
logMergePolicy.setMergeFactor(config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR));
+ } else if(iwConf.getMergePolicy() instanceof TieredMergePolicy) {
+ TieredMergePolicy tieredMergePolicy = (TieredMergePolicy) iwConf.getMergePolicy();
+ tieredMergePolicy.setUseCompoundFile(isCompound);
}
}
final double ramBuffer = config.get("ram.flush.mb",OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java Sat May 14 13:51:35 2011
@@ -43,7 +43,8 @@ import org.apache.lucene.document.Field;
* <p>
* The format of the output is set according to the output file extension.
* Compression is recommended when the output file is expected to be large.
- * See info on file extensions in {@link StreamUtils.Type}
+ * See info on file extensions in
+ * {@link org.apache.lucene.benchmark.byTask.utils.StreamUtils.Type}
* <p>
* Supports the following parameters:
* <ul>
Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Sat May 14 13:51:35 2011
@@ -96,7 +96,7 @@ public class TestPerfTasksLogic extends
assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
// now we should be able to open the index for write.
IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())
+ new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
.setOpenMode(OpenMode.APPEND));
iw.close();
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
@@ -183,7 +183,7 @@ public class TestPerfTasksLogic extends
assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
// now we should be able to open the index for write.
- IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setOpenMode(OpenMode.APPEND));
+ IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.APPEND));
iw.close();
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
assertEquals("100 docs were added to the index, this is what we expect to find!",100,ir.numDocs());
@@ -222,7 +222,7 @@ public class TestPerfTasksLogic extends
assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
// now we should be able to open the index for write.
- IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setOpenMode(OpenMode.APPEND));
+ IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.APPEND));
iw.close();
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
@@ -295,7 +295,7 @@ public class TestPerfTasksLogic extends
assertEquals("TestSearchTask was supposed to be called!",139,CountingSearchTestTask.numSearches);
assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
// now we should be able to open the index for write.
- IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setOpenMode(OpenMode.APPEND));
+ IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.APPEND));
iw.close();
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
assertEquals("1 docs were added to the index, this is what we expect to find!",1,ir.numDocs());
@@ -407,7 +407,7 @@ public class TestPerfTasksLogic extends
// Index the line docs
String algLines2[] = {
"# ----- properties ",
- "analyzer=org.apache.lucene.analysis.MockAnalyzer",
+ "analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'),
"content.source.forever=false",
@@ -425,7 +425,7 @@ public class TestPerfTasksLogic extends
// now we should be able to open the index for write.
IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())
+ new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
.setOpenMode(OpenMode.APPEND));
iw.close();
@@ -448,7 +448,7 @@ public class TestPerfTasksLogic extends
// then build index from the same docs
String algLines1[] = {
"# ----- properties ",
- "analyzer=org.apache.lucene.analysis.MockAnalyzer",
+ "analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"# ----- alg ",
@@ -1021,18 +1021,18 @@ public class TestPerfTasksLogic extends
"two three four", "three four",
"three four five", "four five",
"four five six", "five six" });
- // MockAnalyzer, default maxShingleSize and outputUnigrams
+ // WhitespaceAnalyzer, default maxShingleSize and outputUnigrams
benchmark = execBenchmark
- (getShingleConfig("analyzer:MockAnalyzer"));
+ (getShingleConfig("analyzer:WhitespaceAnalyzer"));
assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
new String[] { "one,two,three,", "one,two,three, four",
"four", "four five", "five", "five six",
"six" });
- // MockAnalyzer, maxShingleSize=3 and outputUnigrams=false
+ // WhitespaceAnalyzer, maxShingleSize=3 and outputUnigrams=false
benchmark = execBenchmark
(getShingleConfig
- ("outputUnigrams:false,maxShingleSize:3,analyzer:MockAnalyzer"));
+ ("outputUnigrams:false,maxShingleSize:3,analyzer:WhitespaceAnalyzer"));
assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
new String[] { "one,two,three, four",
"one,two,three, four five",