You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/05/14 15:51:59 UTC

svn commit: r1103112 [18/24] - in /lucene/dev/branches/flexscoring: ./ dev-tools/eclipse/ dev-tools/idea/.idea/ dev-tools/idea/lucene/contrib/ant/ dev-tools/idea/lucene/contrib/db/bdb-je/ dev-tools/idea/lucene/contrib/db/bdb/ dev-tools/idea/lucene/cont...

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java Sat May 14 13:51:35 2011
@@ -50,4 +50,9 @@ public class TestIndonesianAnalyzer exte
     checkOneTermReuse(a, "peledakan", "peledakan");
     checkOneTermReuse(a, "pembunuhan", "bunuh");
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new IndonesianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java Sat May 14 13:51:35 2011
@@ -23,6 +23,7 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
 
 public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the 
@@ -50,4 +51,23 @@ public class TestItalianAnalyzer extends
     checkOneTermReuse(a, "abbandonata", "abbandonata");
     checkOneTermReuse(a, "abbandonati", "abbandon");
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new ItalianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
+  
+  /** test that the ElisionFilter is working */
+  public void testContractions() throws IOException {
+    Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT);
+    assertAnalyzesTo(a, "dell'Italia", new String[] { "ital" });
+    assertAnalyzesTo(a, "l'Italiano", new String[] { "ital" });
+  }
+  
+  /** test that contraction handling is not enabled before 3.2 */
+  public void testContractionsBackwards() throws IOException {
+    Analyzer a = new ItalianAnalyzer(Version.LUCENE_31);
+    assertAnalyzesTo(a, "dell'Italia", new String[] { "dell'ital" });
+    assertAnalyzesTo(a, "l'Italiano", new String[] { "l'ital" });
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java Sat May 14 13:51:35 2011
@@ -45,4 +45,9 @@ public class TestItalianLightStemFilter 
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("itlighttestdata.zip"), "itlight.txt");
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenCountAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenCountAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenCountAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenCountAnalyzer.java Sat May 14 13:51:35 2011
@@ -51,7 +51,7 @@ public class TestLimitTokenCountAnalyzer
     Directory dir = newDirectory();
 
     IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
-        TEST_VERSION_CURRENT, new LimitTokenCountAnalyzer(new MockAnalyzer(), 100000)));
+        TEST_VERSION_CURRENT, new LimitTokenCountAnalyzer(new MockAnalyzer(random), 100000)));
 
     Document doc = new Document();
     StringBuilder b = new StringBuilder();

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java Sat May 14 13:51:35 2011
@@ -185,4 +185,9 @@ public class TestDutchStemmer extends Ba
     checkOneTerm(new DutchAnalyzer(TEST_VERSION_CURRENT), input, expected); 
   }
   
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new DutchAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
+  
 }
\ No newline at end of file

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java Sat May 14 13:51:35 2011
@@ -50,4 +50,9 @@ public class TestNorwegianAnalyzer exten
     checkOneTermReuse(a, "havnedistriktene", "havnedistriktene");
     checkOneTermReuse(a, "havnedistrikter", "havnedistrikt");
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new NorwegianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java Sat May 14 13:51:35 2011
@@ -127,4 +127,70 @@ public class TestPathHierarchyTokenizer 
         new int[]{1, 0, 0, 0},
         path.length());
   }
+
+  public void testBasicSkip() throws Exception {
+    String path = "/a/b/c";
+    PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
+    assertTokenStreamContents(t,
+        new String[]{"/b", "/b/c"},
+        new int[]{2, 2},
+        new int[]{4, 6},
+        new int[]{1, 0},
+        path.length());
+  }
+
+  public void testEndOfDelimiterSkip() throws Exception {
+    String path = "/a/b/c/";
+    PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
+    assertTokenStreamContents(t,
+        new String[]{"/b", "/b/c", "/b/c/"},
+        new int[]{2, 2, 2},
+        new int[]{4, 6, 7},
+        new int[]{1, 0, 0},
+        path.length());
+  }
+
+  public void testStartOfCharSkip() throws Exception {
+    String path = "a/b/c";
+    PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
+    assertTokenStreamContents(t,
+        new String[]{"/b", "/b/c"},
+        new int[]{1, 1},
+        new int[]{3, 5},
+        new int[]{1, 0},
+        path.length());
+  }
+
+  public void testStartOfCharEndOfDelimiterSkip() throws Exception {
+    String path = "a/b/c/";
+    PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
+    assertTokenStreamContents(t,
+        new String[]{"/b", "/b/c", "/b/c/"},
+        new int[]{1, 1, 1},
+        new int[]{3, 5, 6},
+        new int[]{1, 0, 0},
+        path.length());
+  }
+
+  public void testOnlyDelimiterSkip() throws Exception {
+    String path = "/";
+    PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
+    assertTokenStreamContents(t,
+        new String[]{},
+        new int[]{},
+        new int[]{},
+        new int[]{},
+        path.length());
+  }
+
+  public void testOnlyDelimitersSkip() throws Exception {
+    String path = "//";
+    PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
+    assertTokenStreamContents(t,
+        new String[]{"/"},
+        new int[]{1},
+        new int[]{2},
+        new int[]{1},
+        path.length());
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java Sat May 14 13:51:35 2011
@@ -50,4 +50,9 @@ public class TestPortugueseAnalyzer exte
     checkOneTermReuse(a, "quilométricas", "quilométricas");
     checkOneTermReuse(a, "quilométricos", "quilométr");
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new PortugueseAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java Sat May 14 13:51:35 2011
@@ -92,4 +92,9 @@ public class TestPortugueseLightStemFilt
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("ptlighttestdata.zip"), "ptlight.txt");
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java Sat May 14 13:51:35 2011
@@ -66,4 +66,9 @@ public class TestPortugueseMinimalStemFi
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("ptminimaltestdata.zip"), "ptminimal.txt");
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java Sat May 14 13:51:35 2011
@@ -66,4 +66,9 @@ public class TestPortugueseStemFilter ex
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("ptrslptestdata.zip"), "ptrslp.txt");
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java Sat May 14 13:51:35 2011
@@ -50,4 +50,9 @@ public class TestRomanianAnalyzer extend
     checkOneTermReuse(a, "absenţa", "absenţa");
     checkOneTermReuse(a, "absenţi", "absenţ");
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new RomanianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java Sat May 14 13:51:35 2011
@@ -64,4 +64,9 @@ public class TestRussianAnalyzer extends
           new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представление" });
      
     }
+    
+    /** blast some random strings through the analyzer */
+    public void testRandomStrings() throws Exception {
+      checkRandomData(random, new RussianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java Sat May 14 13:51:35 2011
@@ -45,4 +45,9 @@ public class TestRussianLightStemFilter 
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("rulighttestdata.zip"), "rulight.txt");
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java Sat May 14 13:51:35 2011
@@ -23,6 +23,7 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.hu.HungarianAnalyzer;
 
 public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the 
@@ -50,4 +51,9 @@ public class TestSwedishAnalyzer extends
     checkOneTermReuse(a, "jaktkarlarne", "jaktkarlarne");
     checkOneTermReuse(a, "jaktkarlens", "jaktkarl");
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new SwedishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java Sat May 14 13:51:35 2011
@@ -45,4 +45,9 @@ public class TestSwedishLightStemFilter 
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("svlighttestdata.zip"), "svlight.txt");
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java Sat May 14 13:51:35 2011
@@ -17,7 +17,11 @@ package org.apache.lucene.analysis.th;
  * limitations under the License.
  */
 
+import java.io.StringReader;
+
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
 import org.apache.lucene.util.Version;
 
 /**
@@ -142,5 +146,23 @@ public class TestThaiAnalyzer extends Ba
             analyzer,
             "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
             new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
-    }
+  }
+	
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new ThaiAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
+  
+  // LUCENE-3044
+  public void testAttributeReuse() throws Exception {
+    assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
+    ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
+    // just consume
+    TokenStream ts = analyzer.reusableTokenStream("dummy", new StringReader("ภาษาไทย"));
+    assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
+    // this consumer adds a FlagsAttribute, which this analyzer does not use.
+    ts = analyzer.reusableTokenStream("dummy", new StringReader("ภาษาไทย"));
+    ts.addAttribute(FlagsAttribute.class);
+    assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java Sat May 14 13:51:35 2011
@@ -50,4 +50,9 @@ public class TestTurkishAnalyzer extends
     checkOneTermReuse(a, "ağacı", "ağacı");
     checkOneTermReuse(a, "ağaç", "ağaç");
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new TurkishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java Sat May 14 13:51:35 2011
@@ -186,7 +186,7 @@ public abstract class CollationTestBase 
                                    String dkResult) throws Exception {
     RAMDirectory indexStore = new RAMDirectory();
     IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(
-        TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false)));
+        TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)));
 
     // document data:
     // the tracer field is used to determine which document was hit
@@ -257,27 +257,6 @@ public abstract class CollationTestBase 
     }
     assertEquals(expectedResult, buff.toString());
   }
-  
-  private String randomString() {
-    // ideally we could do this!
-    // return _TestUtil.randomUnicodeString(random);
-    //
-    // http://bugs.icu-project.org/trac/ticket/8060
-    // http://bugs.icu-project.org/trac/ticket/7732
-    // ...
-    // 
-    // as a workaround, just test the BMP for now (and avoid 0xFFFF etc)
-    int length = _TestUtil.nextInt(random, 0, 10);
-    char chars[] = new char[length];
-    for (int i = 0; i < length; i++) {
-      if (random.nextBoolean()) {
-        chars[i] = (char) _TestUtil.nextInt(random, 0, 0xD7FF);
-      } else {
-        chars[i] = (char) _TestUtil.nextInt(random, 0xE000, 0xFFFD);
-      }
-    }
-    return new String(chars, 0, length);
-  }
 
   public void assertThreadSafe(final Analyzer analyzer) throws Exception {
     int numTestPoints = 100;
@@ -289,7 +268,7 @@ public abstract class CollationTestBase 
     // and ensure they are the same as the ones we produced in serial fashion.
 
     for (int i = 0; i < numTestPoints; i++) {
-      String term = randomString();
+      String term = _TestUtil.randomSimpleString(random);
       TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
       TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
       BytesRef bytes = termAtt.getBytesRef();

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java Sat May 14 13:51:35 2011
@@ -87,8 +87,7 @@ public class TestCollationKeyAnalyzer ex
   public void testThreadSafe() throws Exception {
     int iters = 20 * RANDOM_MULTIPLIER;
     for (int i = 0; i < iters; i++) {
-      Locale locale = randomLocale(random);
-      Collator collator = Collator.getInstance(locale);
+      Collator collator = Collator.getInstance(Locale.GERMAN);
       collator.setStrength(Collator.PRIMARY);
       assertThreadSafe(new CollationKeyAnalyzer(TEST_VERSION_CURRENT, collator));
     }

Modified: lucene/dev/branches/flexscoring/modules/analysis/icu/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/icu/build.xml?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/icu/build.xml (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/icu/build.xml Sat May 14 13:51:35 2011
@@ -137,4 +137,20 @@ are part of the ICU4C package. See http:
     <m2-deploy-with-pom-template pom.xml="lib/lucene-icu4j-pom.xml.template"
                                  jar.file="lib/icu4j-4_6.jar" />
   </target>
+
+  <target name="javadocs" depends="compile-core">
+   	<sequential>
+       <mkdir dir="${javadoc.dir}/contrib-${name}"/>
+       <invoke-javadoc
+         destdir="${javadoc.dir}/contrib-${name}"
+       	 title="${Name} ${version} contrib-${name} API">
+         <sources>
+           <link href="../contrib-analyzers-common"/>
+           <link href=""/>
+           <packageset dir="${src.dir}"/>
+        </sources>
+      </invoke-javadoc>
+      <jarify basedir="${javadoc.dir}/contrib-${name}" destfile="${build.dir}/${final.name}-javadoc.jar"/>
+    </sequential>
+  </target>	
 </project>

Modified: lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilter.java Sat May 14 13:51:35 2011
@@ -29,15 +29,14 @@ import org.apache.lucene.analysis.core.W
  * Tests ICUFoldingFilter
  */
 public class TestICUFoldingFilter extends BaseTokenStreamTestCase {
+  Analyzer a = new Analyzer() {
+    @Override
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      return new ICUFoldingFilter(
+          new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader));
+    }
+  };
   public void testDefaults() throws IOException {
-    Analyzer a = new Analyzer() {
-      @Override
-      public TokenStream tokenStream(String fieldName, Reader reader) {
-        return new ICUFoldingFilter(
-            new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader));
-      }
-    };
-
     // case folding
     assertAnalyzesTo(a, "This is a test", new String[] { "this", "is", "a", "test" });
 
@@ -76,4 +75,9 @@ public class TestICUFoldingFilter extend
     // handling of decomposed combining-dot-above
     assertAnalyzesTo(a, "eli\u0307f", new String[] { "elif" });
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2Filter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2Filter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2Filter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2Filter.java Sat May 14 13:51:35 2011
@@ -31,16 +31,15 @@ import com.ibm.icu.text.Normalizer2;
  * Tests the ICUNormalizer2Filter
  */
 public class TestICUNormalizer2Filter extends BaseTokenStreamTestCase {
+  Analyzer a = new Analyzer() { // fixture field (previously a local of testDefaults) so that testRandomStrings can use it as well
+    @Override
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      return new ICUNormalizer2Filter(
+          new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader));
+    }
+  };
 
   public void testDefaults() throws IOException {
-    Analyzer a = new Analyzer() {
-      @Override
-      public TokenStream tokenStream(String fieldName, Reader reader) {
-        return new ICUNormalizer2Filter(
-            new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader));
-      }
-    };
-
     // case folding
     assertAnalyzesTo(a, "This is a test", new String[] { "this", "is", "a", "test" });
 
@@ -75,4 +74,9 @@ public class TestICUNormalizer2Filter ex
     // decompose EAcute into E + combining Acute
     assertAnalyzesTo(a, "\u00E9", new String[] { "\u0065\u0301" });
   }
+  
+  /** blast some random strings through the shared whitespace + ICUNormalizer2Filter analyzer; the 10000 is scaled by RANDOM_MULTIPLIER */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java Sat May 14 13:51:35 2011
@@ -18,10 +18,15 @@ package org.apache.lucene.analysis.icu;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 
 import com.ibm.icu.text.Transliterator;
@@ -83,4 +88,17 @@ public class TestICUTransformFilter exte
     TokenStream ts = new ICUTransformFilter(new KeywordTokenizer((new StringReader(input))), transform);
     assertTokenStreamContents(ts, new String[] { expected });
   }
+  
+  /** blast some random strings through a whitespace tokenizer followed by an "Any-Latin" ICUTransformFilter */
+  public void testRandomStrings() throws Exception {
+    final Transliterator transform = Transliterator.getInstance("Any-Latin");
+    Analyzer a = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+        return new TokenStreamComponents(tokenizer, new ICUTransformFilter(tokenizer, transform));
+      }
+    };
+    checkRandomData(random, a, 1000*RANDOM_MULTIPLIER); // 1000 here vs. 10000 in the other tests of this commit; NOTE(review): presumably for speed - confirm
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java Sat May 14 13:51:35 2011
@@ -232,4 +232,9 @@ public class TestICUTokenizer extends Ba
         new String[] { "仮", "名", "遣", "い", "カタカナ" },
         new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
   }
+  
+  /** blast some random strings through the analyzer <code>a</code> (a member declared outside this hunk); the 10000 is scaled by RANDOM_MULTIPLIER */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java Sat May 14 13:51:35 2011
@@ -19,6 +19,7 @@ package org.apache.lucene.collation;
 
 
 import com.ibm.icu.text.Collator;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.index.codecs.CodecProvider;
 import org.apache.lucene.util.BytesRef;
@@ -88,7 +89,7 @@ public class TestICUCollationKeyAnalyzer
   public void testThreadSafe() throws Exception {
     int iters = 20 * RANDOM_MULTIPLIER;
     for (int i = 0; i < iters; i++) {
-      Locale locale = randomLocale(random);
+      Locale locale = Locale.GERMAN;
       Collator collator = Collator.getInstance(locale);
       collator.setStrength(Collator.IDENTICAL);
       assertThreadSafe(new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, collator));

Modified: lucene/dev/branches/flexscoring/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java Sat May 14 13:51:35 2011
@@ -75,7 +75,7 @@ class SegGraph {
     List<SegToken> result = new ArrayList<SegToken>();
     int s = -1, count = 0, size = tokenListTable.size();
     List<SegToken> tokenList;
-    short index = 0;
+    int index = 0;
     while (count < size) {
       if (isStartExist(s)) {
         tokenList = tokenListTable.get(s);

Modified: lucene/dev/branches/flexscoring/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java Sat May 14 13:51:35 2011
@@ -17,8 +17,11 @@
 
 package org.apache.lucene.analysis.cn.smart;
 
+import java.io.StringReader;
+
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.Version;
 
 public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
@@ -166,4 +169,35 @@ public class TestSmartChineseAnalyzer ex
         new int[] { 0, 1, 3, 4, 6, 7 },
         new int[] { 1, 3, 4, 6, 7, 9 });
   }
+  
+  // LUCENE-3026: analyze a large document made of 5000 copies of one short sentence, each ending with the full stop "。"
+  public void testLargeDocument() throws Exception {
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < 5000; i++) {
+      sb.append("我购买了道具和服装。");
+    }
+    Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
+    TokenStream stream = analyzer.reusableTokenStream("", new StringReader(sb.toString()));
+    stream.reset();
+    while (stream.incrementToken()) { // drain all tokens; there are no assertions, the test passes if nothing throws
+    }
+  }
+  
+  // LUCENE-3026: same text as testLargeDocument but without the full stop, so (per the test name) the input is one very long sentence
+  public void testLargeSentence() throws Exception {
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < 5000; i++) {
+      sb.append("我购买了道具和服装");
+    }
+    Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
+    TokenStream stream = analyzer.reusableTokenStream("", new StringReader(sb.toString()));
+    stream.reset();
+    while (stream.incrementToken()) { // drain all tokens; there are no assertions, the test passes if nothing throws
+    }
+  }
+  
+  /** blast some random strings through a freshly created SmartChineseAnalyzer; the 10000 is scaled by RANDOM_MULTIPLIER */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java Sat May 14 13:51:35 2011
@@ -50,4 +50,9 @@ public class TestPolishAnalyzer extends 
     checkOneTermReuse(a, "studenta", "studenta");
     checkOneTermReuse(a, "studenci", "student");
   }
+  
+  /** blast some random strings through a freshly created PolishAnalyzer; the 10000 is scaled by RANDOM_MULTIPLIER */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new PolishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java Sat May 14 13:51:35 2011
@@ -1,136 +1,136 @@
-package org.apache.lucene.benchmark.byTask.feeds;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.File;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Locale;
-import java.util.Map;
-
-/** 
- * Parser for trec doc content, invoked on doc text excluding <DOC> and <DOCNO>
- * which are handled in TrecContentSource. Required to be stateless and hence thread safe. 
- */
-public abstract class TrecDocParser {
-
-  /** Types of trec parse paths, */
-  public enum ParsePathType { GOV2, FBIS, FT, FR94, LATIMES }
-  
-  /** trec parser type used for unknown extensions */
-  public static final ParsePathType DEFAULT_PATH_TYPE  = ParsePathType.GOV2;
-
-  static final Map<ParsePathType,TrecDocParser> pathType2parser = new HashMap<ParsePathType,TrecDocParser>();
-  static {
-    pathType2parser.put(ParsePathType.GOV2, new TrecGov2Parser());
-    pathType2parser.put(ParsePathType.FBIS, new TrecFBISParser());
-    pathType2parser.put(ParsePathType.FR94, new TrecFR94Parser());
-    pathType2parser.put(ParsePathType.FT, new TrecFTParser());
-    pathType2parser.put(ParsePathType.LATIMES, new TrecLATimesParser());
-  }
-
-  static final Map<String,ParsePathType> pathName2Type = new HashMap<String,ParsePathType>();
-  static {
-    for (ParsePathType ppt : ParsePathType.values()) {
-      pathName2Type.put(ppt.name().toUpperCase(Locale.ENGLISH),ppt);
-    }
-  }
-  
-  /** max length of walk up from file to its ancestors when looking for a known path type */ 
-  private static final int MAX_PATH_LENGTH = 10;
-  
-  /**
-   * Compute the path type of a file by inspecting name of file and its parents
-   */
-  public static ParsePathType pathType(File f) {
-    int pathLength = 0;
-    while (f != null && ++pathLength < MAX_PATH_LENGTH) {
-      ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase(Locale.ENGLISH));
-      if (ppt!=null) {
-        return ppt;
-      }
-      f = f.getParentFile();
-    }
-    return DEFAULT_PATH_TYPE;
-  }
-  
-  /** 
-   * parse the text prepared in docBuf into a result DocData, 
-   * no synchronization is required.
-   * @param docData reusable result
-   * @param name name that should be set to the result
-   * @param trecSrc calling trec content source  
-   * @param docBuf text to parse  
-   * @param pathType type of parsed file, or null if unknown - may be used by 
-   * parsers to alter their behavior according to the file path type. 
-   */  
-  public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException;
-  
-  /** 
-   * strip tags from <code>buf</code>: each tag is replaced by a single blank.
-   * @return text obtained when stripping all tags from <code>buf</code> (Input StringBuilder is unmodified).
-   */
-  public static String stripTags(StringBuilder buf, int start) {
-    return stripTags(buf.substring(start),0);
-  }
-
-  /** 
-   * strip tags from input.
-   * @see #stripTags(StringBuilder, int)
-   */
-  public static String stripTags(String buf, int start) {
-    if (start>0) {
-      buf = buf.substring(0);
-    }
-    return buf.replaceAll("<[^>]*>", " ");
-  }
-  
-  /**
-   * Extract from <code>buf</code> the text of interest within specified tags
-   * @param buf entire input text
-   * @param startTag tag marking start of text of interest 
-   * @param endTag tag marking end of text of interest
-   * @param maxPos if &ge; 0 sets a limit on start of text of interest
-   * @return text of interest or null if not found
-   */
-  public static String extract(StringBuilder buf, String startTag, String endTag, int maxPos, String noisePrefixes[]) {
-    int k1 = buf.indexOf(startTag);
-    if (k1>=0 && (maxPos<0 || k1<maxPos)) {
-      k1 += startTag.length();
-      int k2 = buf.indexOf(endTag,k1);
-      if (k2>=0 && (maxPos<0 || k2<maxPos)) { // found end tag with allowed range
-        if (noisePrefixes != null) {
-          for (String noise : noisePrefixes) {
-            int k1a = buf.indexOf(noise,k1);
-            if (k1a>=0 && k1a<k2) {
-              k1 = k1a + noise.length();
-            }
-          }          
-        }
-        return buf.substring(k1,k2).trim();
-      }
-    }
-    return null;
-  }
-
-  //public static void main(String[] args) {
-  //  System.out.println(stripTags("is it true that<space>2<<second space>><almost last space>1<one more space>?",0));
-  //}
-
-}
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+
+/** 
+ * Parser for trec doc content, invoked on doc text excluding {@code <DOC>} and {@code <DOCNO>}
+ * which are handled in TrecContentSource. Required to be stateless and hence thread safe. 
+ */
+public abstract class TrecDocParser {
+
+  /** Types of trec parse paths. */
+  public enum ParsePathType { GOV2, FBIS, FT, FR94, LATIMES }
+  
+  /** trec parser type returned by {@link #pathType(File)} when no known type name is found in the path */
+  public static final ParsePathType DEFAULT_PATH_TYPE  = ParsePathType.GOV2;
+
+  static final Map<ParsePathType,TrecDocParser> pathType2parser = new HashMap<ParsePathType,TrecDocParser>(); // one parser instance per path type
+  static {
+    pathType2parser.put(ParsePathType.GOV2, new TrecGov2Parser());
+    pathType2parser.put(ParsePathType.FBIS, new TrecFBISParser());
+    pathType2parser.put(ParsePathType.FR94, new TrecFR94Parser());
+    pathType2parser.put(ParsePathType.FT, new TrecFTParser());
+    pathType2parser.put(ParsePathType.LATIMES, new TrecLATimesParser());
+  }
+
+  static final Map<String,ParsePathType> pathName2Type = new HashMap<String,ParsePathType>(); // upper-cased type name -> type
+  static {
+    for (ParsePathType ppt : ParsePathType.values()) {
+      pathName2Type.put(ppt.name().toUpperCase(Locale.ENGLISH),ppt);
+    }
+  }
+  
+  /** exclusive bound on the number of path elements (the file, then its ancestors) inspected by {@link #pathType(File)} */
+  private static final int MAX_PATH_LENGTH = 10;
+  
+  /**
+   * Compute the path type of a file: the upper-cased name of the file, then of each parent, is looked up among the type names; {@link #DEFAULT_PATH_TYPE} if none matches.
+   */
+  public static ParsePathType pathType(File f) {
+    int pathLength = 0;
+    while (f != null && ++pathLength < MAX_PATH_LENGTH) {
+      ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase(Locale.ENGLISH));
+      if (ppt!=null) {
+        return ppt;
+      }
+      f = f.getParentFile();
+    }
+    return DEFAULT_PATH_TYPE;
+  }
+  
+  /** 
+   * parse the text prepared in docBuf into a result DocData; no synchronization is required.
+   * @param docData reusable result
+   * @param name name that should be set to the result
+   * @param trecSrc calling trec content source
+   * @param docBuf text to parse
+   * @param pathType type of parsed file, or null if unknown - may be used by
+   * parsers to alter their behavior according to the file path type.
+   * @return the parsed document data
+   */  
+  public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
+      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException;
+  
+  /** strip tags from <code>buf</code>, ignoring its first <code>start</code> chars: each tag is replaced by a single blank.
+   * @param start number of leading chars of <code>buf</code> that are dropped before stripping
+   * @return text obtained when stripping all tags from <code>buf</code> (Input StringBuilder is unmodified).
+   */
+  public static String stripTags(StringBuilder buf, int start) {
+    return stripTags(buf.substring(start),0);
+  }
+
+  /** 
+   * strip tags from input, ignoring its first <code>start</code> chars: each tag is replaced by a single blank.
+   * @see #stripTags(StringBuilder, int)
+   */
+  public static String stripTags(String buf, int start) {
+    if (start>0) {
+      buf = buf.substring(start); // was substring(0), a no-op that silently ignored start
+    }
+    return buf.replaceAll("<[^>]*>", " ");
+  }
+  
+  /** Extract from <code>buf</code> the text of interest within specified tags.
+   * @param buf entire input text
+   * @param startTag tag marking start of text of interest
+   * @param endTag tag marking end of text of interest
+   * @param maxPos if &ge; 0, both <code>startTag</code> and <code>endTag</code> must begin before this position
+   * @param noisePrefixes optional (may be null) strings to skip: when one occurs between the tags, the text of interest starts right after it
+   * @return trimmed text of interest or null if not found
+   */
+  public static String extract(StringBuilder buf, String startTag, String endTag, int maxPos, String noisePrefixes[]) {
+    int k1 = buf.indexOf(startTag);
+    if (k1>=0 && (maxPos<0 || k1<maxPos)) {
+      k1 += startTag.length();
+      int k2 = buf.indexOf(endTag,k1);
+      if (k2>=0 && (maxPos<0 || k2<maxPos)) { // found end tag with allowed range
+        if (noisePrefixes != null) {
+          for (String noise : noisePrefixes) {
+            int k1a = buf.indexOf(noise,k1);
+            if (k1a>=0 && k1a<k2) {
+              k1 = k1a + noise.length();
+            }
+          }          
+        }
+        return buf.substring(k1,k2).trim();
+      }
+    }
+    return null;
+  }
+
+  //public static void main(String[] args) {
+  //  System.out.println(stripTags("is it true that<space>2<<second space>><almost last space>1<one more space>?",0));
+  //}
+
+}

Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java Sat May 14 13:51:35 2011
@@ -1,65 +1,65 @@
-package org.apache.lucene.benchmark.byTask.feeds;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.Date;
-
-/**
- * Parser for the FBIS docs in trec disks 4+5 collection format
- */
-public class TrecFBISParser extends TrecDocParser {
-
-  private static final String HEADER = "<HEADER>";
-  private static final String HEADER_END = "</HEADER>";
-  private static final int HEADER_END_LENGTH = HEADER_END.length();
-  
-  private static final String DATE1 = "<DATE1>";
-  private static final String DATE1_END = "</DATE1>";
-  
-  private static final String TI = "<TI>";
-  private static final String TI_END = "</TI>";
-
-  @Override
-  public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
-    int mark = 0; // that much is skipped
-    // optionally skip some of the text, set date, title
-    Date date = null;
-    String title = null;
-    int h1 = docBuf.indexOf(HEADER);
-    if (h1>=0) {
-      int h2 = docBuf.indexOf(HEADER_END,h1);
-      mark = h2+HEADER_END_LENGTH;
-      // date...
-      String dateStr = extract(docBuf, DATE1, DATE1_END, h2, null);
-      if (dateStr != null) {
-        date = trecSrc.parseDate(dateStr);
-      }
-      // title...
-      title = extract(docBuf, TI, TI_END, h2, null);
-    }
-    docData.clear();
-    docData.setName(name);
-    docData.setDate(date);
-    docData.setTitle(title);
-    docData.setBody(stripTags(docBuf, mark).toString());
-    return docData;
-  }
-
-}
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Date;
+
+/**
+ * Parser for the FBIS docs in trec disks 4+5 collection format: date and title are taken from before the header's end tag, the body from after it.
+ */
+public class TrecFBISParser extends TrecDocParser {
+
+  private static final String HEADER = "<HEADER>";
+  private static final String HEADER_END = "</HEADER>";
+  private static final int HEADER_END_LENGTH = HEADER_END.length();
+  
+  private static final String DATE1 = "<DATE1>";
+  private static final String DATE1_END = "</DATE1>";
+  
+  private static final String TI = "<TI>";
+  private static final String TI_END = "</TI>";
+
+  @Override
+  public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
+      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+    int mark = 0; // number of leading chars of docBuf that are left out of the body
+    // when a header is present: skip it in the body and take date and title from it
+    Date date = null;
+    String title = null;
+    int h1 = docBuf.indexOf(HEADER);
+    if (h1>=0) {
+      int h2 = docBuf.indexOf(HEADER_END,h1); // NOTE(review): -1 if the end tag is missing; mark is then HEADER_END_LENGTH-1 and extract() applies no position limit - confirm input always closes the header
+      mark = h2+HEADER_END_LENGTH;
+      // date: its tags must begin before the header's end tag
+      String dateStr = extract(docBuf, DATE1, DATE1_END, h2, null);
+      if (dateStr != null) {
+        date = trecSrc.parseDate(dateStr);
+      }
+      // title: same position limit as the date
+      title = extract(docBuf, TI, TI_END, h2, null);
+    }
+    docData.clear();
+    docData.setName(name);
+    docData.setDate(date);
+    docData.setTitle(title);
+    docData.setBody(stripTags(docBuf, mark).toString());
+    return docData;
+  }
+
+}

Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java Sat May 14 13:51:35 2011
@@ -1,66 +1,66 @@
-package org.apache.lucene.benchmark.byTask.feeds;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.Date;
-
-/**
- * Parser for the FR94 docs in trec disks 4+5 collection format
- */
-public class TrecFR94Parser extends TrecDocParser {
-
-  private static final String TEXT = "<TEXT>";
-  private static final int TEXT_LENGTH = TEXT.length();
-  private static final String TEXT_END = "</TEXT>";
-  
-  private static final String DATE = "<DATE>";
-  private static final String[] DATE_NOISE_PREFIXES = {
-    "DATE:",
-    "date:", //TODO improve date extraction for this format
-    "t.c.",
-  };
-  private static final String DATE_END = "</DATE>";
-  
-  //TODO can we also extract title for this format?
-  
-  @Override
-  public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
-    int mark = 0; // that much is skipped
-    // optionally skip some of the text, set date (no title?)
-    Date date = null;
-    int h1 = docBuf.indexOf(TEXT);
-    if (h1>=0) {
-      int h2 = docBuf.indexOf(TEXT_END,h1);
-      mark = h1+TEXT_LENGTH;
-      // date...
-      String dateStr = extract(docBuf, DATE, DATE_END, h2, DATE_NOISE_PREFIXES);
-      if (dateStr != null) {
-        dateStr = stripTags(dateStr,0).toString();
-        date = trecSrc.parseDate(dateStr.trim());
-      }
-    }
-    docData.clear();
-    docData.setName(name);
-    docData.setDate(date);
-    docData.setBody(stripTags(docBuf, mark).toString());
-    return docData;
-  }
-
-}
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Date;
+
+/**
+ * Parser for the FR94 docs in trec disks 4+5 collection format: the body starts after the opening text tag, the date is taken from before the closing one; no title is set.
+ */
+public class TrecFR94Parser extends TrecDocParser {
+
+  private static final String TEXT = "<TEXT>";
+  private static final int TEXT_LENGTH = TEXT.length();
+  private static final String TEXT_END = "</TEXT>";
+  
+  private static final String DATE = "<DATE>";
+  private static final String[] DATE_NOISE_PREFIXES = { // passed to extract(), which starts the date text after any of these found inside the date tags
+    "DATE:",
+    "date:", //TODO improve date extraction for this format
+    "t.c.",
+  };
+  private static final String DATE_END = "</DATE>";
+  
+  //TODO can we also extract title for this format?
+  
+  @Override
+  public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
+      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+    int mark = 0; // number of leading chars of docBuf that are left out of the body
+    // when a text section is present: start the body inside it and look for the date (no title?)
+    Date date = null;
+    int h1 = docBuf.indexOf(TEXT);
+    if (h1>=0) {
+      int h2 = docBuf.indexOf(TEXT_END,h1);
+      mark = h1+TEXT_LENGTH; // body starts right after the opening text tag
+      // date: its tags must begin before the closing text tag (no limit when that tag is absent, as h2 is then -1)
+      String dateStr = extract(docBuf, DATE, DATE_END, h2, DATE_NOISE_PREFIXES);
+      if (dateStr != null) {
+        dateStr = stripTags(dateStr,0).toString();
+        date = trecSrc.parseDate(dateStr.trim()); // stripTags replaced tags by blanks, hence the extra trim
+      }
+    }
+    docData.clear();
+    docData.setName(name);
+    docData.setDate(date);
+    docData.setBody(stripTags(docBuf, mark).toString());
+    return docData;
+  }
+
+}

Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java Sat May 14 13:51:35 2011
@@ -1,57 +1,57 @@
-package org.apache.lucene.benchmark.byTask.feeds;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.Date;
-
-/**
- * Parser for the FT docs in trec disks 4+5 collection format
- */
-public class TrecFTParser extends TrecDocParser {
-
-  private static final String DATE = "<DATE>";
-  private static final String DATE_END = "</DATE>";
-  
-  private static final String HEADLINE = "<HEADLINE>";
-  private static final String HEADLINE_END = "</HEADLINE>";
-
-  @Override
-  public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
-    int mark = 0; // that much is skipped
-
-    // date...
-    Date date = null;
-    String dateStr = extract(docBuf, DATE, DATE_END, -1, null);
-    if (dateStr != null) {
-      date = trecSrc.parseDate(dateStr);
-    }
-     
-    // title...
-    String title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
-
-    docData.clear();
-    docData.setName(name);
-    docData.setDate(date);
-    docData.setTitle(title);
-    docData.setBody(stripTags(docBuf, mark).toString());
-    return docData;
-  }
-
-}
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Date;
+
+/**
+ * Parser for the FT docs in trec disks 4+5 collection format
+ */
+public class TrecFTParser extends TrecDocParser {
+
+  private static final String DATE = "<DATE>";
+  private static final String DATE_END = "</DATE>";
+  
+  private static final String HEADLINE = "<HEADLINE>";
+  private static final String HEADLINE_END = "</HEADLINE>";
+
+  @Override
+  public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
+      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+    int mark = 0; // that much is skipped
+
+    // date...
+    Date date = null;
+    String dateStr = extract(docBuf, DATE, DATE_END, -1, null);
+    if (dateStr != null) {
+      date = trecSrc.parseDate(dateStr);
+    }
+     
+    // title...
+    String title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
+
+    docData.clear();
+    docData.setName(name);
+    docData.setDate(date);
+    docData.setTitle(title);
+    docData.setBody(stripTags(docBuf, mark).toString());
+    return docData;
+  }
+
+}

Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java Sat May 14 13:51:35 2011
@@ -1,71 +1,71 @@
-package org.apache.lucene.benchmark.byTask.feeds;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.Date;
-
-/**
- * Parser for the FT docs in trec disks 4+5 collection format
- */
-public class TrecLATimesParser extends TrecDocParser {
-
-  private static final String DATE = "<DATE>";
-  private static final String DATE_END = "</DATE>";
-  private static final String DATE_NOISE = "day,"; // anything aftre the ',' 
-
-  private static final String SUBJECT = "<SUBJECT>";
-  private static final String SUBJECT_END = "</SUBJECT>";
-  private static final String HEADLINE = "<HEADLINE>";
-  private static final String HEADLINE_END = "</HEADLINE>";
-  
-  @Override
-  public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
-    int mark = 0; // that much is skipped
-
-    // date...
-    Date date = null;
-    String dateStr = extract(docBuf, DATE, DATE_END, -1, null);
-    if (dateStr != null) {
-      int d2a = dateStr.indexOf(DATE_NOISE);
-      if (d2a > 0) {
-        dateStr = dateStr.substring(0,d2a+3); // we need the "day" part
-      }
-      dateStr = stripTags(dateStr,0).toString();
-      date = trecSrc.parseDate(dateStr.trim());
-    }
-     
-    // title... first try with SUBJECT, them with HEADLINE
-    String title = extract(docBuf, SUBJECT, SUBJECT_END, -1, null);
-    if (title==null) {
-      title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
-    }
-    if (title!=null) {
-      title = stripTags(title,0).toString().trim();
-    }
-    
-    docData.clear();
-    docData.setName(name);
-    docData.setDate(date);
-    docData.setTitle(title);
-    docData.setBody(stripTags(docBuf, mark).toString());
-    return docData;
-  }
-
-}
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Date;
+
+/**
+ * Parser for the LA Times docs in trec disks 4+5 collection format
+ */
+public class TrecLATimesParser extends TrecDocParser {
+
+  private static final String DATE = "<DATE>";
+  private static final String DATE_END = "</DATE>";
+  private static final String DATE_NOISE = "day,"; // anything after the ','
+
+  private static final String SUBJECT = "<SUBJECT>";
+  private static final String SUBJECT_END = "</SUBJECT>";
+  private static final String HEADLINE = "<HEADLINE>";
+  private static final String HEADLINE_END = "</HEADLINE>";
+  
+  @Override
+  public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
+      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+    int mark = 0; // that much is skipped
+
+    // date...
+    Date date = null;
+    String dateStr = extract(docBuf, DATE, DATE_END, -1, null);
+    if (dateStr != null) {
+      int d2a = dateStr.indexOf(DATE_NOISE);
+      if (d2a > 0) {
+        dateStr = dateStr.substring(0,d2a+3); // we need the "day" part
+      }
+      dateStr = stripTags(dateStr,0).toString();
+      date = trecSrc.parseDate(dateStr.trim());
+    }
+     
+    // title... first try with SUBJECT, then with HEADLINE
+    String title = extract(docBuf, SUBJECT, SUBJECT_END, -1, null);
+    if (title==null) {
+      title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
+    }
+    if (title!=null) {
+      title = stripTags(title,0).toString().trim();
+    }
+    
+    docData.clear();
+    docData.setName(name);
+    docData.setDate(date);
+    docData.setTitle(title);
+    docData.setBody(stripTags(docBuf, mark).toString());
+    return docData;
+  }
+
+}

Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java Sat May 14 13:51:35 2011
@@ -1,33 +1,33 @@
-package org.apache.lucene.benchmark.byTask.feeds;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-/**
- * Parser for trec docs which selects the parser to apply according 
- * to the source files path, defaulting to {@link TrecGov2Parser}.
- */
-public class TrecParserByPath extends TrecDocParser {
-
-  @Override
-  public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
-    return pathType2parser.get(pathType).parse(docData, name, trecSrc, docBuf, pathType);
-  }
-
-}
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+/**
+ * Parser for trec docs which selects the parser to apply according 
+ * to the source file's path, defaulting to {@link TrecGov2Parser}.
+ */
+public class TrecParserByPath extends TrecDocParser {
+
+  @Override
+  public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
+      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+    return pathType2parser.get(pathType).parse(docData, name, trecSrc, docBuf, pathType);
+  }
+
+}

Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java Sat May 14 13:51:35 2011
@@ -25,6 +25,7 @@ import org.apache.lucene.index.IndexDele
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.LogMergePolicy;
+import org.apache.lucene.index.TieredMergePolicy;
 import org.apache.lucene.index.MergeScheduler;
 import org.apache.lucene.index.ConcurrentMergeScheduler;
 import org.apache.lucene.index.MergePolicy;
@@ -150,6 +151,9 @@ public class CreateIndexTask extends Per
         LogMergePolicy logMergePolicy = (LogMergePolicy) iwConf.getMergePolicy();
         logMergePolicy.setUseCompoundFile(isCompound);
         logMergePolicy.setMergeFactor(config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR));
+      } else if(iwConf.getMergePolicy() instanceof TieredMergePolicy) {
+        TieredMergePolicy tieredMergePolicy = (TieredMergePolicy) iwConf.getMergePolicy();
+        tieredMergePolicy.setUseCompoundFile(isCompound);
       }
     }
     final double ramBuffer = config.get("ram.flush.mb",OpenIndexTask.DEFAULT_RAM_FLUSH_MB);

Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java Sat May 14 13:51:35 2011
@@ -43,7 +43,8 @@ import org.apache.lucene.document.Field;
  * <p>
  * The format of the output is set according to the output file extension.
  * Compression is recommended when the output file is expected to be large.
- * See info on file extensions in {@link StreamUtils.Type}
+ * See info on file extensions in
+ * {@link org.apache.lucene.benchmark.byTask.utils.StreamUtils.Type}
  * <p> 
  * Supports the following parameters:
  * <ul>

Modified: lucene/dev/branches/flexscoring/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?rev=1103112&r1=1103111&r2=1103112&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (original)
+++ lucene/dev/branches/flexscoring/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Sat May 14 13:51:35 2011
@@ -96,7 +96,7 @@ public class TestPerfTasksLogic extends 
     assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
     // now we should be able to open the index for write. 
     IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),
-        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())
+        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
             .setOpenMode(OpenMode.APPEND));
     iw.close();
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
@@ -183,7 +183,7 @@ public class TestPerfTasksLogic extends 
 
     assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
     // now we should be able to open the index for write.
-    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setOpenMode(OpenMode.APPEND));
+    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.APPEND));
     iw.close();
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
     assertEquals("100 docs were added to the index, this is what we expect to find!",100,ir.numDocs());
@@ -222,7 +222,7 @@ public class TestPerfTasksLogic extends 
 
     assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
     // now we should be able to open the index for write.
-    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setOpenMode(OpenMode.APPEND));
+    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.APPEND));
     iw.close();
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
     assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
@@ -295,7 +295,7 @@ public class TestPerfTasksLogic extends 
     assertEquals("TestSearchTask was supposed to be called!",139,CountingSearchTestTask.numSearches);
     assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
     // now we should be able to open the index for write. 
-    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setOpenMode(OpenMode.APPEND));
+    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.APPEND));
     iw.close();
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
     assertEquals("1 docs were added to the index, this is what we expect to find!",1,ir.numDocs());
@@ -407,7 +407,7 @@ public class TestPerfTasksLogic extends 
     // Index the line docs
     String algLines2[] = {
       "# ----- properties ",
-      "analyzer=org.apache.lucene.analysis.MockAnalyzer",
+      "analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
       "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
       "docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'),
       "content.source.forever=false",
@@ -425,7 +425,7 @@ public class TestPerfTasksLogic extends 
 
     // now we should be able to open the index for write. 
     IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),
-        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())
+        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
             .setOpenMode(OpenMode.APPEND));
     iw.close();
 
@@ -448,7 +448,7 @@ public class TestPerfTasksLogic extends 
     // then build index from the same docs
     String algLines1[] = {
       "# ----- properties ",
-      "analyzer=org.apache.lucene.analysis.MockAnalyzer",
+      "analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
       "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
       "docs.file=" + getReuters20LinesFile(),
       "# ----- alg ",
@@ -1021,18 +1021,18 @@ public class TestPerfTasksLogic extends 
                                       "two three four", "three four", 
                                       "three four five", "four five",
                                       "four five six", "five six" });
-    // MockAnalyzer, default maxShingleSize and outputUnigrams
+    // WhitespaceAnalyzer, default maxShingleSize and outputUnigrams
     benchmark = execBenchmark
-      (getShingleConfig("analyzer:MockAnalyzer"));
+      (getShingleConfig("analyzer:WhitespaceAnalyzer"));
     assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
                        new String[] { "one,two,three,", "one,two,three, four",
                                       "four", "four five", "five", "five six", 
                                       "six" });
     
-    // MockAnalyzer, maxShingleSize=3 and outputUnigrams=false
+    // WhitespaceAnalyzer, maxShingleSize=3 and outputUnigrams=false
     benchmark = execBenchmark
       (getShingleConfig
-        ("outputUnigrams:false,maxShingleSize:3,analyzer:MockAnalyzer"));
+        ("outputUnigrams:false,maxShingleSize:3,analyzer:WhitespaceAnalyzer"));
     assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
                        new String[] { "one,two,three, four", 
                                       "one,two,three, four five",