You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2011/02/09 10:36:03 UTC
svn commit: r1068809 [22/36] - in /lucene/dev/branches/docvalues: ./
dev-tools/eclipse/ dev-tools/idea/.idea/ dev-tools/idea/.idea/copyright/
dev-tools/idea/lucene/ dev-tools/idea/lucene/contrib/ant/
dev-tools/idea/lucene/contrib/queryparser/ dev-tools...
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex Wed Feb 9 09:35:27 2011
@@ -45,14 +45,6 @@ import org.apache.lucene.util.AttributeS
* <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
* <li><HIRAGANA>: A single hiragana character</li>
* </ul>
- * <b>WARNING</b>: Because JFlex does not support Unicode supplementary
- * characters (characters above the Basic Multilingual Plane, which contains
- * those up to and including U+FFFF), this scanner will not recognize them
- * properly. If you need to be able to process text containing supplementary
- * characters, consider using the ICU4J-backed implementation in modules/analysis/icu
- * (org.apache.lucene.analysis.icu.segmentation.ICUTokenizer)
- * instead of this class, since the ICU4J-backed implementation does not have
- * this limitation.
*/
%%
@@ -70,15 +62,30 @@ import org.apache.lucene.util.AttributeS
super(in);
%init}
+
+%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
+ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
+Format = ([\p{WB:Format}] | {FormatSupp})
+Numeric = ([\p{WB:Numeric}] | {NumericSupp})
+Extend = ([\p{WB:Extend}] | {ExtendSupp})
+Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
+MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
+MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
+MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
+ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
+ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
+Han = ([\p{Script:Han}] | {HanSupp})
+Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
+
// UAX#29 WB4. X (Extend | Format)* --> X
//
-ALetterEx = \p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]*
+ALetterEx = {ALetter} ({Format} | {Extend})*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
-NumericEx = [\p{WB:Numeric}\uFF10-\uFF19] [\p{WB:Format}\p{WB:Extend}]*
-KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
-MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
-MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
-ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
+NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
+KatakanaEx = {Katakana} ({Format} | {Extend})*
+MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
+MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
+ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
// URL and E-mail syntax specifications:
@@ -348,12 +355,12 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
//
// http://www.unicode.org/reports/tr14/#SA
//
-\p{LB:Complex_Context}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
+{ComplexContext}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
// UAX#29 WB14. Any ÷ Any
//
-\p{Script:Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
-\p{Script:Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
+{Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
+{Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
// UAX#29 WB3. CR × LF
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java Wed Feb 9 09:35:27 2011
@@ -46,6 +46,9 @@ public final class SynonymFilter extends
public SynonymFilter(TokenStream in, SynonymMap map) {
super(in);
+ if (map == null)
+ throw new IllegalArgumentException("map is required");
+
this.map = map;
// just ensuring these attributes exist...
addAttribute(CharTermAttribute.class);
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java Wed Feb 9 09:35:27 2011
@@ -78,6 +78,7 @@ public class SynonymMap {
}
+ @Override
public String toString() {
StringBuilder sb = new StringBuilder("<");
if (synonyms!=null) {
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java Wed Feb 9 09:35:27 2011
@@ -57,6 +57,25 @@ public class StemmerUtil {
}
/**
+ * Returns true if the character array ends with the suffix.
+ *
+ * @param s Input Buffer
+ * @param len length of input buffer
+ * @param suffix Suffix string to test
+ * @return true if <code>s</code> ends with <code>suffix</code>
+ */
+ public static boolean endsWith(char s[], int len, char suffix[]) {
+ final int suffixLen = suffix.length;
+ if (suffixLen > len)
+ return false;
+ for (int i = suffixLen - 1; i >= 0; i--)
+ if (s[len -(suffixLen - i)] != suffix[i])
+ return false;
+
+ return true;
+ }
+
+ /**
* Delete a character in-place
*
* @param s Input Buffer
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java Wed Feb 9 09:35:27 2011
@@ -169,7 +169,7 @@ public class HTMLStripCharFilterTest ext
public void testBufferOverflow() throws Exception {
StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
- testBuilder.append("ah<?> ");
+ testBuilder.append("ah<?> ??????");
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java Wed Feb 9 09:35:27 2011
@@ -201,4 +201,10 @@ public class TestStandardAnalyzer extend
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
wordBreakTest.test(a);
}
+
+ public void testSupplementary() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "𩬅艱鍟䇹愯瀛",
+ new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
+ new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
+ }
}
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java Wed Feb 9 09:35:27 2011
@@ -400,4 +400,10 @@ public class TestUAX29URLEmailTokenizer
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
wordBreakTest.test(a);
}
+
+ public void testSupplementary() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "𩬅艱鍟䇹愯瀛",
+ new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
+ new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
+ }
}
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemmer.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemmer.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemmer.java Wed Feb 9 09:35:27 2011
@@ -1,5 +1,22 @@
package org.apache.lucene.analysis.el;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java Wed Feb 9 09:35:27 2011
@@ -35,16 +35,26 @@ public class TestKeepWordFilter extends
words.add( "aaa" );
words.add( "bbb" );
- String input = "aaa BBB ccc ddd EEE";
+ String input = "xxx yyy aaa zzz BBB ccc ddd EEE";
// Test Stopwords
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
- stream = new KeepWordFilter(stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
- assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
+ stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
+ assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 });
// Now force case
stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
- stream = new KeepWordFilter(stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
- assertTokenStreamContents(stream, new String[] { "aaa" });
+ stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
+ assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 });
+
+ // Test Stopwords
+ stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+ stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
+ assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 });
+
+ // Now force case
+ stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+ stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
+ assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
}
}
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilter.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilter.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilter.java Wed Feb 9 09:35:27 2011
@@ -2,6 +2,7 @@ package org.apache.lucene.analysis.misce
import java.io.IOException;
import java.io.StringReader;
+import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
@@ -57,6 +58,19 @@ public class TestKeywordMarkerFilter ext
"The quIck browN LuceneFox Jumps")), set2)), output);
}
+ // LUCENE-2901
+ public void testComposition() throws Exception {
+ TokenStream ts = new LowerCaseFilterMock(
+ new KeywordMarkerFilter(
+ new KeywordMarkerFilter(
+ new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+ new StringReader("Dogs Trees Birds Houses")),
+ new HashSet<String>(Arrays.asList(new String[] { "Birds", "Houses" }))),
+ new HashSet<String>(Arrays.asList(new String[] { "Dogs", "Trees" }))));
+
+ assertTokenStreamContents(ts, new String[] { "Dogs", "Trees", "Birds", "Houses" });
+ }
+
public static final class LowerCaseFilterMock extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java Wed Feb 9 09:35:27 2011
@@ -24,19 +24,24 @@ import java.io.StringReader;
public class TestLengthFilter extends BaseTokenStreamTestCase {
- public void testFilter() throws Exception {
+ public void testFilterNoPosIncr() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
- LengthFilter filter = new LengthFilter(stream, 2, 6);
- CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+ LengthFilter filter = new LengthFilter(false, stream, 2, 6);
+ assertTokenStreamContents(filter,
+ new String[]{"short", "ab", "foo"},
+ new int[]{1, 1, 1}
+ );
+ }
- assertTrue(filter.incrementToken());
- assertEquals("short", termAtt.toString());
- assertTrue(filter.incrementToken());
- assertEquals("ab", termAtt.toString());
- assertTrue(filter.incrementToken());
- assertEquals("foo", termAtt.toString());
- assertFalse(filter.incrementToken());
+ public void testFilterWithPosIncr() throws Exception {
+ TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+ new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
+ LengthFilter filter = new LengthFilter(true, stream, 2, 6);
+ assertTokenStreamContents(filter,
+ new String[]{"short", "ab", "foo"},
+ new int[]{1, 4, 2}
+ );
}
}
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenCountAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenCountAnalyzer.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenCountAnalyzer.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenCountAnalyzer.java Wed Feb 9 09:35:27 2011
@@ -22,8 +22,16 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.Directory;
public class TestLimitTokenCountAnalyzer extends BaseTokenStreamTestCase {
@@ -39,4 +47,26 @@ public class TestLimitTokenCountAnalyzer
assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
}
+ public void testLimitTokenCountIndexWriter() throws IOException {
+ Directory dir = newDirectory();
+
+ IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
+ TEST_VERSION_CURRENT, new LimitTokenCountAnalyzer(new MockAnalyzer(), 100000)));
+
+ Document doc = new Document();
+ StringBuilder b = new StringBuilder();
+ for(int i=0;i<10000;i++)
+ b.append(" a");
+ b.append(" x");
+ doc.add(newField("field", b.toString(), Field.Store.NO, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ writer.close();
+
+ IndexReader reader = IndexReader.open(dir, true);
+ Term t = new Term("field", "x");
+ assertEquals(1, reader.docFreq(t));
+ reader.close();
+ dir.close();
+ }
+
}
\ No newline at end of file
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilter.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilter.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilter.java Wed Feb 9 09:35:27 2011
@@ -47,6 +47,7 @@ public class TestRemoveDuplicatesTokenFi
CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+ @Override
public boolean incrementToken() {
if (toks.hasNext()) {
clearAttributes();
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java Wed Feb 9 09:35:27 2011
@@ -87,6 +87,7 @@ public class TestTrimFilter extends Base
this(tokens.toArray(new Token[tokens.size()]));
}
+ @Override
public boolean incrementToken() throws IOException {
if (index >= tokens.length)
return false;
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java Wed Feb 9 09:35:27 2011
@@ -213,6 +213,7 @@ public class TestWordDelimiterFilter ext
/* analyzer that uses whitespace + wdf */
Analyzer a = new Analyzer() {
+ @Override
public TokenStream tokenStream(String field, Reader reader) {
return new WordDelimiterFilter(
new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader),
@@ -239,6 +240,7 @@ public class TestWordDelimiterFilter ext
/* analyzer that will consume tokens with large position increments */
Analyzer a2 = new Analyzer() {
+ @Override
public TokenStream tokenStream(String field, Reader reader) {
return new WordDelimiterFilter(
new LargePosIncTokenFilter(
@@ -271,6 +273,7 @@ public class TestWordDelimiterFilter ext
new int[] { 1, 11, 1 });
Analyzer a3 = new Analyzer() {
+ @Override
public TokenStream tokenStream(String field, Reader reader) {
StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), StandardAnalyzer.STOP_WORDS_SET);
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java Wed Feb 9 09:35:27 2011
@@ -76,7 +76,10 @@ public class QueryAutoStopWordAnalyzerTe
private int search(Analyzer a, String queryString) throws IOException, ParseException {
QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "repetitiveField", a);
Query q = qp.parse(queryString);
- return new IndexSearcher(reader).search(q, null, 1000).totalHits;
+ IndexSearcher searcher = newSearcher(reader);
+ int hits = searcher.search(q, null, 1000).totalHits;
+ searcher.close();
+ return hits;
}
public void testUninitializedAnalyzer() throws Exception {
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java Wed Feb 9 09:35:27 2011
@@ -395,6 +395,7 @@ public class TestSynonymFilter extends B
this(tokens.toArray(new Token[tokens.size()]));
}
+ @Override
public boolean incrementToken() throws IOException {
if (index >= tokens.length)
return false;
Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java Wed Feb 9 09:35:27 2011
@@ -141,7 +141,7 @@ public abstract class CollationTestBase
writer.close();
IndexReader reader = IndexReader.open(farsiIndex, true);
- IndexSearcher search = new IndexSearcher(reader);
+ IndexSearcher search = newSearcher(reader);
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
// orders the U+0698 character before the U+0633 character, so the single
Modified: lucene/dev/branches/docvalues/modules/analysis/icu/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/icu/build.xml?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/icu/build.xml (original)
+++ lucene/dev/branches/docvalues/modules/analysis/icu/build.xml Wed Feb 9 09:35:27 2011
@@ -49,6 +49,7 @@
<path id="test.classpath">
<pathelement path="${analyzers-common.jar}"/>
<path refid="classpath"/>
+ <pathelement location="../../../lucene/build/classes/test-framework/"/>
<pathelement location="../../../lucene/build/classes/test/"/>
<pathelement location="../build/common/classes/test/"/>
<path refid="junit-path"/>
@@ -107,6 +108,23 @@ are part of the ICU4C package. See http:
</java>
</target>
+ <property name="uax29.supp.macros.output.file"
+ location="../common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro"/>
+
+ <target name="gen-uax29-supp-macros" depends="compile-tools">
+ <java
+ classname="org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros"
+ dir="."
+ fork="true"
+ failonerror="true"
+ output="${uax29.supp.macros.output.file}">
+ <classpath>
+ <path refid="additional.dependencies"/>
+ <pathelement location="${build.dir}/classes/tools"/>
+ </classpath>
+ </java>
+ </target>
+
<target name="compile-tools">
<compile
srcdir="src/tools/java"
Modified: lucene/dev/branches/docvalues/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java Wed Feb 9 09:35:27 2011
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.icu.t
import java.io.Serializable;
import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
import com.ibm.icu.lang.UScript;
@@ -77,7 +78,7 @@ public class ScriptAttributeImpl extends
}
@Override
- public String toString() {
- return "script=" + getName();
+ public void reflectWith(AttributeReflector reflector) {
+ reflector.reflect(ScriptAttribute.class, "script", getName());
}
}
Modified: lucene/dev/branches/docvalues/modules/analysis/phonetic/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/phonetic/build.xml?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/phonetic/build.xml (original)
+++ lucene/dev/branches/docvalues/modules/analysis/phonetic/build.xml Wed Feb 9 09:35:27 2011
@@ -48,6 +48,7 @@
<path id="test.classpath">
<pathelement path="${analyzers-common.jar}"/>
<path refid="classpath"/>
+ <pathelement location="../../../lucene/build/classes/test-framework/"/>
<pathelement location="../../../lucene/build/classes/test/"/>
<pathelement location="../build/common/classes/test/"/>
<path refid="junit-path"/>
Modified: lucene/dev/branches/docvalues/modules/analysis/smartcn/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/smartcn/build.xml?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/smartcn/build.xml (original)
+++ lucene/dev/branches/docvalues/modules/analysis/smartcn/build.xml Wed Feb 9 09:35:27 2011
@@ -25,7 +25,6 @@
<property name="build.dir" location="../build/smartcn" />
<property name="dist.dir" location="../dist/smartcn" />
- <property name="maven.dist.dir" location="../dist/maven" />
<import file="../../../lucene/contrib/contrib-build.xml"/>
@@ -40,6 +39,7 @@
<path id="test.classpath">
<pathelement path="${analyzers-common.jar}"/>
<path refid="classpath"/>
+ <pathelement location="../../../lucene/build/classes/test-framework"/>
<pathelement location="../../../lucene/build/classes/test/"/>
<path refid="junit-path"/>
<pathelement location="${build.dir}/classes/java"/>
Modified: lucene/dev/branches/docvalues/modules/analysis/stempel/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/stempel/build.xml?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/stempel/build.xml (original)
+++ lucene/dev/branches/docvalues/modules/analysis/stempel/build.xml Wed Feb 9 09:35:27 2011
@@ -25,7 +25,6 @@
<property name="build.dir" location="../build/stempel" />
<property name="dist.dir" location="../dist/stempel" />
- <property name="maven.dist.dir" location="../dist/maven" />
<import file="../../../lucene/contrib/contrib-build.xml"/>
@@ -39,6 +38,7 @@
<path id="test.classpath">
<path refid="classpath"/>
+ <pathelement location="../../../lucene/build/classes/test-framework"/>
<pathelement location="../../../lucene/build/classes/test/"/>
<path refid="junit-path"/>
<pathelement location="${build.dir}/classes/java"/>
Modified: lucene/dev/branches/docvalues/modules/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/CHANGES.txt?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/CHANGES.txt (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/CHANGES.txt Wed Feb 9 09:35:27 2011
@@ -2,6 +2,21 @@ Lucene Benchmark Contrib Change Log
The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
+02/05/2011
+ LUCENE-1540: Improvements to contrib.benchmark for TREC collections.
+ ContentSource can now process plain text files, gzip files, and bzip2 files.
+ TREC doc parsing now handles the TREC gov2 collection and TREC disks 4&5-CR
+ collection (both used by many TREC tasks). (Shai Erera, Doron Cohen)
+
+01/26/2011
+ LUCENE-929: ExtractReuters first extracts to a tmp dir and then renames. That
+ way, if a previous extract attempt failed, "ant extract-reuters" will still
+ extract the files. (Shai Erera, Doron Cohen, Grant Ingersoll)
+
+01/24/2011
+ LUCENE-2885: Add WaitForMerges task (calls IndexWriter.waitForMerges()).
+ (Mike McCandless)
+
10/10/2010
The locally built patched version of the Xerces-J jar introduced
as part of LUCENE-1591 is no longer required, because Xerces
Modified: lucene/dev/branches/docvalues/modules/benchmark/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/build.xml?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/build.xml (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/build.xml Wed Feb 9 09:35:27 2011
@@ -7,7 +7,7 @@
<property name="build.dir" location="build/" />
<property name="dist.dir" location="dist/" />
- <property name="maven.dist.dir" location="dist/maven" />
+ <property name="maven.dist.dir" location="../dist/maven" />
<import file="../../lucene/contrib/contrib-build.xml"/>
<property name="working.dir" location="work"/>
@@ -87,7 +87,6 @@
</target>
<target name="extract-reuters" depends="check-files" unless="reuters.extracted">
- <mkdir dir="${working.dir}/reuters-out"/>
<java classname="org.apache.lucene.benchmark.utils.ExtractReuters" maxmemory="1024M" fork="true">
<classpath refid="run.classpath"/>
<arg file="${working.dir}/reuters"/>
@@ -260,5 +259,6 @@
</target>
<target name="init" depends="contrib-build.init,compile-demo,compile-memory,compile-highlighter,compile-analyzers-common"/>
-
+
+ <target name="dist-maven" depends="jar-core,javadocs,contrib-build.dist-maven"/>
</project>
Modified: lucene/dev/branches/docvalues/modules/benchmark/conf/createLineFile.alg
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/conf/createLineFile.alg?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/conf/createLineFile.alg (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/conf/createLineFile.alg Wed Feb 9 09:35:27 2011
@@ -29,10 +29,14 @@
#
# Where to get documents from:
-content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
# Where to write the line file output:
-line.file.out=work/reuters.lines.txt
+line.file.out=/x/tmp/enwiki.out.txt
+
+docs.file=/x/lucene/data/enwiki/enwiki-20110115-pages-articles.xml
+
+keep.image.only.docs = false
# Stop after processing the document feed once:
content.source.forever=false
Modified: lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java Wed Feb 9 09:35:27 2011
@@ -56,11 +56,14 @@ import org.apache.lucene.benchmark.byTas
public abstract class ContentSource {
private static final int BZIP = 0;
- private static final int OTHER = 1;
+ private static final int GZIP = 1;
+ private static final int OTHER = 2;
private static final Map<String,Integer> extensionToType = new HashMap<String,Integer>();
static {
extensionToType.put(".bz2", Integer.valueOf(BZIP));
extensionToType.put(".bzip", Integer.valueOf(BZIP));
+ extensionToType.put(".gz", Integer.valueOf(GZIP));
+ extensionToType.put(".gzip", Integer.valueOf(GZIP));
}
protected static final int BUFFER_SIZE = 1 << 16; // 64K
@@ -78,11 +81,13 @@ public abstract class ContentSource {
private CompressorStreamFactory csFactory = new CompressorStreamFactory();
+ /** update count of bytes generated by this source */
protected final synchronized void addBytes(long numBytes) {
bytesCount += numBytes;
totalBytesCount += numBytes;
}
+ /** update count of documents generated by this source */
protected final synchronized void addDoc() {
++docsCount;
++totalDocsCount;
@@ -130,21 +135,25 @@ public abstract class ContentSource {
type = typeInt.intValue();
}
}
- switch (type) {
- case BZIP:
- try {
+
+ try {
+ switch (type) {
+ case BZIP:
// According to BZip2CompressorInputStream's code, it reads the first
// two file header chars ('B' and 'Z'). It is important to wrap the
// underlying input stream with a buffered one since
// Bzip2CompressorInputStream uses the read() method exclusively.
is = csFactory.createCompressorInputStream("bzip2", is);
- } catch (CompressorException e) {
- IOException ioe = new IOException(e.getMessage());
- ioe.initCause(e);
- throw ioe;
- }
- break;
- default: // Do nothing, stay with FileInputStream
+ break;
+ case GZIP:
+ is = csFactory.createCompressorInputStream("gz", is);
+ break;
+ default: // Do nothing, stay with FileInputStream
+ }
+ } catch (CompressorException e) {
+ IOException ioe = new IOException(e.getMessage());
+ ioe.initCause(e);
+ throw ioe;
}
return is;
Modified: lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java Wed Feb 9 09:35:27 2011
@@ -29,11 +29,14 @@ import java.util.Properties;
*/
public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser {
- public DocData parse(DocData docData, String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
+ public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
org.apache.lucene.demo.html.HTMLParser p = new org.apache.lucene.demo.html.HTMLParser(reader);
// title
- String title = p.getTitle();
+ if (title==null) {
+ title = p.getTitle();
+ }
+
// properties
Properties props = p.getMetaTags();
// body
Modified: lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java Wed Feb 9 09:35:27 2011
@@ -29,16 +29,18 @@ public interface HTMLParser {
/**
* Parse the input Reader and return DocData.
- * A provided name or date is used for the result, otherwise an attempt is
- * made to set them from the parsed data.
- * @param dateFormat date formatter to use for extracting the date.
- * @param name name of the result doc data. If null, attempt to set by parsed data.
+ * The provided name,title,date are used for the result, unless when they're null,
+ * in which case an attempt is made to set them from the parsed data.
+ * @param docData result reused
+ * @param name name of the result doc data.
* @param date date of the result doc data. If null, attempt to set by parsed data.
- * @param reader of html text to parse.
+ * @param title title of the result doc data. If null, attempt to set by parsed data.
+ * @param reader reader of html text to parse.
+ * @param dateFormat date formatter to use for extracting the date.
* @return Parsed doc data.
* @throws IOException
* @throws InterruptedException
*/
- public DocData parse(DocData docData, String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
+ public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
}
Modified: lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishQueryMaker.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishQueryMaker.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishQueryMaker.java (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishQueryMaker.java Wed Feb 9 09:35:27 2011
@@ -1,3 +1,20 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.benchmark.byTask.feeds;
import org.apache.lucene.analysis.Analyzer;
Modified: lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java Wed Feb 9 09:35:27 2011
@@ -19,8 +19,8 @@ package org.apache.lucene.benchmark.byTa
import java.io.BufferedReader;
import java.io.File;
-import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.text.DateFormat;
@@ -29,8 +29,8 @@ import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
-import java.util.zip.GZIPInputStream;
+import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StringBuilderReader;
import org.apache.lucene.util.ThreadInterruptedException;
@@ -46,8 +46,10 @@ import org.apache.lucene.util.ThreadInte
* <li><b>docs.dir</b> - specifies the directory where the TREC files reside.
* Can be set to a relative path if "work.dir" is also specified
* (<b>default=trec</b>).
+ * <li><b>trec.doc.parser</b> - specifies the {@link TrecDocParser} class to use for
+ * parsing the TREC documents content (<b>default=TrecGov2Parser</b>).
* <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
- * parsing the TREC documents content (<b>default=DemoHTMLParser</b>).
+ * parsing the HTML parts of the TREC documents content (<b>default=DemoHTMLParser</b>).
* <li><b>content.source.encoding</b> - if not specified, ISO-8859-1 is used.
* <li><b>content.source.excludeIteration</b> - if true, do not append iteration number to docname
* </ul>
@@ -59,22 +61,24 @@ public class TrecContentSource extends C
ParsePosition pos;
}
- private static final String DATE = "Date: ";
- private static final String DOCHDR = "<DOCHDR>";
- private static final String TERMINATING_DOCHDR = "</DOCHDR>";
- private static final String DOCNO = "<DOCNO>";
- private static final String TERMINATING_DOCNO = "</DOCNO>";
- private static final String DOC = "<DOC>";
- private static final String TERMINATING_DOC = "</DOC>";
+ public static final String DOCNO = "<DOCNO>";
+ public static final String TERMINATING_DOCNO = "</DOCNO>";
+ public static final String DOC = "<DOC>";
+ public static final String TERMINATING_DOC = "</DOC>";
- private static final String NEW_LINE = System.getProperty("line.separator");
+ /** separator between lines in the byffer */
+ public static final String NEW_LINE = System.getProperty("line.separator");
private static final String DATE_FORMATS [] = {
- "EEE, dd MMM yyyy kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT
- "EEE MMM dd kk:mm:ss yyyy z", // Tue Dec 09 16:45:08 2003 EST
- "EEE, dd-MMM-':'y kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT
- "EEE, dd-MMM-yyy kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT
- "EEE MMM dd kk:mm:ss yyyy", // Tue Dec 09 16:45:08 2003
+ "EEE, dd MMM yyyy kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT
+ "EEE MMM dd kk:mm:ss yyyy z", // Tue Dec 09 16:45:08 2003 EST
+ "EEE, dd-MMM-':'y kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT
+ "EEE, dd-MMM-yyy kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT
+ "EEE MMM dd kk:mm:ss yyyy", // Tue Dec 09 16:45:08 2003
+ "dd MMM yyyy", // 1 March 1994
+ "MMM dd, yyyy", // February 3, 1994
+ "yyMMdd", // 910513
+ "hhmm z.z.z. MMM dd, yyyy", // 0901 u.t.c. April 28, 1994
};
private ThreadLocal<DateFormatInfo> dateFormats = new ThreadLocal<DateFormatInfo>();
@@ -83,7 +87,7 @@ public class TrecContentSource extends C
private File dataDir = null;
private ArrayList<File> inputFiles = new ArrayList<File>();
private int nextFile = 0;
- private int rawDocSize;
+ private int rawDocSize = 0;
// Use to synchronize threads on reading from the TREC documents.
private Object lock = new Object();
@@ -92,7 +96,10 @@ public class TrecContentSource extends C
BufferedReader reader;
int iteration = 0;
HTMLParser htmlParser;
+
private boolean excludeDocnameIteration;
+ private TrecDocParser trecDocParser = new TrecGov2Parser(); // default
+ ParsePathType currPathType; // not private for tests
private DateFormatInfo getDateFormatInfo() {
DateFormatInfo dfi = dateFormats.get();
@@ -118,7 +125,7 @@ public class TrecContentSource extends C
return sb;
}
- private Reader getTrecDocReader(StringBuilder docBuffer) {
+ Reader getTrecDocReader(StringBuilder docBuffer) {
StringBuilderReader r = trecDocReader.get();
if (r == null) {
r = new StringBuilderReader(docBuffer);
@@ -129,10 +136,21 @@ public class TrecContentSource extends C
return r;
}
- // read until finding a line that starts with the specified prefix, or a terminating tag has been found.
- private void read(StringBuilder buf, String prefix, boolean collectMatchLine,
- boolean collectAll, String terminatingTag)
- throws IOException, NoMoreDataException {
+ HTMLParser getHtmlParser() {
+ return htmlParser;
+ }
+
+ /**
+ * Read until a line starting with the specified <code>lineStart</code>.
+ * @param buf buffer for collecting the data if so specified/
+ * @param lineStart line start to look for, must not be null.
+ * @param collectMatchLine whether to collect the matching line into <code>buffer</code>.
+ * @param collectAll whether to collect all lines into <code>buffer</code>.
+ * @throws IOException
+ * @throws NoMoreDataException
+ */
+ private void read(StringBuilder buf, String lineStart,
+ boolean collectMatchLine, boolean collectAll) throws IOException, NoMoreDataException {
String sep = "";
while (true) {
String line = reader.readLine();
@@ -144,20 +162,12 @@ public class TrecContentSource extends C
rawDocSize += line.length();
- if (line.startsWith(prefix)) {
+ if (lineStart!=null && line.startsWith(lineStart)) {
if (collectMatchLine) {
buf.append(sep).append(line);
sep = NEW_LINE;
}
- break;
- }
-
- if (terminatingTag != null && line.startsWith(terminatingTag)) {
- // didn't find the prefix that was asked, but the terminating
- // tag was found. set the length to 0 to signal no match was
- // found.
- buf.setLength(0);
- break;
+ return;
}
if (collectAll) {
@@ -169,7 +179,7 @@ public class TrecContentSource extends C
void openNextFile() throws NoMoreDataException, IOException {
close();
- int retries = 0;
+ currPathType = null;
while (true) {
if (nextFile >= inputFiles.size()) {
// exhausted files, start a new round, unless forever set to false.
@@ -184,13 +194,13 @@ public class TrecContentSource extends C
System.out.println("opening: " + f + " length: " + f.length());
}
try {
- GZIPInputStream zis = new GZIPInputStream(new FileInputStream(f), BUFFER_SIZE);
- reader = new BufferedReader(new InputStreamReader(zis, encoding), BUFFER_SIZE);
+ InputStream inputStream = getInputStream(f); // support either gzip, bzip2, or regular text file, by extension
+ reader = new BufferedReader(new InputStreamReader(inputStream, encoding), BUFFER_SIZE);
+ currPathType = TrecDocParser.pathType(f);
return;
} catch (Exception e) {
- retries++;
- if (retries < 20 && verbose) {
- System.out.println("Skipping 'bad' file " + f.getAbsolutePath() + " #retries=" + retries);
+ if (verbose) {
+ System.out.println("Skipping 'bad' file " + f.getAbsolutePath()+" due to "+e.getMessage());
continue;
}
throw new NoMoreDataException();
@@ -198,7 +208,7 @@ public class TrecContentSource extends C
}
}
- Date parseDate(String dateStr) {
+ public Date parseDate(String dateStr) {
dateStr = dateStr.trim();
DateFormatInfo dfi = getDateFormatInfo();
for (int i = 0; i < dfi.dfs.length; i++) {
@@ -237,70 +247,47 @@ public class TrecContentSource extends C
@Override
public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
- String dateStr = null, name = null;
- Reader r = null;
+ String name = null;
+ StringBuilder docBuf = getDocBuffer();
+ ParsePathType parsedPathType;
+
// protect reading from the TREC files by multiple threads. The rest of the
- // method, i.e., parsing the content and returning the DocData can run
- // unprotected.
+ // method, i.e., parsing the content and returning the DocData can run unprotected.
synchronized (lock) {
if (reader == null) {
openNextFile();
}
-
- StringBuilder docBuf = getDocBuffer();
- // 1. skip until doc start
+ // 1. skip until doc start - required for all TREC formats
docBuf.setLength(0);
- read(docBuf, DOC, false, false, null);
-
- // 2. name
+ read(docBuf, DOC, false, false);
+
+ // save parsedFile for passing trecDataParser after the sync block, in
+ // case another thread will open another file in between.
+ parsedPathType = currPathType;
+
+ // 2. name - required for all TREC formats
docBuf.setLength(0);
- read(docBuf, DOCNO, true, false, null);
+ read(docBuf, DOCNO, true, false);
name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO,
- DOCNO.length()));
- if (!excludeDocnameIteration)
+ DOCNO.length())).trim();
+
+ if (!excludeDocnameIteration) {
name = name + "_" + iteration;
-
- // 3. skip until doc header
- docBuf.setLength(0);
- read(docBuf, DOCHDR, false, false, null);
-
- boolean findTerminatingDocHdr = false;
-
- // 4. date - look for the date only until /DOCHDR
- docBuf.setLength(0);
- read(docBuf, DATE, true, false, TERMINATING_DOCHDR);
- if (docBuf.length() != 0) {
- // Date found.
- dateStr = docBuf.substring(DATE.length());
- findTerminatingDocHdr = true;
}
- // 5. skip until end of doc header
- if (findTerminatingDocHdr) {
- docBuf.setLength(0);
- read(docBuf, TERMINATING_DOCHDR, false, false, null);
- }
-
- // 6. collect until end of doc
+ // 3. read all until end of doc
docBuf.setLength(0);
- read(docBuf, TERMINATING_DOC, false, true, null);
-
- // 7. Set up a Reader over the read content
- r = getTrecDocReader(docBuf);
- // Resetting the thread's reader means it will reuse the instance
- // allocated as well as re-read from docBuf.
- r.reset();
-
- // count char length of parsed html text (larger than the plain doc body text).
- addBytes(docBuf.length());
+ read(docBuf, TERMINATING_DOC, false, true);
}
+
+ // count char length of text to be parsed (may be larger than the resulted plain doc body text).
+ addBytes(docBuf.length());
// This code segment relies on HtmlParser being thread safe. When we get
// here, everything else is already private to that thread, so we're safe.
- Date date = dateStr != null ? parseDate(dateStr) : null;
try {
- docData = htmlParser.parse(docData, name, date, r, null);
+ docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
addDoc();
} catch (InterruptedException ie) {
throw new ThreadInterruptedException(ie);
@@ -322,27 +309,40 @@ public class TrecContentSource extends C
@Override
public void setConfig(Config config) {
super.setConfig(config);
+ // dirs
File workDir = new File(config.get("work.dir", "work"));
String d = config.get("docs.dir", "trec");
dataDir = new File(d);
if (!dataDir.isAbsolute()) {
dataDir = new File(workDir, d);
}
+ // files
collectFiles(dataDir, inputFiles);
if (inputFiles.size() == 0) {
throw new IllegalArgumentException("No files in dataDir: " + dataDir);
}
+ // trec doc parser
+ try {
+ String trecDocParserClassName = config.get("trec.doc.parser", "org.apache.lucene.benchmark.byTask.feeds.TrecGov2Parser");
+ trecDocParser = Class.forName(trecDocParserClassName).asSubclass(TrecDocParser.class).newInstance();
+ } catch (Exception e) {
+ // Should not get here. Throw runtime exception.
+ throw new RuntimeException(e);
+ }
+ // html parser
try {
- String parserClassName = config.get("html.parser",
+ String htmlParserClassName = config.get("html.parser",
"org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser");
- htmlParser = Class.forName(parserClassName).asSubclass(HTMLParser.class).newInstance();
+ htmlParser = Class.forName(htmlParserClassName).asSubclass(HTMLParser.class).newInstance();
} catch (Exception e) {
// Should not get here. Throw runtime exception.
throw new RuntimeException(e);
}
+ // encoding
if (encoding == null) {
encoding = "ISO-8859-1";
}
+ // iteration exclusion in doc name
excludeDocnameIteration = config.get("content.source.excludeIteration", false);
}
Modified: lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java Wed Feb 9 09:35:27 2011
@@ -46,8 +46,7 @@ import java.io.PrintStream;
* Create an index. <br>
* Other side effects: index writer object in perfRunData is set. <br>
* Relevant properties: <code>merge.factor (default 10),
- * max.buffered (default no flush), max.field.length (default
- * 10,000 tokens), max.field.length, compound (default true), ram.flush.mb [default 0],
+ * max.buffered (default no flush), compound (default true), ram.flush.mb [default 0],
* merge.policy (default org.apache.lucene.index.LogByteSizeMergePolicy),
* merge.scheduler (default
* org.apache.lucene.index.ConcurrentMergeScheduler),
@@ -153,7 +152,6 @@ public class CreateIndexTask extends Per
logMergePolicy.setMergeFactor(config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR));
}
}
- iwConf.setMaxFieldLength(config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH));
final double ramBuffer = config.get("ram.flush.mb",OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
final int maxBuffered = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED);
if (maxBuffered == IndexWriterConfig.DISABLE_AUTO_FLUSH) {
Modified: lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NearRealtimeReaderTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NearRealtimeReaderTask.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NearRealtimeReaderTask.java (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NearRealtimeReaderTask.java Wed Feb 9 09:35:27 2011
@@ -59,7 +59,7 @@ public class NearRealtimeReaderTask exte
}
long t = System.currentTimeMillis();
- IndexReader r = IndexReader.open(w);
+ IndexReader r = IndexReader.open(w, true);
runData.setIndexReader(r);
// Transfer our reference to runData
r.decRef();
Modified: lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java Wed Feb 9 09:35:27 2011
@@ -26,7 +26,6 @@ import org.apache.lucene.index.LogMergeP
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import java.io.IOException;
-
/**
* Open an index writer.
* <br>Other side effects: index writer object in perfRunData is set.
@@ -41,7 +40,6 @@ import java.io.IOException;
public class OpenIndexTask extends PerfTask {
public static final int DEFAULT_MAX_BUFFERED = IndexWriterConfig.DEFAULT_MAX_BUFFERED_DOCS;
- public static final int DEFAULT_MAX_FIELD_LENGTH = IndexWriterConfig.UNLIMITED_FIELD_LENGTH;
public static final int DEFAULT_MERGE_PFACTOR = LogMergePolicy.DEFAULT_MERGE_FACTOR;
public static final double DEFAULT_RAM_FLUSH_MB = (int) IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB;
private String commitUserData;
Modified: lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBuilderReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBuilderReader.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBuilderReader.java (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBuilderReader.java Wed Feb 9 09:35:27 2011
@@ -158,8 +158,10 @@ public class StringBuilderReader extends
synchronized (lock) {
this.sb = sb;
length = sb.length();
+ next = mark = 0;
}
}
+
@Override
public long skip(long ns) throws IOException {
synchronized (lock) {
Modified: lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SubmissionReport.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SubmissionReport.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SubmissionReport.java (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SubmissionReport.java Wed Feb 9 09:35:27 2011
@@ -28,7 +28,7 @@ import org.apache.lucene.search.TopDocs;
/**
* Create a log ready for submission.
* Extend this class and override
- * {@link #report(QualityQuery, TopDocs, String, Searcher)}
+ * {@link #report(QualityQuery, TopDocs, String, IndexSearcher)}
* to create different reports.
*/
public class SubmissionReport {
Modified: lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java Wed Feb 9 09:35:27 2011
@@ -29,146 +29,119 @@ import java.util.regex.Pattern;
/**
* Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body
*/
-public class ExtractReuters
-{
- private File reutersDir;
- private File outputDir;
- private static final String LINE_SEPARATOR = System.getProperty("line.separator");
-
- public ExtractReuters(File reutersDir, File outputDir)
- {
- this.reutersDir = reutersDir;
- this.outputDir = outputDir;
- System.out.println("Deleting all files in " + outputDir);
- File [] files = outputDir.listFiles();
- for (int i = 0; i < files.length; i++)
- {
- files[i].delete();
- }
-
+public class ExtractReuters {
+ private File reutersDir;
+ private File outputDir;
+ private static final String LINE_SEPARATOR = System.getProperty("line.separator");
+
+ public ExtractReuters(File reutersDir, File outputDir) {
+ this.reutersDir = reutersDir;
+ this.outputDir = outputDir;
+ System.out.println("Deleting all files in " + outputDir);
+ for (File f : outputDir.listFiles()) {
+ f.delete();
}
+ }
- public void extract()
- {
- File [] sgmFiles = reutersDir.listFiles(new FileFilter()
- {
- public boolean accept(File file)
- {
- return file.getName().endsWith(".sgm");
- }
- });
- if (sgmFiles != null && sgmFiles.length > 0)
- {
- for (int i = 0; i < sgmFiles.length; i++)
- {
- File sgmFile = sgmFiles[i];
- extractFile(sgmFile);
- }
- }
- else
- {
- System.err.println("No .sgm files in " + reutersDir);
- }
+ public void extract() {
+ File[] sgmFiles = reutersDir.listFiles(new FileFilter() {
+ public boolean accept(File file) {
+ return file.getName().endsWith(".sgm");
+ }
+ });
+ if (sgmFiles != null && sgmFiles.length > 0) {
+ for (File sgmFile : sgmFiles) {
+ extractFile(sgmFile);
+ }
+ } else {
+ System.err.println("No .sgm files in " + reutersDir);
}
+ }
- Pattern EXTRACTION_PATTERN = Pattern.compile("<TITLE>(.*?)</TITLE>|<DATE>(.*?)</DATE>|<BODY>(.*?)</BODY>");
-
- private static String[] META_CHARS
- = {"&", "<", ">", "\"", "'"};
+ Pattern EXTRACTION_PATTERN = Pattern
+ .compile("<TITLE>(.*?)</TITLE>|<DATE>(.*?)</DATE>|<BODY>(.*?)</BODY>");
- private static String[] META_CHARS_SERIALIZATIONS
- = {"&amp;", "&lt;", "&gt;", "&quot;", "&apos;"};
+ private static String[] META_CHARS = { "&", "<", ">", "\"", "'" };
- /**
- * Override if you wish to change what is extracted
- *
- * @param sgmFile
- */
- protected void extractFile(File sgmFile)
- {
- try
- {
- BufferedReader reader = new BufferedReader(new FileReader(sgmFile));
-
- StringBuilder buffer = new StringBuilder(1024);
- StringBuilder outBuffer = new StringBuilder(1024);
-
- String line = null;
- int docNumber = 0;
- while ((line = reader.readLine()) != null)
- {
- //when we see a closing reuters tag, flush the file
-
- if (line.indexOf("</REUTERS") == -1) {
- //Replace the SGM escape sequences
-
- buffer.append(line).append(' ');//accumulate the strings for now, then apply regular expression to get the pieces,
- }
- else
- {
- //Extract the relevant pieces and write to a file in the output dir
- Matcher matcher = EXTRACTION_PATTERN.matcher(buffer);
- while (matcher.find())
- {
- for (int i = 1; i <= matcher.groupCount(); i++)
- {
- if (matcher.group(i) != null)
- {
- outBuffer.append(matcher.group(i));
- }
- }
- outBuffer.append(LINE_SEPARATOR).append(LINE_SEPARATOR);
- }
- String out = outBuffer.toString();
- for (int i = 0; i < META_CHARS_SERIALIZATIONS.length; i++)
- {
- out = out.replaceAll(META_CHARS_SERIALIZATIONS[i], META_CHARS[i]);
- }
- File outFile = new File(outputDir, sgmFile.getName() + "-" + (docNumber++) + ".txt");
- //System.out.println("Writing " + outFile);
- FileWriter writer = new FileWriter(outFile);
- writer.write(out);
- writer.close();
- outBuffer.setLength(0);
- buffer.setLength(0);
- }
+ private static String[] META_CHARS_SERIALIZATIONS = { "&amp;", "&lt;",
+ "&gt;", "&quot;", "&apos;" };
+
+ /**
+ * Override if you wish to change what is extracted
+ *
+ * @param sgmFile
+ */
+ protected void extractFile(File sgmFile) {
+ try {
+ BufferedReader reader = new BufferedReader(new FileReader(sgmFile));
+
+ StringBuilder buffer = new StringBuilder(1024);
+ StringBuilder outBuffer = new StringBuilder(1024);
+
+ String line = null;
+ int docNumber = 0;
+ while ((line = reader.readLine()) != null) {
+ // when we see a closing reuters tag, flush the file
+
+ if (line.indexOf("</REUTERS") == -1) {
+ // Replace the SGM escape sequences
+
+ buffer.append(line).append(' ');// accumulate the strings for now,
+ // then apply regular expression to
+ // get the pieces,
+ } else {
+ // Extract the relevant pieces and write to a file in the output dir
+ Matcher matcher = EXTRACTION_PATTERN.matcher(buffer);
+ while (matcher.find()) {
+ for (int i = 1; i <= matcher.groupCount(); i++) {
+ if (matcher.group(i) != null) {
+ outBuffer.append(matcher.group(i));
+ }
}
- reader.close();
- }
-
- catch (
- IOException e
- )
-
- {
- throw new RuntimeException(e);
- }
+ outBuffer.append(LINE_SEPARATOR).append(LINE_SEPARATOR);
+ }
+ String out = outBuffer.toString();
+ for (int i = 0; i < META_CHARS_SERIALIZATIONS.length; i++) {
+ out = out.replaceAll(META_CHARS_SERIALIZATIONS[i], META_CHARS[i]);
+ }
+ File outFile = new File(outputDir, sgmFile.getName() + "-"
+ + (docNumber++) + ".txt");
+ // System.out.println("Writing " + outFile);
+ FileWriter writer = new FileWriter(outFile);
+ writer.write(out);
+ writer.close();
+ outBuffer.setLength(0);
+ buffer.setLength(0);
+ }
+ }
+ reader.close();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
}
+ }
-
- public static void main(String[] args)
- {
- if (args.length != 2)
- {
- printUsage();
- }
- File reutersDir = new File(args[0]);
-
- if (reutersDir.exists())
- {
- File outputDir = new File(args[1]);
- outputDir.mkdirs();
- ExtractReuters extractor = new ExtractReuters(reutersDir, outputDir);
- extractor.extract();
- }
- else
- {
- printUsage();
- }
+ public static void main(String[] args) {
+ if (args.length != 2) {
+ printUsage();
}
-
- private static void printUsage()
- {
- System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters <Path to Reuters SGM files> <Output Path>");
+ File reutersDir = new File(args[0]);
+ if (!reutersDir.exists()) {
+ printUsage();
+ return;
}
+
+ // First, extract to a tmp directory and only if everything succeeds, rename
+ // to output directory.
+ File outputDir = new File(args[1] + "-tmp");
+ outputDir.mkdirs();
+ ExtractReuters extractor = new ExtractReuters(reutersDir, outputDir);
+ extractor.extract();
+ // Now rename to requested output dir
+ outputDir.renameTo(new File(args[1]));
+ }
+
+ private static void printUsage() {
+ System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters <Path to Reuters SGM files> <Output Path>");
+ }
+
}
Modified: lucene/dev/branches/docvalues/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Wed Feb 9 09:35:27 2011
@@ -514,6 +514,7 @@ public class TestPerfTasksLogic extends
"{ [ AddDoc]: 4} : * ",
"ResetInputs ",
"{ [ AddDoc]: 4} : * ",
+ "WaitForMerges",
"CloseIndex",
};
@@ -549,6 +550,7 @@ public class TestPerfTasksLogic extends
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
+ " WaitForMerges",
" CloseIndex",
"} : 2",
};
Modified: lucene/dev/branches/docvalues/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java Wed Feb 9 09:35:27 2011
@@ -18,14 +18,20 @@ package org.apache.lucene.benchmark.byTa
*/
import java.io.BufferedReader;
+import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.text.ParseException;
+import java.util.Arrays;
import java.util.Date;
+import java.util.HashSet;
+import java.util.Properties;
+import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
public class TrecContentSourceTest extends LuceneTestCase {
@@ -329,5 +335,62 @@ public class TrecContentSourceTest exten
// Don't test that NoMoreDataException is thrown, since the forever flag is
// turned on.
}
+
+ /**
+ * Open a trec content source over a directory with files of all trec path types and all
+ * supported formats - bzip, gzip, txt.
+ */
+ public void testTrecFeedDirAllTypes() throws Exception {
+ File dataDir = _TestUtil.getTempDir("trecFeedAllTypes");
+ _TestUtil.unzip(getDataFile("trecdocs.zip"), dataDir);
+ TrecContentSource tcs = new TrecContentSource();
+ Properties props = new Properties();
+ props.setProperty("print.props", "false");
+ props.setProperty("content.source.verbose", "false");
+ props.setProperty("content.source.excludeIteration", "true");
+ props.setProperty("doc.maker.forever", "false");
+ props.setProperty("docs.dir", dataDir.getCanonicalPath().replace('\\','/'));
+ props.setProperty("trec.doc.parser", TrecParserByPath.class.getName());
+ props.setProperty("content.source.forever", "false");
+ tcs.setConfig(new Config(props));
+ tcs.resetInputs();
+ DocData dd = new DocData();
+ int n = 0;
+ boolean gotExpectedException = false;
+ HashSet<ParsePathType> unseenTypes = new HashSet<ParsePathType>(Arrays.asList(ParsePathType.values()));
+ try {
+ while (n<100) { // arbitrary limit to prevent looping forever in case of test failure
+ dd = tcs.getNextDocData(dd);
+ ++n;
+ assertNotNull("doc data "+n+" should not be null!", dd);
+ unseenTypes.remove(tcs.currPathType);
+ switch(tcs.currPathType) {
+ case GOV2:
+ assertDocData(dd, "TEST-000", "TEST-000 title", "TEST-000 text", tcs.parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+ break;
+ case FBIS:
+ assertDocData(dd, "TEST-001", "TEST-001 Title", "TEST-001 text", tcs.parseDate("1 January 1991"));
+ break;
+ case FR94:
+ // no title extraction in this source for now
+ assertDocData(dd, "TEST-002", null, "DEPARTMENT OF SOMETHING", tcs.parseDate("February 3, 1994"));
+ break;
+ case FT:
+ assertDocData(dd, "TEST-003", "Test-003 title", "Some pub text", tcs.parseDate("980424"));
+ break;
+ case LATIMES:
+ assertDocData(dd, "TEST-004", "Test-004 Title", "Some paragraph", tcs.parseDate("January 17, 1997, Sunday"));
+ break;
+ default:
+ assertTrue("Should never get here!", false);
+ }
+ }
+ } catch (NoMoreDataException e) {
+ gotExpectedException = true;
+ }
+ assertTrue("Should have gotten NoMoreDataException!", gotExpectedException);
+ assertEquals("Wrong number of documents created by source!",5,n);
+ assertTrue("Did not see all types!",unseenTypes.isEmpty());
+ }
}
Modified: lucene/dev/branches/docvalues/modules/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/build.xml?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/build.xml (original)
+++ lucene/dev/branches/docvalues/modules/build.xml Wed Feb 9 09:35:27 2011
@@ -56,6 +56,7 @@
<target name="generate-maven-artifacts" description="Generate Maven Artifacts for Modules">
<sequential>
+ <ant target="get-maven-poms" dir=".."/>
<subant target="dist-maven" inheritall="false" failonerror="true">
<fileset dir="analysis" includes="build.xml" />
<fileset dir="benchmark" includes="build.xml" />
@@ -65,6 +66,7 @@
<target name="clean" description="Clean all modules">
<sequential>
+ <delete dir="dist"/>
<subant target="clean" inheritall="false" failonerror="true">
<fileset dir="analysis" includes="build.xml" />
<fileset dir="benchmark" includes="build.xml" />