You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/07/19 17:59:32 UTC

svn commit: r1363400 [6/31] - in /lucene/dev/branches/pforcodec_3892: ./ dev-tools/ dev-tools/eclipse/ dev-tools/idea/.idea/ dev-tools/idea/.idea/copyright/ dev-tools/idea/.idea/libraries/ dev-tools/idea/lucene/ dev-tools/maven/ dev-tools/maven/lucene/...

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java Thu Jul 19 15:58:54 2012
@@ -1,4 +1,3 @@
-// -*- c-basic-offset: 2 -*-
 package org.apache.lucene.analysis.morfologik;
 
 /*
@@ -20,10 +19,9 @@ package org.apache.lucene.analysis.morfo
 
 import java.io.IOException;
 import java.io.StringReader;
+import java.util.TreeSet;
 
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.*;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 /**
@@ -39,8 +37,8 @@ public class TestMorfologikAnalyzer exte
   public final void testSingleTokens() throws IOException {
     Analyzer a = getTestAnalyzer();
     assertAnalyzesToReuse(a, "a", new String[] { "a" });
-    assertAnalyzesToReuse(a, "liście", new String[] { "liść", "list", "lista", });
-    assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dane", "dać" });
+    assertAnalyzesToReuse(a, "liście", new String[] { "liście", "liść", "list", "lista" });
+    assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dana", "dane", "dać" });
     assertAnalyzesToReuse(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
   }
 
@@ -50,10 +48,10 @@ public class TestMorfologikAnalyzer exte
     assertAnalyzesToReuse(
       a,
       "liście danych",
-      new String[] { "liść", "list", "lista", "dany", "dane", "dać" },
-      new int[] { 0, 0, 0, 7, 7, 7 },
-      new int[] { 6, 6, 6, 13, 13, 13 },
-      new int[] { 1, 0, 0, 1, 0, 0 });
+      new String[] { "liście", "liść", "list", "lista", "dany", "dana", "dane", "dać" },
+      new int[] { 0, 0, 0, 0, 7, 7, 7, 7 },
+      new int[] { 6, 6, 6, 6, 13, 13, 13, 13 },
+      new int[] { 1, 0, 0, 0, 1, 0, 0, 0 });
   }
 
   /** Test reuse of MorfologikFilter with leftover stems. */
@@ -63,7 +61,7 @@ public class TestMorfologikAnalyzer exte
     CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
     ts_1.reset();
     ts_1.incrementToken();
-    assertEquals("first stream", "liść", termAtt_1.toString());
+    assertEquals("first stream", "liście", termAtt_1.toString());
 
     TokenStream ts_2 = a.tokenStream("dummy", new StringReader("danych"));
     CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
@@ -76,35 +74,63 @@ public class TestMorfologikAnalyzer exte
   public final void testCase() throws IOException {
     Analyzer a = getTestAnalyzer();
 
-    assertAnalyzesToReuse(a, "AGD",      new String[] { "artykuły gospodarstwa domowego" });
+    assertAnalyzesToReuse(a, "AGD",      new String[] { "AGD", "artykuły gospodarstwa domowego" });
     assertAnalyzesToReuse(a, "agd",      new String[] { "artykuły gospodarstwa domowego" });
 
     assertAnalyzesToReuse(a, "Poznania", new String[] { "Poznań" });
-    assertAnalyzesToReuse(a, "poznania", new String[] { "poznać" });
+    assertAnalyzesToReuse(a, "poznania", new String[] { "poznanie", "poznać" });
 
     assertAnalyzesToReuse(a, "Aarona",   new String[] { "Aaron" });
     assertAnalyzesToReuse(a, "aarona",   new String[] { "aarona" });
 
-    assertAnalyzesToReuse(a, "Liście",   new String[] { "liść", "list", "lista" });
+    assertAnalyzesToReuse(a, "Liście",   new String[] { "liście", "liść", "list", "lista" });
   }
 
-  private void assertPOSToken(TokenStream ts, String term, String pos) throws IOException {
+  private void assertPOSToken(TokenStream ts, String term, String... tags) throws IOException {
     ts.incrementToken();
     assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString());
-    assertEquals(pos,  ts.getAttribute(MorphosyntacticTagAttribute.class).getTag().toString());
+    
+    TreeSet<String> actual = new TreeSet<String>();
+    TreeSet<String> expected = new TreeSet<String>();
+    for (StringBuilder b : ts.getAttribute(MorphosyntacticTagsAttribute.class).getTags()) {
+      actual.add(b.toString());
+    }
+    for (String s : tags) {
+      expected.add(s);
+    }
+    
+    if (!expected.equals(actual)) {
+      System.out.println("Expected:\n" + expected);
+      System.out.println("Actual:\n" + actual);
+      assertEquals(expected, actual);
+    }
   }
 
   /** Test morphosyntactic annotations. */
   public final void testPOSAttribute() throws IOException {
     TokenStream ts = getTestAnalyzer().tokenStream("dummy", new StringReader("liście"));
 
-    assertPOSToken(ts, "liść",  "subst:pl:acc.nom.voc:m3");
-    assertPOSToken(ts, "list",  "subst:sg:loc.voc:m3");
-    assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
+    assertPOSToken(ts, "liście",  
+        "subst:sg:acc:n2",
+        "subst:sg:nom:n2",
+        "subst:sg:voc:n2");
+
+    assertPOSToken(ts, "liść",  
+        "subst:pl:acc:m3",
+        "subst:pl:nom:m3",
+        "subst:pl:voc:m3");
+
+    assertPOSToken(ts, "list",  
+        "subst:sg:loc:m3",
+        "subst:sg:voc:m3");
+
+    assertPOSToken(ts, "lista", 
+        "subst:sg:dat:f",
+        "subst:sg:loc:f");
   }
-  
+
   /** blast some random strings through the analyzer */
   public void testRandom() throws Exception {
-    checkRandomData(random(), getTestAnalyzer(), 10000 * RANDOM_MULTIPLIER); 
+    checkRandomData(random(), getTestAnalyzer(), 1000 * RANDOM_MULTIPLIER); 
   }
 }

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java Thu Jul 19 15:58:54 2012
@@ -62,9 +62,13 @@ public final class BeiderMorseFilter ext
   private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   
-  /** 
-   * Calls {@link #BeiderMorseFilter(TokenStream, PhoneticEngine, Languages.LanguageSet)
-   *        BeiderMorseFilter(input, engine, null)}
+  
+  /**
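+   * Calls
+   * {@link #BeiderMorseFilter(TokenStream, PhoneticEngine, org.apache.commons.codec.language.bm.Languages.LanguageSet)}
+   * with {@code null} as the language set.
+   * @param input the token stream passed through to that constructor
+   * @param engine the phonetic engine passed through to that constructor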
    */
   public BeiderMorseFilter(TokenStream input, PhoneticEngine engine) {
     this(input, engine, null);

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilter.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilter.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilter.java Thu Jul 19 15:58:54 2012
@@ -27,9 +27,8 @@ import java.io.IOException;
 
 /**
  * Create tokens for phonetic matches.
- * @see <a href="
- * http://commons.apache.org/codec/api-release/org/apache/commons/codec/language/package-summary.html
- * ">Apache Commons Codec</a>
+ * @see <a href="http://commons.apache.org/codec/api-release/org/apache/commons/codec/language/package-summary.html">
+ * Apache Commons Codec</a>
  */
 public final class PhoneticFilter extends TokenFilter 
 {

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java Thu Jul 19 15:58:54 2012
@@ -66,12 +66,9 @@ public class AnalyzerProfile {
 
     if (ANALYSIS_DATA_DIR.length() == 0) {
       // Dictionary directory cannot be found.
-      System.err
-          .println("WARNING: Can not find lexical dictionary directory!");
-      System.err
-          .println("WARNING: This will cause unpredictable exceptions in your application!");
-      System.err
-          .println("WARNING: Please refer to the manual to download the dictionaries.");
+      throw new RuntimeException("WARNING: Can not find lexical dictionary directory!"
+       + " This will cause unpredictable exceptions in your application!"
+       + " Please refer to the manual to download the dictionaries.");
     }
 
   }

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java Thu Jul 19 15:58:54 2012
@@ -117,13 +117,13 @@ public final class SentenceTokenizer ext
   }
 
   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
+  public void setReader(Reader input) throws IOException {
+    super.setReader(input);
     reset();
   }
 
   @Override
-  public void end() throws IOException {
+  public void end() {
     // set final offset
     final int finalOffset = correctOffset(tokenEnd);
     offsetAtt.setOffset(finalOffset, finalOffset);

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java Thu Jul 19 15:58:54 2012
@@ -118,9 +118,8 @@ abstract class AbstractDictionary {
       											// Therefore, each code page only has 16*6-2=94 characters.
       return (short) (b0 * 94 + b1);
     } catch (UnsupportedEncodingException e) {
-      e.printStackTrace();
+      throw new RuntimeException(e);
     }
-    return -1;
   }
 
   /**

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java Thu Jul 19 15:58:54 2012
@@ -79,14 +79,9 @@ class BigramDictionary extends AbstractD
     try {
       loadFromInputStream(new FileInputStream(serialObj));
       return true;
-    } catch (FileNotFoundException e) {
-      e.printStackTrace();
-    } catch (IOException e) {
-      e.printStackTrace();
-    } catch (ClassNotFoundException e) {
-      e.printStackTrace();
+    } catch (Exception e) {
+      throw new RuntimeException(e);
     }
-    return false;
   }
 
   private void loadFromInputStream(InputStream serialObjectInputStream)
@@ -148,8 +143,7 @@ class BigramDictionary extends AbstractD
    * @throws IOException
    * @throws UnsupportedEncodingException
    */
-  public void loadFromFile(String dctFilePath) throws FileNotFoundException,
-      IOException, UnsupportedEncodingException {
+  public void loadFromFile(String dctFilePath) throws IOException {
 
     int i, cnt, length, total = 0;
     // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.  

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java Thu Jul 19 15:58:54 2012
@@ -145,14 +145,9 @@ class WordDictionary extends AbstractDic
     try {
       loadFromObjectInputStream(new FileInputStream(serialObj));
       return true;
-    } catch (FileNotFoundException e) {
-      e.printStackTrace();
-    } catch (IOException e) {
-      e.printStackTrace();
-    } catch (ClassNotFoundException e) {
-      e.printStackTrace();
+    } catch (Exception e) {
+      throw new RuntimeException(e);
     }
-    return false;
   }
 
   private void loadFromObjectInputStream(InputStream serialObjectInputStream)
@@ -190,8 +185,7 @@ class WordDictionary extends AbstractDic
    * @throws IOException
    * @throws UnsupportedEncodingException
    */
-  private int loadMainDataFromFile(String dctFilePath)
-      throws FileNotFoundException, IOException, UnsupportedEncodingException {
+  private int loadMainDataFromFile(String dctFilePath) throws IOException {
     int i, cnt, length, total = 0;
     // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
     // The 3756th is used (as a header) to store information.

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java Thu Jul 19 15:58:54 2012
@@ -224,13 +224,13 @@ public class TestSmartChineseAnalyzer ex
   
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
   
   /** blast some random large strings through the analyzer */
   public void testRandomHugeStrings() throws Exception {
     Random random = random();
-    checkRandomData(random, new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
+    checkRandomData(random, new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
   }
   
   public void testEmptyTerm() throws IOException {

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelStemmer.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelStemmer.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelStemmer.java Thu Jul 19 15:58:54 2012
@@ -65,7 +65,7 @@ public class StempelStemmer {
     DataInputStream in = null;
     try {
       in = new DataInputStream(new BufferedInputStream(stemmerTable));
-      String method = in.readUTF().toUpperCase(Locale.ENGLISH);
+      String method = in.readUTF().toUpperCase(Locale.ROOT);
       if (method.indexOf('M') < 0) {
         return new org.egothor.stemmer.Trie(in);
       } else {

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Compile.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Compile.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Compile.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Compile.java Thu Jul 19 15:58:54 2012
@@ -63,6 +63,7 @@ import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
+import java.util.Locale;
 import java.util.StringTokenizer;
 
 /**
@@ -84,12 +85,12 @@ public class Compile {
    * 
    * @param args the command line arguments
    */
-  public static void main(java.lang.String[] args) {
+  public static void main(java.lang.String[] args) throws Exception {
     if (args.length < 1) {
       return;
     }
     
-    args[0].toUpperCase();
+    args[0].toUpperCase(Locale.ROOT);
     
     backward = args[0].charAt(0) == '-';
     int qq = (backward) ? 1 : 0;
@@ -116,82 +117,75 @@ public class Compile {
       LineNumberReader in;
       // System.out.println("[" + args[i] + "]");
       Diff diff = new Diff();
-      try {
-        int stems = 0;
-        int words = 0;
-        
-        allocTrie();
-        
-        System.out.println(args[i]);
-        in = new LineNumberReader(new BufferedReader(new InputStreamReader(
-            new FileInputStream(args[i]), charset)));
-        for (String line = in.readLine(); line != null; line = in.readLine()) {
-          try {
-            line = line.toLowerCase();
-            StringTokenizer st = new StringTokenizer(line);
-            String stem = st.nextToken();
-            if (storeorig) {
-              trie.add(stem, "-a");
+      int stems = 0;
+      int words = 0;
+      
+      allocTrie();
+      
+      System.out.println(args[i]);
+      in = new LineNumberReader(new BufferedReader(new InputStreamReader(
+          new FileInputStream(args[i]), charset)));
+      for (String line = in.readLine(); line != null; line = in.readLine()) {
+        try {
+          line = line.toLowerCase(Locale.ROOT);
+          StringTokenizer st = new StringTokenizer(line);
+          String stem = st.nextToken();
+          if (storeorig) {
+            trie.add(stem, "-a");
+            words++;
+          }
+          while (st.hasMoreTokens()) {
+            String token = st.nextToken();
+            if (token.equals(stem) == false) {
+              trie.add(token, diff.exec(token, stem));
               words++;
             }
-            while (st.hasMoreTokens()) {
-              String token = st.nextToken();
-              if (token.equals(stem) == false) {
-                trie.add(token, diff.exec(token, stem));
-                words++;
-              }
-            }
-          } catch (java.util.NoSuchElementException x) {
-            // no base token (stem) on a line
           }
+        } catch (java.util.NoSuchElementException x) {
+          // no base token (stem) on a line
         }
-        
-        Optimizer o = new Optimizer();
-        Optimizer2 o2 = new Optimizer2();
-        Lift l = new Lift(true);
-        Lift e = new Lift(false);
-        Gener g = new Gener();
-        
-        for (int j = 0; j < optimizer.length; j++) {
-          String prefix;
-          switch (optimizer[j]) {
-            case 'G':
-              trie = trie.reduce(g);
-              prefix = "G: ";
-              break;
-            case 'L':
-              trie = trie.reduce(l);
-              prefix = "L: ";
-              break;
-            case 'E':
-              trie = trie.reduce(e);
-              prefix = "E: ";
-              break;
-            case '2':
-              trie = trie.reduce(o2);
-              prefix = "2: ";
-              break;
-            case '1':
-              trie = trie.reduce(o);
-              prefix = "1: ";
-              break;
-            default:
-              continue;
-          }
-          trie.printInfo(prefix + " ");
+      }
+      
+      Optimizer o = new Optimizer();
+      Optimizer2 o2 = new Optimizer2();
+      Lift l = new Lift(true);
+      Lift e = new Lift(false);
+      Gener g = new Gener();
+      
+      for (int j = 0; j < optimizer.length; j++) {
+        String prefix;
+        switch (optimizer[j]) {
+          case 'G':
+            trie = trie.reduce(g);
+            prefix = "G: ";
+            break;
+          case 'L':
+            trie = trie.reduce(l);
+            prefix = "L: ";
+            break;
+          case 'E':
+            trie = trie.reduce(e);
+            prefix = "E: ";
+            break;
+          case '2':
+            trie = trie.reduce(o2);
+            prefix = "2: ";
+            break;
+          case '1':
+            trie = trie.reduce(o);
+            prefix = "1: ";
+            break;
+          default:
+            continue;
         }
-               
-        DataOutputStream os = new DataOutputStream(new BufferedOutputStream(
-            new FileOutputStream(args[i] + ".out")));
-        os.writeUTF(args[0]);
-        trie.store(os);
-        os.close();
-        
-      } catch (FileNotFoundException x) {
-        x.printStackTrace();
-      } catch (IOException x) {
-        x.printStackTrace();
+        trie.printInfo(System.out, prefix + " ");
       }
+      
+      DataOutputStream os = new DataOutputStream(new BufferedOutputStream(
+          new FileOutputStream(args[i] + ".out")));
+      os.writeUTF(args[0]);
+      trie.store(os);
+      os.close();
     }
   }
   

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/DiffIt.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/DiffIt.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/DiffIt.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/DiffIt.java Thu Jul 19 15:58:54 2012
@@ -55,9 +55,10 @@
 package org.egothor.stemmer;
 
 import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.IOException;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
 import java.io.LineNumberReader;
+import java.util.Locale;
 import java.util.StringTokenizer;
 
 /**
@@ -83,7 +84,7 @@ public class DiffIt {
    * 
    * @param args the path to a file containing a stemmer table
    */
-  public static void main(java.lang.String[] args) {
+  public static void main(java.lang.String[] args) throws Exception {
     
     int ins = get(0, args[0]);
     int del = get(1, args[0]);
@@ -94,27 +95,23 @@ public class DiffIt {
       LineNumberReader in;
       // System.out.println("[" + args[i] + "]");
       Diff diff = new Diff(ins, del, rep, nop);
-      try {
-        in = new LineNumberReader(new BufferedReader(new FileReader(args[i])));
-        for (String line = in.readLine(); line != null; line = in.readLine()) {
-          try {
-            line = line.toLowerCase();
-            StringTokenizer st = new StringTokenizer(line);
-            String stem = st.nextToken();
-            System.out.println(stem + " -a");
-            while (st.hasMoreTokens()) {
-              String token = st.nextToken();
-              if (token.equals(stem) == false) {
-                System.out.println(stem + " " + diff.exec(token, stem));
-              }
+      String charset = System.getProperty("egothor.stemmer.charset", "UTF-8");
+      in = new LineNumberReader(new BufferedReader(new InputStreamReader(new FileInputStream(args[i]), charset)));
+      for (String line = in.readLine(); line != null; line = in.readLine()) {
+        try {
+          line = line.toLowerCase(Locale.ROOT);
+          StringTokenizer st = new StringTokenizer(line);
+          String stem = st.nextToken();
+          System.out.println(stem + " -a");
+          while (st.hasMoreTokens()) {
+            String token = st.nextToken();
+            if (token.equals(stem) == false) {
+              System.out.println(stem + " " + diff.exec(token, stem));
             }
-          } catch (java.util.NoSuchElementException x) {
-            // no base token (stem) on a line
           }
+        } catch (java.util.NoSuchElementException x) {
+          // no base token (stem) on a line
         }
-        
-      } catch (IOException x) {
-        x.printStackTrace();
       }
     }
   }

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/MultiTrie.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/MultiTrie.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/MultiTrie.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/MultiTrie.java Thu Jul 19 15:58:54 2012
@@ -57,6 +57,7 @@ package org.egothor.stemmer;
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
+import java.io.PrintStream;
 import java.util.ArrayList;
 import java.util.List;
 
@@ -200,9 +201,9 @@ public class MultiTrie extends Trie {
    * @param prefix the desired prefix
    */
   @Override
-  public void printInfo(CharSequence prefix) {
+  public void printInfo(PrintStream out, CharSequence prefix) {
     int c = 0;
     for (Trie trie : tries)
-      trie.printInfo(prefix + "[" + (++c) + "] ");
+      trie.printInfo(out, prefix + "[" + (++c) + "] ");
   }
 }

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Row.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Row.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Row.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Row.java Thu Jul 19 15:58:54 2012
@@ -57,6 +57,7 @@ package org.egothor.stemmer;
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
+import java.io.PrintStream;
 import java.util.Iterator;
 import java.util.TreeMap;
 
@@ -292,15 +293,15 @@ public class Row {
   }
   
   /**
-   * Write the contents of this Row to stdout.
+   * Write the contents of this Row to the given print stream.
    */
-  public void print() {
+  public void print(PrintStream out) {
     for (Iterator<Character> i = cells.keySet().iterator(); i.hasNext();) {
       Character ch = i.next();
       Cell c = at(ch);
-      System.out.print("[" + ch + ":" + c + "]");
+      out.print("[" + ch + ":" + c + "]");
     }
-    System.out.println();
+    out.println();
   }
   
   Cell at(Character index) {

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Trie.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Trie.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Trie.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Trie.java Thu Jul 19 15:58:54 2012
@@ -57,6 +57,7 @@ package org.egothor.stemmer;
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
+import java.io.PrintStream;
 import java.util.ArrayList;
 import java.util.List;
 
@@ -375,8 +376,8 @@ public class Trie {
     return by.optimize(this);
   }
   
-  public void printInfo(CharSequence prefix) {
-    System.out.println(prefix + "nds " + rows.size() + " cmds " + cmds.size()
+  public void printInfo(PrintStream out, CharSequence prefix) {
+    out.println(prefix + "nds " + rows.size() + " cmds " + cmds.size()
         + " cells " + getCells() + " valcells " + getCellsVal() + " pntcells "
         + getCellsPnt());
   }

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java Thu Jul 19 15:58:54 2012
@@ -51,6 +51,6 @@ public class TestPolishAnalyzer extends 
   
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new PolishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new PolishAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/test/org/egothor/stemmer/TestCompile.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/test/org/egothor/stemmer/TestCompile.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/test/org/egothor/stemmer/TestCompile.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/test/org/egothor/stemmer/TestCompile.java Thu Jul 19 15:58:54 2012
@@ -60,12 +60,14 @@ import java.io.BufferedReader;
 import java.io.DataInputStream;
 import java.io.File;
 import java.io.FileInputStream;
-import java.io.FileReader;
 import java.io.IOException;
+import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.net.URI;
+import java.util.Locale;
 import java.util.StringTokenizer;
 
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 
 public class TestCompile extends LuceneTestCase {
@@ -107,7 +109,7 @@ public class TestCompile extends LuceneT
     Trie trie;
     DataInputStream is = new DataInputStream(new BufferedInputStream(
         new FileInputStream(path)));
-    String method = is.readUTF().toUpperCase();
+    String method = is.readUTF().toUpperCase(Locale.ROOT);
     if (method.indexOf('M') < 0) {
       trie = new Trie(is);
     } else {
@@ -120,11 +122,11 @@ public class TestCompile extends LuceneT
   private static void assertTrie(Trie trie, String file, boolean usefull,
       boolean storeorig) throws Exception {
     LineNumberReader in = new LineNumberReader(new BufferedReader(
-        new FileReader(file)));
+        new InputStreamReader(new FileInputStream(file), IOUtils.CHARSET_UTF_8)));
     
     for (String line = in.readLine(); line != null; line = in.readLine()) {
       try {
-        line = line.toLowerCase();
+        line = line.toLowerCase(Locale.ROOT);
         StringTokenizer st = new StringTokenizer(line);
         String stem = st.nextToken();
         if (storeorig) {
@@ -132,7 +134,7 @@ public class TestCompile extends LuceneT
               .getLastOnPath(stem);
           StringBuilder stm = new StringBuilder(stem);
           Diff.apply(stm, cmd);
-          assertEquals(stem.toLowerCase(), stm.toString().toLowerCase());
+          assertEquals(stem.toLowerCase(Locale.ROOT), stm.toString().toLowerCase(Locale.ROOT));
         }
         while (st.hasMoreTokens()) {
           String token = st.nextToken();
@@ -143,7 +145,7 @@ public class TestCompile extends LuceneT
               .getLastOnPath(token);
           StringBuilder stm = new StringBuilder(token);
           Diff.apply(stm, cmd);
-          assertEquals(stem.toLowerCase(), stm.toString().toLowerCase());
+          assertEquals(stem.toLowerCase(Locale.ROOT), stm.toString().toLowerCase(Locale.ROOT));
         }
       } catch (java.util.NoSuchElementException x) {
         // no base token (stem) on a line

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java Thu Jul 19 15:58:54 2012
@@ -80,8 +80,8 @@ public abstract class BaseUIMATokenizer 
   }
 
   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
+  public void setReader(Reader input) throws IOException {
+    super.setReader(input);
     iterator = null;
   }
 

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java Thu Jul 19 15:58:54 2012
@@ -30,7 +30,6 @@ import org.apache.lucene.search.MatchAll
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.Version;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -65,7 +64,7 @@ public class UIMABaseAnalyzerTest extend
   @Test
   public void baseUIMAAnalyzerIntegrationTest() throws Exception {
     Directory dir = new RAMDirectory();
-    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
+    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
     // add the first doc
     Document doc = new Document();
     String dummyTitle = "this is a dummy title ";
@@ -78,7 +77,7 @@ public class UIMABaseAnalyzerTest extend
     // try the search over the first doc
     DirectoryReader directoryReader = DirectoryReader.open(dir);
     IndexSearcher indexSearcher = new IndexSearcher(directoryReader);
-    TopDocs result = indexSearcher.search(new MatchAllDocsQuery(), 10);
+    TopDocs result = indexSearcher.search(new MatchAllDocsQuery(), 1);
     assertTrue(result.totalHits > 0);
     Document d = indexSearcher.doc(result.scoreDocs[0].doc);
     assertNotNull(d);
@@ -99,7 +98,7 @@ public class UIMABaseAnalyzerTest extend
     directoryReader.close();
     directoryReader = DirectoryReader.open(dir);
     indexSearcher = new IndexSearcher(directoryReader);
-    result = indexSearcher.search(new MatchAllDocsQuery(), 10);
+    result = indexSearcher.search(new MatchAllDocsQuery(), 2);
     Document d1 = indexSearcher.doc(result.scoreDocs[1].doc);
     assertNotNull(d1);
     assertNotNull(d1.getField("title"));
@@ -109,7 +108,7 @@ public class UIMABaseAnalyzerTest extend
 
     // do a matchalldocs query to retrieve both docs
     indexSearcher = new IndexSearcher(directoryReader);
-    result = indexSearcher.search(new MatchAllDocsQuery(), 10);
+    result = indexSearcher.search(new MatchAllDocsQuery(), 2);
     assertEquals(2, result.totalHits);
     writer.close();
     indexSearcher.getIndexReader().close();
@@ -119,7 +118,7 @@ public class UIMABaseAnalyzerTest extend
   @Test
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), new UIMABaseAnalyzer("/uima/TestAggregateSentenceAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation"),
-        1000 * RANDOM_MULTIPLIER);
+        100 * RANDOM_MULTIPLIER);
   }
 
 }

Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java Thu Jul 19 15:58:54 2012
@@ -61,7 +61,7 @@ public class UIMATypeAwareAnalyzerTest e
   @Test
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), new UIMATypeAwareAnalyzer("/uima/TestAggregateSentenceAE.xml",
-        "org.apache.lucene.uima.ts.TokenAnnotation", "pos"), 1000 * RANDOM_MULTIPLIER);
+        "org.apache.lucene.uima.ts.TokenAnnotation", "pos"), 100 * RANDOM_MULTIPLIER);
   }
 
 }

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/build.xml?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/build.xml (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/build.xml Thu Jul 19 15:58:54 2012
@@ -155,6 +155,7 @@
     	<fileset dir="lib">
     	  <include name="commons-compress-1.2.jar"/>
     	  <include name="xercesImpl-2.9.1.jar"/>
+    	  <include name="nekohtml-1.9.15.jar"/>
     	</fileset>
     </path>
     <path id="run.classpath">
@@ -261,18 +262,6 @@
 
     <target name="init" depends="module-build.init,resolve-icu,jar-memory,jar-highlighter,jar-analyzers-common,jar-queryparser,jar-facet"/>
   
-    <target name="clean-javacc">
-      <fileset dir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml" includes="*.java">
-	<containsregexp expression="Generated.*By.*JavaCC"/>
-      </fileset>
-    </target>
-    
-    <target name="javacc" depends="init,javacc-check" if="javacc.present">
-      <invoke-javacc target="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj"
-                     outputDir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml"
-		     />
-    </target>
-    
     <target name="compile-test" depends="copy-alg-files-for-testing,module-build.compile-test"/>
     <target name="copy-alg-files-for-testing" description="copy .alg files as resources for testing">
       <copy todir="${build.dir}/classes/test/conf">

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/ivy.xml?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/ivy.xml (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/ivy.xml Thu Jul 19 15:58:54 2012
@@ -21,6 +21,7 @@
     <dependencies>
       <dependency org="org.apache.commons" name="commons-compress" rev="1.2" transitive="false"/>
       <dependency org="xerces" name="xercesImpl" rev="2.9.1" transitive="false"/>
+      <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.15" transitive="false"/>
       <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/> 
     </dependencies>
 </ivy-module>

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java Thu Jul 19 15:58:54 2012
@@ -23,6 +23,7 @@ import java.io.Reader;
 
 import org.apache.lucene.benchmark.byTask.utils.Algorithm;
 import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.util.IOUtils;
 
 
 /**
@@ -106,7 +107,7 @@ public class Benchmark {
     
     Benchmark benchmark = null;
     try {
-      benchmark = new Benchmark(new FileReader(algFile));
+      benchmark = new Benchmark(IOUtils.getDecodingReader(algFile, IOUtils.CHARSET_UTF_8));
     } catch (Exception e) {
       e.printStackTrace();
       System.exit(1);

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java Thu Jul 19 15:58:54 2012
@@ -19,51 +19,203 @@ package org.apache.lucene.benchmark.byTa
 
 import java.io.IOException;
 import java.io.Reader;
-import java.text.DateFormat;
-import java.text.ParseException;
+import java.io.StringReader;
+import java.util.Collections;
 import java.util.Date;
+import java.util.HashSet;
+import java.util.Locale;
 import java.util.Properties;
+import java.util.Set;
+
+import org.cyberneko.html.parsers.SAXParser;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
 
 /**
- * HTML Parser that is based on Lucene's demo HTML parser.
+ * Simple HTML Parser extracting title, meta tags, and body text
+ * that is based on <a href="http://nekohtml.sourceforge.net/">NekoHTML</a>.
  */
-public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser {
+public class DemoHTMLParser implements HTMLParser {
+  
+  /** The actual parser to read HTML documents */
+  public static final class Parser {
+    
+    public final Properties metaTags = new Properties();
+    public final String title, body;
+    
+    public Parser(Reader reader) throws IOException, SAXException {
+      this(new InputSource(reader));
+    }
+    
+    public Parser(InputSource source) throws IOException, SAXException {
+      final SAXParser parser = new SAXParser();
+      parser.setFeature("http://xml.org/sax/features/namespaces", true);
+      parser.setFeature("http://cyberneko.org/html/features/balance-tags", true);
+      parser.setFeature("http://cyberneko.org/html/features/report-errors", false);
+      parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
+      parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
 
-  public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
-    org.apache.lucene.benchmark.byTask.feeds.demohtml.HTMLParser p = new org.apache.lucene.benchmark.byTask.feeds.demohtml.HTMLParser(reader);
+      final StringBuilder title = new StringBuilder(), body = new StringBuilder();
+      final DefaultHandler handler = new DefaultHandler() {
+        private int inBODY = 0, inHEAD = 0, inTITLE = 0, suppressed = 0;
+
+        @Override
+        public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
+          if (inHEAD > 0) {
+            if (equalsIgnoreTurkish("title", localName)) {
+              inTITLE++;
+            } else {
+              if (equalsIgnoreTurkish("meta", localName)) {
+                String name = atts.getValue("name");
+                if (name == null) {
+                  name = atts.getValue("http-equiv");
+                }
+                final String val = atts.getValue("content");
+                if (name != null && val != null) {
+                  metaTags.setProperty(name.toLowerCase(Locale.ROOT), val);
+                }
+              }
+            }
+          } else if (inBODY > 0) {
+            if (SUPPRESS_ELEMENTS.contains(localName)) {
+              suppressed++;
+            } else if (equalsIgnoreTurkish("img", localName)) {
+              // the original javacc-based parser preserved the alt attribute of
+              // <IMG alt="..."/> as body text in square brackets:
+              final String alt = atts.getValue("alt");
+              if (alt != null) {
+                body.append('[').append(alt).append(']');
+              }
+            }
+          } else if (equalsIgnoreTurkish("body", localName)) {
+            inBODY++;
+          } else if (equalsIgnoreTurkish("head", localName)) {
+            inHEAD++;
+          } else if (equalsIgnoreTurkish("frameset", localName)) {
+            throw new SAXException("This parser does not support HTML framesets.");
+          }
+        }
+
+        @Override
+        public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
+          if (inBODY > 0) {
+            if (equalsIgnoreTurkish("body", localName)) {
+              inBODY--;
+            } else if (ENDLINE_ELEMENTS.contains(localName)) {
+              body.append('\n');
+            } else if (SUPPRESS_ELEMENTS.contains(localName)) {
+              suppressed--;
+            }
+          } else if (inHEAD > 0) {
+            if (equalsIgnoreTurkish("head", localName)) {
+              inHEAD--;
+            } else if (inTITLE > 0 && equalsIgnoreTurkish("title", localName)) {
+              inTITLE--;
+            }
+          }
+        }
+        
+        @Override
+        public void characters(char[] ch, int start, int length) throws SAXException { 
+          if (inBODY > 0 && suppressed == 0) {
+            body.append(ch, start, length);
+          } else if (inTITLE > 0) {
+            title.append(ch, start, length);
+          }
+        }
+
+        @Override
+        public InputSource resolveEntity(String publicId, String systemId) {
+          // disable network access caused by DTDs
+          return new InputSource(new StringReader(""));
+        }
+      };
+      
+      parser.setContentHandler(handler);
+      parser.setErrorHandler(handler);
+      parser.parse(source);
+      
+      // the javacc-based parser trimmed title (which should be done for HTML in all cases):
+      this.title = title.toString().trim();
+      
+      // assign body text
+      this.body = body.toString();
+    }
+    
+    // TODO: remove the Turkish workaround once this is fixed in NekoHTML:
+    // https://sourceforge.net/tracker/?func=detail&aid=3544334&group_id=195122&atid=952178
     
-    // title
-    if (title==null) {
-      title = p.getTitle();
+    // BEGIN: workaround
+    static final String convertTurkish(String s) {
+      return s.replace('i', 'ı');
     }
     
-    // properties 
-    Properties props = p.getMetaTags(); 
-    // body
-    Reader r = p.getReader();
-    char c[] = new char[1024];
-    StringBuilder bodyBuf = new StringBuilder();
-    int n;
-    while ((n = r.read(c)) >= 0) {
-      if (n>0) {
-        bodyBuf.append(c,0,n);
+    static final boolean equalsIgnoreTurkish(String s1, String s2) {
+      final int len1 = s1.length(), len2 = s2.length();
+      if (len1 != len2)
+        return false;
+      for (int i = 0; i < len1; i++) {
+        char ch1 = s1.charAt(i), ch2 = s2.charAt(i);
+        if (ch1 == 'ı') ch1 = 'i';
+        if (ch2 == 'ı') ch2 = 'i';
+        if (ch1 != ch2)
+          return false;
       }
+      return true;
     }
-    r.close();
-    if (date == null && props.getProperty("date")!=null) {
-      try {
-        date = dateFormat.parse(props.getProperty("date").trim());
-      } catch (ParseException e) {
-        // do not fail test just because a date could not be parsed
-        System.out.println("ignoring date parse exception (assigning 'now') for: "+props.getProperty("date"));
-        date = new Date(); // now 
+    // END: workaround
+    
+    static final Set<String> createElementNameSet(String... names) {
+      final HashSet<String> set = new HashSet<String>();
+      for (final String name : names) {
+        set.add(name);
+        set.add(convertTurkish(name));
+      }
+      return Collections.unmodifiableSet(set);
+    }
+    
+    /** HTML elements that cause a line break (they are block-elements) */
+    static final Set<String> ENDLINE_ELEMENTS = createElementNameSet(
+      "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
+      "pre", "hr", "blockquote", "address", "fieldset", "table", "form",
+      "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option"
+    );
+
+    /** HTML elements with contents that are ignored */
+    static final Set<String> SUPPRESS_ELEMENTS = createElementNameSet(
+      "style", "script"
+    );
+  }
+
+  @Override
+  public DocData parse(DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc) throws IOException {
+    try {
+      return parse(docData, name, date, new InputSource(reader), trecSrc);
+    } catch (SAXException saxe) {
+      throw new IOException("SAX exception occurred while parsing HTML document.", saxe);
+    }
+  }
+  
+  public DocData parse(DocData docData, String name, Date date, InputSource source, TrecContentSource trecSrc) throws IOException, SAXException {
+    final Parser p = new Parser(source);
+    
+    // properties 
+    final Properties props = p.metaTags;
+    String dateStr = props.getProperty("date");
+    if (dateStr != null) {
+      final Date newDate = trecSrc.parseDate(dateStr);
+      if (newDate != null) {
+        date = newDate;
       }
     }
     
     docData.clear();
     docData.setName(name);
-    docData.setBody(bodyBuf.toString());
-    docData.setTitle(title);
+    docData.setBody(p.body);
+    docData.setTitle(p.title);
     docData.setProps(props);
     docData.setDate(date);
     return docData;

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java Thu Jul 19 15:58:54 2012
@@ -18,12 +18,14 @@ package org.apache.lucene.benchmark.byTa
  */
 
 import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.util.IOUtils;
 
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileFilter;
-import java.io.FileReader;
+import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStreamReader;
 import java.text.DateFormat;
 import java.text.ParsePosition;
 import java.text.SimpleDateFormat;
@@ -161,7 +163,7 @@ public class DirContentSource extends Co
       dfi = new DateFormatInfo();
       dfi.pos = new ParsePosition(0);
       // date format: 30-MAR-1987 14:22:36.87
-      dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS", Locale.US);
+      dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS", Locale.ROOT);
       dfi.df.setLenient(true);
       dateFormat.set(dfi);
     }
@@ -198,7 +200,7 @@ public class DirContentSource extends Co
       name = f.getCanonicalPath()+"_"+iteration;
     }
     
-    BufferedReader reader = new BufferedReader(new FileReader(f));
+    BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8));
     String line = null;
     //First line is the date, 3rd is the title, rest is body
     String dateStr = reader.readLine();

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java Thu Jul 19 15:58:54 2012
@@ -29,6 +29,7 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.Properties;
 import java.util.Random;
+import java.util.TimeZone;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.lucene.benchmark.byTask.utils.Config;
@@ -182,8 +183,8 @@ public class DocMaker implements Closeab
   private boolean storeBytes = false;
 
   private static class DateUtil {
-    public SimpleDateFormat parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.US);
-    public Calendar cal = Calendar.getInstance();
+    public SimpleDateFormat parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.ROOT);
+    public Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
     public ParsePosition pos = new ParsePosition(0);
     public DateUtil() {
       parser.setLenient(true);

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java Thu Jul 19 15:58:54 2012
@@ -25,6 +25,7 @@ import java.io.InputStreamReader;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
 import java.util.HashMap;
+import java.util.Locale;
 import java.util.Map;
 
 import org.apache.lucene.benchmark.byTask.utils.Config;
@@ -146,7 +147,7 @@ public class EnwikiContentSource extends
         case BODY:
           body = contents.toString();
           //workaround that startswith doesn't have an ignore case option, get at least 20 chars.
-          String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
+          String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(Locale.ROOT);
           if (startsWith.startsWith("#redirect")) {
             body = null;
           }

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java Thu Jul 19 15:58:54 2012
@@ -5,6 +5,7 @@ import org.apache.lucene.queryparser.cla
 import org.apache.lucene.queryparser.classic.QueryParser;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.benchmark.byTask.tasks.NewAnalyzerTask;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 
 import java.io.*;
@@ -59,13 +60,14 @@ public class FileBasedQueryMaker extends
     {
       File file = new File(fileName);
       Reader reader = null;
+      // note: we use a decoding reader, so badly encoded (non-UTF-8) queries are reported instead of silently garbled
       if (file.exists()) {
-        reader = new FileReader(file);
+        reader = IOUtils.getDecodingReader(file, IOUtils.CHARSET_UTF_8);
       } else {
         //see if we can find it as a resource
         InputStream asStream = FileBasedQueryMaker.class.getClassLoader().getResourceAsStream(fileName);
         if (asStream != null) {
-          reader = new InputStreamReader(asStream);
+          reader = IOUtils.getDecodingReader(asStream, IOUtils.CHARSET_UTF_8);
         }
       }
       if (reader != null) {

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java Thu Jul 19 15:58:54 2012
@@ -19,7 +19,6 @@ package org.apache.lucene.benchmark.byTa
 
 import java.io.IOException;
 import java.io.Reader;
-import java.text.DateFormat;
 import java.util.Date;
 
 /**
@@ -34,13 +33,11 @@ public interface HTMLParser {
    * @param docData result reused
    * @param name name of the result doc data.
    * @param date date of the result doc data. If null, attempt to set by parsed data.
-   * @param title title of the result doc data. If null, attempt to set by parsed data.
    * @param reader reader of html text to parse.
-   * @param dateFormat date formatter to use for extracting the date.   
+   * @param trecSrc the {@link TrecContentSource} used to parse dates.   
    * @return Parsed doc data.
    * @throws IOException
-   * @throws InterruptedException
    */
-  public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
+  public DocData parse(DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc) throws IOException;
 
 }

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishContentSource.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishContentSource.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishContentSource.java Thu Jul 19 15:58:54 2012
@@ -35,7 +35,7 @@ public class LongToEnglishContentSource 
   }
 
   // TODO: we could take param to specify locale...
-  private final RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ENGLISH,
+  private final RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ROOT,
                                                                        RuleBasedNumberFormat.SPELLOUT);
   @Override
   public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishQueryMaker.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishQueryMaker.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishQueryMaker.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishQueryMaker.java Thu Jul 19 15:58:54 2012
@@ -37,7 +37,7 @@ public class LongToEnglishQueryMaker imp
   protected QueryParser parser;
 
   // TODO: we could take param to specify locale...
-  private final RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ENGLISH,
+  private final RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ROOT,
                                                                        RuleBasedNumberFormat.SPELLOUT);
 
   public Query makeQuery(int size) throws Exception {

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java Thu Jul 19 15:58:54 2012
@@ -19,8 +19,9 @@ package org.apache.lucene.benchmark.byTa
 
 import java.io.BufferedReader;
 import java.io.File;
-import java.io.FileReader;
+import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStreamReader;
 import java.text.DateFormat;
 import java.text.ParsePosition;
 import java.text.SimpleDateFormat;
@@ -29,6 +30,7 @@ import java.util.Date;
 import java.util.Locale;
 
 import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.util.IOUtils;
 
 /**
  * A {@link ContentSource} reading from the Reuters collection.
@@ -74,7 +76,7 @@ public class ReutersContentSource extend
     if (dfi == null) {
       dfi = new DateFormatInfo();
       // date format: 30-MAR-1987 14:22:36.87
-      dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US);
+      dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.ROOT);
       dfi.df.setLenient(true);
       dfi.pos = new ParsePosition(0);
       dateFormat.set(dfi);
@@ -112,7 +114,7 @@ public class ReutersContentSource extend
       name = f.getCanonicalPath() + "_" + iteration;
     }
 
-    BufferedReader reader = new BufferedReader(new FileReader(f));
+    BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8));
     try {
       // First line is the date, 3rd is the title, rest is body
       String dateStr = reader.readLine();

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java Thu Jul 19 15:58:54 2012
@@ -22,7 +22,6 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
-import java.io.Reader;
 import java.text.DateFormat;
 import java.text.ParsePosition;
 import java.text.SimpleDateFormat;
@@ -33,8 +32,6 @@ import java.util.Locale;
 import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
-import org.apache.lucene.benchmark.byTask.utils.StringBuilderReader;
-import org.apache.lucene.util.ThreadInterruptedException;
 
 /**
  * Implements a {@link ContentSource} over the TREC collection.
@@ -57,7 +54,7 @@ import org.apache.lucene.util.ThreadInte
  */
 public class TrecContentSource extends ContentSource {
 
-  private static final class DateFormatInfo {
+  static final class DateFormatInfo {
     DateFormat[] dfs;
     ParsePosition pos;
   }
@@ -83,13 +80,10 @@ public class TrecContentSource extends C
   };
 
   private ThreadLocal<DateFormatInfo> dateFormats = new ThreadLocal<DateFormatInfo>();
-  private ThreadLocal<StringBuilderReader> trecDocReader = new ThreadLocal<StringBuilderReader>();
   private ThreadLocal<StringBuilder> trecDocBuffer = new ThreadLocal<StringBuilder>();
   private File dataDir = null;
   private ArrayList<File> inputFiles = new ArrayList<File>();
   private int nextFile = 0;
-  private int rawDocSize = 0;
-
   // Use to synchronize threads on reading from the TREC documents.
   private Object lock = new Object();
 
@@ -108,7 +102,7 @@ public class TrecContentSource extends C
       dfi = new DateFormatInfo();
       dfi.dfs = new SimpleDateFormat[DATE_FORMATS.length];
       for (int i = 0; i < dfi.dfs.length; i++) {
-        dfi.dfs[i] = new SimpleDateFormat(DATE_FORMATS[i], Locale.US);
+        dfi.dfs[i] = new SimpleDateFormat(DATE_FORMATS[i], Locale.ROOT);
         dfi.dfs[i].setLenient(true);
       }
       dfi.pos = new ParsePosition(0);
@@ -126,17 +120,6 @@ public class TrecContentSource extends C
     return sb;
   }
   
-  Reader getTrecDocReader(StringBuilder docBuffer) {
-    StringBuilderReader r = trecDocReader.get();
-    if (r == null) {
-      r = new StringBuilderReader(docBuffer);
-      trecDocReader.set(r);
-    } else {
-      r.set(docBuffer);
-    }
-    return r;
-  }
-
   HTMLParser getHtmlParser() {
     return htmlParser;
   }
@@ -161,7 +144,7 @@ public class TrecContentSource extends C
         continue;
       }
 
-      rawDocSize += line.length();
+      line.length();
 
       if (lineStart!=null && line.startsWith(lineStart)) {
         if (collectMatchLine) {
@@ -287,12 +270,8 @@ public class TrecContentSource extends C
 
     // This code segment relies on HtmlParser being thread safe. When we get 
     // here, everything else is already private to that thread, so we're safe.
-    try {
-      docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
-      addItem();
-    } catch (InterruptedException ie) {
-      throw new ThreadInterruptedException(ie);
-    }
+    docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
+    addItem();
 
     return docData;
   }

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java Thu Jul 19 15:58:54 2012
@@ -47,7 +47,7 @@ public abstract class TrecDocParser {
   static final Map<String,ParsePathType> pathName2Type = new HashMap<String,ParsePathType>();
   static {
     for (ParsePathType ppt : ParsePathType.values()) {
-      pathName2Type.put(ppt.name().toUpperCase(Locale.ENGLISH),ppt);
+      pathName2Type.put(ppt.name().toUpperCase(Locale.ROOT),ppt);
     }
   }
   
@@ -60,7 +60,7 @@ public abstract class TrecDocParser {
   public static ParsePathType pathType(File f) {
     int pathLength = 0;
     while (f != null && ++pathLength < MAX_PATH_LENGTH) {
-      ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase(Locale.ENGLISH));
+      ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase(Locale.ROOT));
       if (ppt!=null) {
         return ppt;
       }
@@ -80,7 +80,7 @@ public abstract class TrecDocParser {
    * parsers to alter their behavior according to the file path type. 
    */  
   public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException;
+      StringBuilder docBuf, ParsePathType pathType) throws IOException;
   
   /** 
    * strip tags from <code>buf</code>: each tag is replaced by a single blank.

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java Thu Jul 19 15:58:54 2012
@@ -37,7 +37,7 @@ public class TrecFBISParser extends Trec
 
   @Override
   public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+      StringBuilder docBuf, ParsePathType pathType) throws IOException {
     int mark = 0; // that much is skipped
     // optionally skip some of the text, set date, title
     Date date = null;

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java Thu Jul 19 15:58:54 2012
@@ -41,7 +41,7 @@ public class TrecFR94Parser extends Trec
   
   @Override
   public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+      StringBuilder docBuf, ParsePathType pathType) throws IOException {
     int mark = 0; // that much is skipped
     // optionally skip some of the text, set date (no title?)
     Date date = null;

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java Thu Jul 19 15:58:54 2012
@@ -33,7 +33,7 @@ public class TrecFTParser extends TrecDo
 
   @Override
   public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+      StringBuilder docBuf, ParsePathType pathType) throws IOException {
     int mark = 0; // that much is skipped
 
     // date...

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java Thu Jul 19 15:58:54 2012
@@ -18,7 +18,7 @@ package org.apache.lucene.benchmark.byTa
  */
 
 import java.io.IOException;
-import java.io.Reader;
+import java.io.StringReader;
 import java.util.Date;
 
 /**
@@ -31,29 +31,24 @@ public class TrecGov2Parser extends Trec
   
   private static final String DOCHDR = "<DOCHDR>";
   private static final String TERMINATING_DOCHDR = "</DOCHDR>";
-  private static final int TERMINATING_DOCHDR_LENGTH = TERMINATING_DOCHDR.length();
 
   @Override
   public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
-    // Set up a (per-thread) reused Reader over the read content, reset it to re-read from docBuf
-    Reader r = trecSrc.getTrecDocReader(docBuf);
-
-    // skip some of the text, optionally set date
+      StringBuilder docBuf, ParsePathType pathType) throws IOException {
+    // skip some of the non-html text, optionally set date
     Date date = null;
-    int h1 = docBuf.indexOf(DOCHDR);
-    if (h1>=0) {
-      int h2 = docBuf.indexOf(TERMINATING_DOCHDR,h1);
-      String dateStr = extract(docBuf, DATE, DATE_END, h2, null);
+    int start = 0;
+    final int h1 = docBuf.indexOf(DOCHDR);
+    if (h1 >= 0) {
+      final int h2 = docBuf.indexOf(TERMINATING_DOCHDR, h1);
+      final String dateStr = extract(docBuf, DATE, DATE_END, h2, null);
       if (dateStr != null) {
         date = trecSrc.parseDate(dateStr);
       }
-      r.mark(h2+TERMINATING_DOCHDR_LENGTH);
+      start = h2 + TERMINATING_DOCHDR.length();
     }
-
-    r.reset();
-    HTMLParser htmlParser = trecSrc.getHtmlParser();
-    return htmlParser.parse(docData, name, date, null, r, null);
+    final String html = docBuf.substring(start);
+    return trecSrc.getHtmlParser().parse(docData, name, date, new StringReader(html), trecSrc);
   }
   
 }

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java Thu Jul 19 15:58:54 2012
@@ -36,7 +36,7 @@ public class TrecLATimesParser extends T
   
   @Override
   public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+      StringBuilder docBuf, ParsePathType pathType) throws IOException {
     int mark = 0; // that much is skipped
 
     // date...

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java Thu Jul 19 15:58:54 2012
@@ -26,7 +26,7 @@ public class TrecParserByPath extends Tr
 
   @Override
   public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+      StringBuilder docBuf, ParsePathType pathType) throws IOException {
     return pathType2parser.get(pathType).parse(docData, name, trecSrc, docBuf, pathType);
   }
 

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java Thu Jul 19 15:58:54 2012
@@ -18,6 +18,7 @@ package org.apache.lucene.benchmark.byTa
  */
 
 import java.text.NumberFormat;
+import java.util.Locale;
 
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
@@ -61,7 +62,7 @@ public class AddDocTask extends PerfTask
 
   @Override
   protected String getLogMessage(int recsCount) {
-    return String.format("added %9d docs",recsCount);
+    return String.format(Locale.ROOT, "added %9d docs",recsCount);
   }
   
   @Override

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java Thu Jul 19 15:58:54 2012
@@ -20,7 +20,6 @@ package org.apache.lucene.benchmark.byTa
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.codecs.Codec;
-import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.IndexCommit;
 import org.apache.lucene.index.IndexDeletionPolicy;
 import org.apache.lucene.index.IndexWriter;
@@ -34,7 +33,6 @@ import org.apache.lucene.index.NoDeletio
 import org.apache.lucene.index.NoMergePolicy;
 import org.apache.lucene.index.NoMergeScheduler;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
-import org.apache.lucene.store.LockObtainFailedException;
 import org.apache.lucene.util.Version;
 
 import java.io.BufferedOutputStream;
@@ -42,6 +40,7 @@ import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.PrintStream;
+import java.nio.charset.Charset;
 
 /**
  * Create an index. <br>
@@ -174,7 +173,7 @@ public class CreateIndexTask extends Per
     return iwConf;
   }
   
-  public static IndexWriter configureWriter(Config config, PerfRunData runData, OpenMode mode, IndexCommit commit) throws CorruptIndexException, LockObtainFailedException, IOException {
+  public static IndexWriter configureWriter(Config config, PerfRunData runData, OpenMode mode, IndexCommit commit) throws IOException {
     IndexWriterConfig iwc = createWriterConfig(config, runData, mode, commit);
     String infoStreamVal = config.get("writer.info.stream", null);
     if (infoStreamVal != null) {
@@ -184,7 +183,7 @@ public class CreateIndexTask extends Per
         iwc.setInfoStream(System.err);
       } else {
         File f = new File(infoStreamVal).getAbsoluteFile();
-        iwc.setInfoStream(new PrintStream(new BufferedOutputStream(new FileOutputStream(f))));
+        iwc.setInfoStream(new PrintStream(new BufferedOutputStream(new FileOutputStream(f)), false, Charset.defaultCharset().name()));
       }
     }
     IndexWriter writer = new IndexWriter(runData.getDirectory(), iwc);

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java Thu Jul 19 15:58:54 2012
@@ -17,6 +17,8 @@ package org.apache.lucene.benchmark.byTa
  * limitations under the License.
  */
 
+import java.util.Locale;
+
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.stats.Points;
 import org.apache.lucene.benchmark.byTask.stats.TaskStats;
@@ -266,7 +268,7 @@ public abstract class PerfTask implement
   public void tearDown() throws Exception {
     if (++logStepCount % logStep == 0) {
       double time = (System.currentTimeMillis() - runData.getStartTimeMillis()) / 1000.0;
-      System.out.println(String.format("%7.2f",time) + " sec --> "
+      System.out.println(String.format(Locale.ROOT, "%7.2f",time) + " sec --> "
           + Thread.currentThread().getName() + " " + getLogMessage(logStepCount));
     }
   }

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java Thu Jul 19 15:58:54 2012
@@ -77,7 +77,7 @@ public class SearchWithSortTask extends 
         } else {
           throw new RuntimeException("You must specify the sort type ie page:int,subject:string");
         }
-        sortField0 = new SortField(fieldName, SortField.Type.valueOf(typeString.toUpperCase(Locale.ENGLISH)));
+        sortField0 = new SortField(fieldName, SortField.Type.valueOf(typeString.toUpperCase(Locale.ROOT)));
       }
       sortFields[upto++] = sortField0;
     }

Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java Thu Jul 19 15:58:54 2012
@@ -19,6 +19,7 @@ package org.apache.lucene.benchmark.byTa
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Locale;
 import java.text.NumberFormat;
 
 import org.apache.lucene.benchmark.byTask.PerfRunData;
@@ -428,7 +429,7 @@ public class TaskSequence extends PerfTa
     sb.append(padd);
     sb.append(!letChildReport ? ">" : (parallel ? "]" : "}"));
     if (fixedTime) {
-      sb.append(" " + NumberFormat.getNumberInstance().format(runTimeSec) + "s");
+      sb.append(" " + NumberFormat.getNumberInstance(Locale.ROOT).format(runTimeSec) + "s");
     } else if (repetitions>1) {
       sb.append(" * " + repetitions);
     } else if (repetitions==REPEAT_EXHAUST) {
@@ -487,7 +488,7 @@ public class TaskSequence extends PerfTa
     if (rate>0) {
       seqName += "_" + rate + (perMin?"/min":"/sec"); 
     }
-    if (parallel && seqName.toLowerCase().indexOf("par")<0) {
+    if (parallel && seqName.toLowerCase(Locale.ROOT).indexOf("par")<0) {
       seqName += "_Par";
     }
   }