You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2012/07/15 19:34:48 UTC

svn commit: r1361741 - in /lucene/dev/trunk: dev-tools/eclipse/ lucene/ lucene/benchmark/ lucene/benchmark/lib/ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demoh...

Author: uschindler
Date: Sun Jul 15 17:34:47 2012
New Revision: 1361741

URL: http://svn.apache.org/viewvc?rev=1361741&view=rev
Log:
LUCENE-4220: Remove the buggy JavaCC-based HTML parser in the benchmark module and replace it with NekoHTML

Added:
    lucene/dev/trunk/lucene/benchmark/lib/nekohtml-1.9.15.jar.sha1
    lucene/dev/trunk/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestHtmlParser.java   (with props)
Removed:
    lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/
    lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBuilderReader.java
    lucene/dev/trunk/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/demohtml/
Modified:
    lucene/dev/trunk/dev-tools/eclipse/dot.classpath
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/benchmark/build.xml
    lucene/dev/trunk/lucene/benchmark/ivy.xml
    lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java
    lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java
    lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
    lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java
    lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java
    lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java
    lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java
    lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java
    lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java
    lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java
    lucene/dev/trunk/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java

Modified: lucene/dev/trunk/dev-tools/eclipse/dot.classpath
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/dev-tools/eclipse/dot.classpath?rev=1361741&r1=1361740&r2=1361741&view=diff
==============================================================================
--- lucene/dev/trunk/dev-tools/eclipse/dot.classpath (original)
+++ lucene/dev/trunk/dev-tools/eclipse/dot.classpath Sun Jul 15 17:34:47 2012
@@ -102,6 +102,7 @@
 	<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar"/>
 	<classpathentry kind="lib" path="lucene/benchmark/lib/commons-compress-1.2.jar"/>
 	<classpathentry kind="lib" path="lucene/benchmark/lib/xercesImpl-2.9.1.jar"/>
+	<classpathentry kind="lib" path="lucene/benchmark/lib/nekohtml-1.9.15.jar"/>
 	<classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
 	<classpathentry kind="lib" path="solr/lib/commons-cli-1.2.jar"/>
 	<classpathentry kind="lib" path="solr/lib/httpclient-4.1.3.jar"/>

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1361741&r1=1361740&r2=1361741&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Sun Jul 15 17:34:47 2012
@@ -29,6 +29,10 @@ API Changes
   make a custom FieldType and set indexed = true, its analyzed by the analyzer.
   (Robert Muir)
 
+* LUCENE-4220: Removed the buggy JavaCC-based HTML parser in the benchmark
+  module and replaced by NekoHTML. HTMLParser interface was cleaned up while
+  changing method signatures.  (Uwe Schindler, Robert Muir)
+
 Optimizations
 
 * LUCENE-4171: Performance improvements to Packed64.

Modified: lucene/dev/trunk/lucene/benchmark/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/benchmark/build.xml?rev=1361741&r1=1361740&r2=1361741&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/benchmark/build.xml (original)
+++ lucene/dev/trunk/lucene/benchmark/build.xml Sun Jul 15 17:34:47 2012
@@ -155,6 +155,7 @@
     	<fileset dir="lib">
     	  <include name="commons-compress-1.2.jar"/>
     	  <include name="xercesImpl-2.9.1.jar"/>
+    	  <include name="nekohtml-1.9.15.jar"/>
     	</fileset>
     </path>
     <path id="run.classpath">
@@ -261,20 +262,6 @@
 
     <target name="init" depends="module-build.init,resolve-icu,jar-memory,jar-highlighter,jar-analyzers-common,jar-queryparser,jar-facet"/>
   
-    <target name="clean-javacc">
-      <delete>
-        <fileset dir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml" includes="*.java">
-    <containsregexp expression="Generated.*By.*JavaCC"/>
-        </fileset>
-      </delete>
-    </target>
-    
-    <target name="javacc" depends="init,javacc-check" if="javacc.present">
-      <invoke-javacc target="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj"
-                     outputDir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml"
-		     />
-    </target>
-    
     <target name="compile-test" depends="copy-alg-files-for-testing,module-build.compile-test"/>
     <target name="copy-alg-files-for-testing" description="copy .alg files as resources for testing">
       <copy todir="${build.dir}/classes/test/conf">

Modified: lucene/dev/trunk/lucene/benchmark/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/benchmark/ivy.xml?rev=1361741&r1=1361740&r2=1361741&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/benchmark/ivy.xml (original)
+++ lucene/dev/trunk/lucene/benchmark/ivy.xml Sun Jul 15 17:34:47 2012
@@ -21,6 +21,7 @@
     <dependencies>
       <dependency org="org.apache.commons" name="commons-compress" rev="1.2" transitive="false"/>
       <dependency org="xerces" name="xercesImpl" rev="2.9.1" transitive="false"/>
+      <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.15" transitive="false"/>
       <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/> 
     </dependencies>
 </ivy-module>

Added: lucene/dev/trunk/lucene/benchmark/lib/nekohtml-1.9.15.jar.sha1
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/benchmark/lib/nekohtml-1.9.15.jar.sha1?rev=1361741&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/benchmark/lib/nekohtml-1.9.15.jar.sha1 (added)
+++ lucene/dev/trunk/lucene/benchmark/lib/nekohtml-1.9.15.jar.sha1 Sun Jul 15 17:34:47 2012
@@ -0,0 +1 @@
+a45cd7b7401d9c2264d4908182380452c03ebf8f

Modified: lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java?rev=1361741&r1=1361740&r2=1361741&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java (original)
+++ lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java Sun Jul 15 17:34:47 2012
@@ -19,51 +19,203 @@ package org.apache.lucene.benchmark.byTa
 
 import java.io.IOException;
 import java.io.Reader;
-import java.text.DateFormat;
-import java.text.ParseException;
+import java.io.StringReader;
+import java.util.Collections;
 import java.util.Date;
+import java.util.HashSet;
+import java.util.Locale;
 import java.util.Properties;
+import java.util.Set;
+
+import org.cyberneko.html.parsers.SAXParser;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
 
 /**
- * HTML Parser that is based on Lucene's demo HTML parser.
+ * Simple HTML Parser extracting title, meta tags, and body text
+ * that is based on <a href="http://nekohtml.sourceforge.net/">NekoHTML</a>.
  */
-public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser {
+public class DemoHTMLParser implements HTMLParser {
+  
+  /** The actual parser to read HTML documents */
+  public static final class Parser {
+    
+    public final Properties metaTags = new Properties();
+    public final String title, body;
+    
+    public Parser(Reader reader) throws IOException, SAXException {
+      this(new InputSource(reader));
+    }
+    
+    public Parser(InputSource source) throws IOException, SAXException {
+      final SAXParser parser = new SAXParser();
+      parser.setFeature("http://xml.org/sax/features/namespaces", true);
+      parser.setFeature("http://cyberneko.org/html/features/balance-tags", true);
+      parser.setFeature("http://cyberneko.org/html/features/report-errors", false);
+      parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
+      parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
 
-  public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
-    org.apache.lucene.benchmark.byTask.feeds.demohtml.HTMLParser p = new org.apache.lucene.benchmark.byTask.feeds.demohtml.HTMLParser(reader);
+      final StringBuilder title = new StringBuilder(), body = new StringBuilder();
+      final DefaultHandler handler = new DefaultHandler() {
+        private int inBODY = 0, inHEAD = 0, inTITLE = 0, suppressed = 0;
+
+        @Override
+        public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
+          if (inHEAD > 0) {
+            if (equalsIgnoreTurkish("title", localName)) {
+              inTITLE++;
+            } else {
+              if (equalsIgnoreTurkish("meta", localName)) {
+                String name = atts.getValue("name");
+                if (name == null) {
+                  name = atts.getValue("http-equiv");
+                }
+                final String val = atts.getValue("content");
+                if (name != null && val != null) {
+                  metaTags.setProperty(name.toLowerCase(Locale.ROOT), val);
+                }
+              }
+            }
+          } else if (inBODY > 0) {
+            if (SUPPRESS_ELEMENTS.contains(localName)) {
+              suppressed++;
+            } else if (equalsIgnoreTurkish("img", localName)) {
+              // the original javacc-based parser preserved <IMG alt="..."/>
+              // attribute as body text in [] parenthesis:
+              final String alt = atts.getValue("alt");
+              if (alt != null) {
+                body.append('[').append(alt).append(']');
+              }
+            }
+          } else if (equalsIgnoreTurkish("body", localName)) {
+            inBODY++;
+          } else if (equalsIgnoreTurkish("head", localName)) {
+            inHEAD++;
+          } else if (equalsIgnoreTurkish("frameset", localName)) {
+            throw new SAXException("This parser does not support HTML framesets.");
+          }
+        }
+
+        @Override
+        public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
+          if (inBODY > 0) {
+            if (equalsIgnoreTurkish("body", localName)) {
+              inBODY--;
+            } else if (ENDLINE_ELEMENTS.contains(localName)) {
+              body.append('\n');
+            } else if (SUPPRESS_ELEMENTS.contains(localName)) {
+              suppressed--;
+            }
+          } else if (inHEAD > 0) {
+            if (equalsIgnoreTurkish("head", localName)) {
+              inHEAD--;
+            } else if (inTITLE > 0 && equalsIgnoreTurkish("title", localName)) {
+              inTITLE--;
+            }
+          }
+        }
+        
+        @Override
+        public void characters(char[] ch, int start, int length) throws SAXException { 
+          if (inBODY > 0 && suppressed == 0) {
+            body.append(ch, start, length);
+          } else if (inTITLE > 0) {
+            title.append(ch, start, length);
+          }
+        }
+
+        @Override
+        public InputSource resolveEntity(String publicId, String systemId) {
+          // disable network access caused by DTDs
+          return new InputSource(new StringReader(""));
+        }
+      };
+      
+      parser.setContentHandler(handler);
+      parser.setErrorHandler(handler);
+      parser.parse(source);
+      
+      // the javacc-based parser trimmed title (which should be done for HTML in all cases):
+      this.title = title.toString().trim();
+      
+      // assign body text
+      this.body = body.toString();
+    }
+    
+    // TODO: remove the Turkish workaround once this is fixed in NekoHTML:
+    // https://sourceforge.net/tracker/?func=detail&aid=3544334&group_id=195122&atid=952178
     
-    // title
-    if (title==null) {
-      title = p.getTitle();
+    // BEGIN: workaround
+    static final String convertTurkish(String s) {
+      return s.replace('i', 'ı');
     }
     
-    // properties 
-    Properties props = p.getMetaTags(); 
-    // body
-    Reader r = p.getReader();
-    char c[] = new char[1024];
-    StringBuilder bodyBuf = new StringBuilder();
-    int n;
-    while ((n = r.read(c)) >= 0) {
-      if (n>0) {
-        bodyBuf.append(c,0,n);
+    static final boolean equalsIgnoreTurkish(String s1, String s2) {
+      final int len1 = s1.length(), len2 = s2.length();
+      if (len1 != len2)
+        return false;
+      for (int i = 0; i < len1; i++) {
+        char ch1 = s1.charAt(i), ch2 = s2.charAt(i);
+        if (ch1 == 'ı') ch1 = 'i';
+        if (ch2 == 'ı') ch2 = 'i';
+        if (ch1 != ch2)
+          return false;
       }
+      return true;
     }
-    r.close();
-    if (date == null && props.getProperty("date")!=null) {
-      try {
-        date = dateFormat.parse(props.getProperty("date").trim());
-      } catch (ParseException e) {
-        // do not fail test just because a date could not be parsed
-        System.out.println("ignoring date parse exception (assigning 'now') for: "+props.getProperty("date"));
-        date = new Date(); // now 
+    // END: workaround
+    
+    static final Set<String> createElementNameSet(String... names) {
+      final HashSet<String> set = new HashSet<String>();
+      for (final String name : names) {
+        set.add(name);
+        set.add(convertTurkish(name));
+      }
+      return Collections.unmodifiableSet(set);
+    }
+    
+    /** HTML elements that cause a line break (they are block-elements) */
+    static final Set<String> ENDLINE_ELEMENTS = createElementNameSet(
+      "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
+      "pre", "hr", "blockquote", "address", "fieldset", "table", "form",
+      "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option"
+    );
+
+    /** HTML elements with contents that are ignored */
+    static final Set<String> SUPPRESS_ELEMENTS = createElementNameSet(
+      "style", "script"
+    );
+  }
+
+  @Override
+  public DocData parse(DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc) throws IOException {
+    try {
+      return parse(docData, name, date, new InputSource(reader), trecSrc);
+    } catch (SAXException saxe) {
+      throw new IOException("SAX exception occurred while parsing HTML document.", saxe);
+    }
+  }
+  
+  public DocData parse(DocData docData, String name, Date date, InputSource source, TrecContentSource trecSrc) throws IOException, SAXException {
+    final Parser p = new Parser(source);
+    
+    // properties 
+    final Properties props = p.metaTags;
+    String dateStr = props.getProperty("date");
+    if (dateStr != null) {
+      final Date newDate = trecSrc.parseDate(dateStr);
+      if (newDate != null) {
+        date = newDate;
       }
     }
     
     docData.clear();
     docData.setName(name);
-    docData.setBody(bodyBuf.toString());
-    docData.setTitle(title);
+    docData.setBody(p.body);
+    docData.setTitle(p.title);
     docData.setProps(props);
     docData.setDate(date);
     return docData;

Modified: lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java?rev=1361741&r1=1361740&r2=1361741&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java (original)
+++ lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java Sun Jul 15 17:34:47 2012
@@ -19,7 +19,6 @@ package org.apache.lucene.benchmark.byTa
 
 import java.io.IOException;
 import java.io.Reader;
-import java.text.DateFormat;
 import java.util.Date;
 
 /**
@@ -34,13 +33,11 @@ public interface HTMLParser {
    * @param docData result reused
    * @param name name of the result doc data.
    * @param date date of the result doc data. If null, attempt to set by parsed data.
-   * @param title title of the result doc data. If null, attempt to set by parsed data.
    * @param reader reader of html text to parse.
-   * @param dateFormat date formatter to use for extracting the date.   
+   * @param trecSrc the {@link TrecContentSource} used to parse dates.   
    * @return Parsed doc data.
    * @throws IOException
-   * @throws InterruptedException
    */
-  public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
+  public DocData parse(DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc) throws IOException;
 
 }

Modified: lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java?rev=1361741&r1=1361740&r2=1361741&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java (original)
+++ lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java Sun Jul 15 17:34:47 2012
@@ -22,7 +22,6 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
-import java.io.Reader;
 import java.text.DateFormat;
 import java.text.ParsePosition;
 import java.text.SimpleDateFormat;
@@ -33,8 +32,6 @@ import java.util.Locale;
 import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
-import org.apache.lucene.benchmark.byTask.utils.StringBuilderReader;
-import org.apache.lucene.util.ThreadInterruptedException;
 
 /**
  * Implements a {@link ContentSource} over the TREC collection.
@@ -57,7 +54,7 @@ import org.apache.lucene.util.ThreadInte
  */
 public class TrecContentSource extends ContentSource {
 
-  private static final class DateFormatInfo {
+  static final class DateFormatInfo {
     DateFormat[] dfs;
     ParsePosition pos;
   }
@@ -83,13 +80,10 @@ public class TrecContentSource extends C
   };
 
   private ThreadLocal<DateFormatInfo> dateFormats = new ThreadLocal<DateFormatInfo>();
-  private ThreadLocal<StringBuilderReader> trecDocReader = new ThreadLocal<StringBuilderReader>();
   private ThreadLocal<StringBuilder> trecDocBuffer = new ThreadLocal<StringBuilder>();
   private File dataDir = null;
   private ArrayList<File> inputFiles = new ArrayList<File>();
   private int nextFile = 0;
-  private int rawDocSize = 0;
-
   // Use to synchronize threads on reading from the TREC documents.
   private Object lock = new Object();
 
@@ -126,17 +120,6 @@ public class TrecContentSource extends C
     return sb;
   }
   
-  Reader getTrecDocReader(StringBuilder docBuffer) {
-    StringBuilderReader r = trecDocReader.get();
-    if (r == null) {
-      r = new StringBuilderReader(docBuffer);
-      trecDocReader.set(r);
-    } else {
-      r.set(docBuffer);
-    }
-    return r;
-  }
-
   HTMLParser getHtmlParser() {
     return htmlParser;
   }
@@ -161,7 +144,7 @@ public class TrecContentSource extends C
         continue;
       }
 
-      rawDocSize += line.length();
+      line.length();
 
       if (lineStart!=null && line.startsWith(lineStart)) {
         if (collectMatchLine) {
@@ -287,12 +270,8 @@ public class TrecContentSource extends C
 
     // This code segment relies on HtmlParser being thread safe. When we get 
     // here, everything else is already private to that thread, so we're safe.
-    try {
-      docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
-      addItem();
-    } catch (InterruptedException ie) {
-      throw new ThreadInterruptedException(ie);
-    }
+    docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
+    addItem();
 
     return docData;
   }

Modified: lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java?rev=1361741&r1=1361740&r2=1361741&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java (original)
+++ lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java Sun Jul 15 17:34:47 2012
@@ -80,7 +80,7 @@ public abstract class TrecDocParser {
    * parsers to alter their behavior according to the file path type. 
    */  
   public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException;
+      StringBuilder docBuf, ParsePathType pathType) throws IOException;
   
   /** 
    * strip tags from <code>buf</code>: each tag is replaced by a single blank.

Modified: lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java?rev=1361741&r1=1361740&r2=1361741&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java (original)
+++ lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java Sun Jul 15 17:34:47 2012
@@ -37,7 +37,7 @@ public class TrecFBISParser extends Trec
 
   @Override
   public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+      StringBuilder docBuf, ParsePathType pathType) throws IOException {
     int mark = 0; // that much is skipped
     // optionally skip some of the text, set date, title
     Date date = null;

Modified: lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java?rev=1361741&r1=1361740&r2=1361741&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java (original)
+++ lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java Sun Jul 15 17:34:47 2012
@@ -41,7 +41,7 @@ public class TrecFR94Parser extends Trec
   
   @Override
   public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+      StringBuilder docBuf, ParsePathType pathType) throws IOException {
     int mark = 0; // that much is skipped
     // optionally skip some of the text, set date (no title?)
     Date date = null;

Modified: lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java?rev=1361741&r1=1361740&r2=1361741&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java (original)
+++ lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java Sun Jul 15 17:34:47 2012
@@ -33,7 +33,7 @@ public class TrecFTParser extends TrecDo
 
   @Override
   public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+      StringBuilder docBuf, ParsePathType pathType) throws IOException {
     int mark = 0; // that much is skipped
 
     // date...

Modified: lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java?rev=1361741&r1=1361740&r2=1361741&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java (original)
+++ lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java Sun Jul 15 17:34:47 2012
@@ -18,7 +18,7 @@ package org.apache.lucene.benchmark.byTa
  */
 
 import java.io.IOException;
-import java.io.Reader;
+import java.io.StringReader;
 import java.util.Date;
 
 /**
@@ -31,29 +31,24 @@ public class TrecGov2Parser extends Trec
   
   private static final String DOCHDR = "<DOCHDR>";
   private static final String TERMINATING_DOCHDR = "</DOCHDR>";
-  private static final int TERMINATING_DOCHDR_LENGTH = TERMINATING_DOCHDR.length();
 
   @Override
   public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
-    // Set up a (per-thread) reused Reader over the read content, reset it to re-read from docBuf
-    Reader r = trecSrc.getTrecDocReader(docBuf);
-
-    // skip some of the text, optionally set date
+      StringBuilder docBuf, ParsePathType pathType) throws IOException {
+    // skip some of the non-html text, optionally set date
     Date date = null;
-    int h1 = docBuf.indexOf(DOCHDR);
-    if (h1>=0) {
-      int h2 = docBuf.indexOf(TERMINATING_DOCHDR,h1);
-      String dateStr = extract(docBuf, DATE, DATE_END, h2, null);
+    int start = 0;
+    final int h1 = docBuf.indexOf(DOCHDR);
+    if (h1 >= 0) {
+      final int h2 = docBuf.indexOf(TERMINATING_DOCHDR, h1);
+      final String dateStr = extract(docBuf, DATE, DATE_END, h2, null);
       if (dateStr != null) {
         date = trecSrc.parseDate(dateStr);
       }
-      r.mark(h2+TERMINATING_DOCHDR_LENGTH);
+      start = h2 + TERMINATING_DOCHDR.length();
     }
-
-    r.reset();
-    HTMLParser htmlParser = trecSrc.getHtmlParser();
-    return htmlParser.parse(docData, name, date, null, r, null);
+    final String html = docBuf.substring(start);
+    return trecSrc.getHtmlParser().parse(docData, name, date, new StringReader(html), trecSrc);
   }
   
 }

Modified: lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java?rev=1361741&r1=1361740&r2=1361741&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java (original)
+++ lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java Sun Jul 15 17:34:47 2012
@@ -36,7 +36,7 @@ public class TrecLATimesParser extends T
   
   @Override
   public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+      StringBuilder docBuf, ParsePathType pathType) throws IOException {
     int mark = 0; // that much is skipped
 
     // date...

Modified: lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java?rev=1361741&r1=1361740&r2=1361741&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java (original)
+++ lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java Sun Jul 15 17:34:47 2012
@@ -26,7 +26,7 @@ public class TrecParserByPath extends Tr
 
   @Override
   public DocData parse(DocData docData, String name, TrecContentSource trecSrc, 
-      StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+      StringBuilder docBuf, ParsePathType pathType) throws IOException {
     return pathType2parser.get(pathType).parse(docData, name, trecSrc, docBuf, pathType);
   }
 

Added: lucene/dev/trunk/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestHtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestHtmlParser.java?rev=1361741&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestHtmlParser.java (added)
+++ lucene/dev/trunk/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestHtmlParser.java Sun Jul 15 17:34:47 2012
@@ -0,0 +1,142 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+import java.util.Locale;
+import java.util.Properties;
+
+import org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser.Parser;
+import org.apache.lucene.util.LuceneTestCase;
+/** Tests for {@link DemoHTMLParser.Parser}: each case parses an in-memory HTML string and checks the extracted title, body or meta tags. */
+public class TestHtmlParser extends LuceneTestCase {
+  /** Non-ASCII body text given as literal characters must come back unchanged. */
+  public void testUnicode() throws Exception {
+    String text = "<html><body>汉语</body></html>";
+    Parser parser = new Parser(new StringReader(text));
+    assertEquals("汉语", parser.body);
+  }
+  /** Hex character references and the named entity {@code &yen;} must be decoded in the body. */
+  public void testEntities() throws Exception {
+    String text = "<html><body>&#x6C49;&#x8BED;&yen;</body></html>";
+    Parser parser = new Parser(new StringReader(text));
+    assertEquals("汉语¥", parser.body);
+  }
+  /** Text inside comments, including the {@code <! ... -->} variant, must not appear in the body. */
+  public void testComments() throws Exception {
+    String text = "<html><body>foo<!-- bar --><! baz --></body></html>";
+    Parser parser = new Parser(new StringReader(text));
+    assertEquals("foo", parser.body);
+  }
+  /** Script element content must be dropped; only the text following the script remains in the body. */
+  public void testScript() throws Exception {
+    String text = "<html><body><script type=\"text/javascript\">" +
+                  "document.write(\"test\")</script>foo</body></html>"; 
+    Parser parser = new Parser(new StringReader(text));
+    assertEquals("foo", parser.body);
+  }
+  /** Style sheet content placed in the head must not leak into the body. */
+  public void testStyle() throws Exception {
+    String text = "<html><head><style type=\"text/css\">" +
+                  "body{background-color:blue;}</style>" +
+                  "</head><body>foo</body></html>";
+    Parser parser = new Parser(new StringReader(text));
+    assertEquals("foo", parser.body);
+  }
+  /** A leading DOCTYPE declaration must not affect the extracted body. */
+  public void testDoctype() throws Exception {
+    String text = "<!DOCTYPE HTML PUBLIC " + 
+    "\"-//W3C//DTD HTML 4.01 Transitional//EN\"" +
+    "\"http://www.w3.org/TR/html4/loose.dtd\">" +
+    "<html><body>foo</body></html>";
+    Parser parser = new Parser(new StringReader(text));
+    assertEquals("foo", parser.body);
+  }
+  /** Both name= and http-equiv= meta tags are expected in metaTags; the key looked up for "Content-Type" is lower-case. */
+  public void testMeta() throws Exception {
+    String text = "<html><head>" +
+    "<meta name=\"a\" content=\"1\" />" +
+    "<meta name=\"b\" content=\"2\" />" +
+    "<meta name=\"keywords\" content=\"this is a test\" />" +
+    "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\" />" +
+    "</head><body>foobar</body></html>";
+    Parser parser = new Parser(new StringReader(text));
+    Properties tags = parser.metaTags;
+    assertEquals(4, tags.size());
+    assertEquals("1", tags.get("a"));
+    assertEquals("2", tags.get("b"));
+    assertEquals("this is a test", tags.get("keywords"));
+    assertEquals("text/html;charset=UTF-8", tags.get("content-type"));
+  }
+  /** An upper-case TITLE element must be recognized. NOTE(review): the input has a second {@code <head>} where {@code </head>} would be expected - confirm it is intentional. */
+  public void testTitle() throws Exception {
+    String text = "<html><head><TITLE>foo</TITLE><head><body>bar</body></html>";
+    Parser parser = new Parser(new StringReader(text));
+    assertEquals("foo", parser.title);
+  }
+  /** Parses upper-case tags and attributes while the default locale is tr_TR (restored in finally); the body is expected to be exactly the IMG ALT text in brackets. */
+  // LUCENE-2246
+  public void testTurkish() throws Exception {
+    final Locale saved = Locale.getDefault();
+    try {
+      Locale.setDefault(new Locale("tr", "TR"));
+      String text = "<html><HEAD><TITLE>ııı</TITLE></head><body>" +
+      "<IMG SRC=\"../images/head.jpg\" WIDTH=570 HEIGHT=47 BORDER=0 ALT=\"ş\">" +
+      "<a title=\"(ııı)\"></body></html>";
+      Parser parser = new Parser(new StringReader(text));
+      assertEquals("ııı", parser.title);
+      assertEquals("[ş]", parser.body);
+    } finally {
+      Locale.setDefault(saved);
+    }
+  }
+  /** CRLF-separated document with no closing html tag: the title is compared as-is, the body after trim(). */
+  public void testSampleTRECDoc() throws Exception {
+    String text = "<html>\r\n" + 
+        "\r\n" + 
+        "<head>\r\n" + 
+        "<title>\r\n" + 
+        "TEST-000 title\r\n" + 
+        "</title>\r\n" + 
+        "</head>\r\n" + 
+        "\r\n" + 
+        "<body>\r\n" + 
+        "TEST-000 text\r\n" + 
+        "\r\n" + 
+        "</body>\r\n" + 
+        "\r\n";
+    Parser parser = new Parser(new StringReader(text));
+    assertEquals("TEST-000 title", parser.title);
+    assertEquals("TEST-000 text", parser.body.trim());
+  }
+  /** Plain text without any markup must yield an empty title and the text itself as body. */
+  public void testNoHTML() throws Exception {
+    String text = "hallo";
+    Parser parser = new Parser(new StringReader(text));
+    assertEquals("", parser.title);
+    assertEquals("hallo", parser.body);
+  }
+  /** Markup without html/head/body wrappers must still yield title and body. NOTE(review): the method name is presumably a typo for "testInvalid". */
+  public void testivalid() throws Exception {
+    String text = "<title>foo</title>bar";
+    Parser parser = new Parser(new StringReader(text));
+    assertEquals("foo", parser.title);
+    assertEquals("bar", parser.body);
+  }
+  
+}

Modified: lucene/dev/trunk/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java?rev=1361741&r1=1361740&r2=1361741&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java (original)
+++ lucene/dev/trunk/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java Sun Jul 15 17:34:47 2012
@@ -166,6 +166,7 @@ public class TrecContentSourceTest exten
                   "<title>\r\n" + 
                   "TEST-001 title\r\n" + 
                   "</title>\r\n" + 
+                  "<meta name=\"date\" content=\"Tue&#44; 09 Dec 2003 22&#58;39&#58;08 GMT\">" +
                   "</head>\r\n" + 
                   "\r\n" + 
                   "<body>\r\n" + 
@@ -183,7 +184,7 @@ public class TrecContentSourceTest exten
     
     dd = source.getNextDocData(dd);
     assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source
-        .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
+        .parseDate("Tue, 09 Dec 2003 22:39:08 GMT"));
     
     assertNoMoreDataException(source);
   }
@@ -331,6 +332,7 @@ public class TrecContentSourceTest exten
     dd = source.getNextDocData(dd);
     assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", source
         .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+    source.close();
 
     // Don't test that NoMoreDataException is thrown, since the forever flag is
     // turned on.