You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2012/07/15 19:38:26 UTC
svn commit: r1361743 - in /lucene/dev/branches/branch_4x: ./ dev-tools/
dev-tools/eclipse/ lucene/ lucene/benchmark/ lucene/benchmark/lib/
lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/
lucene/benchmark/src/java/org/apache/lucene/b...
Author: uschindler
Date: Sun Jul 15 17:38:25 2012
New Revision: 1361743
URL: http://svn.apache.org/viewvc?rev=1361743&view=rev
Log:
Merged revision(s) 1361741 from lucene/dev/trunk:
LUCENE-4220: Removed the buggy JavaCC-based HTML parser in the benchmark module and replaced it with NekoHTML
Added:
lucene/dev/branches/branch_4x/lucene/benchmark/lib/nekohtml-1.9.15.jar.sha1
- copied unchanged from r1361741, lucene/dev/trunk/lucene/benchmark/lib/nekohtml-1.9.15.jar.sha1
lucene/dev/branches/branch_4x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestHtmlParser.java
- copied unchanged from r1361741, lucene/dev/trunk/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestHtmlParser.java
Removed:
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBuilderReader.java
lucene/dev/branches/branch_4x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/demohtml/
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/dev-tools/ (props changed)
lucene/dev/branches/branch_4x/dev-tools/eclipse/dot.classpath
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/lucene/benchmark/ (props changed)
lucene/dev/branches/branch_4x/lucene/benchmark/build.xml
lucene/dev/branches/branch_4x/lucene/benchmark/ivy.xml
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java
lucene/dev/branches/branch_4x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java
Modified: lucene/dev/branches/branch_4x/dev-tools/eclipse/dot.classpath
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/dev-tools/eclipse/dot.classpath?rev=1361743&r1=1361742&r2=1361743&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/dev-tools/eclipse/dot.classpath (original)
+++ lucene/dev/branches/branch_4x/dev-tools/eclipse/dot.classpath Sun Jul 15 17:38:25 2012
@@ -102,6 +102,7 @@
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar"/>
<classpathentry kind="lib" path="lucene/benchmark/lib/commons-compress-1.2.jar"/>
<classpathentry kind="lib" path="lucene/benchmark/lib/xercesImpl-2.9.1.jar"/>
+ <classpathentry kind="lib" path="lucene/benchmark/lib/nekohtml-1.9.15.jar"/>
<classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
<classpathentry kind="lib" path="solr/lib/commons-cli-1.2.jar"/>
<classpathentry kind="lib" path="solr/lib/httpclient-4.1.3.jar"/>
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1361743&r1=1361742&r2=1361743&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Sun Jul 15 17:38:25 2012
@@ -28,6 +28,10 @@ API Changes
make a custom FieldType and set indexed = true, its analyzed by the analyzer.
(Robert Muir)
+* LUCENE-4220: Removed the buggy JavaCC-based HTML parser in the benchmark
+ module and replaced it with NekoHTML. The HTMLParser interface was cleaned up,
+ which changed its method signatures. (Uwe Schindler, Robert Muir)
+
Optimizations
* LUCENE-4171: Performance improvements to Packed64.
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/build.xml?rev=1361743&r1=1361742&r2=1361743&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/build.xml (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/build.xml Sun Jul 15 17:38:25 2012
@@ -155,6 +155,7 @@
<fileset dir="lib">
<include name="commons-compress-1.2.jar"/>
<include name="xercesImpl-2.9.1.jar"/>
+ <include name="nekohtml-1.9.15.jar"/>
</fileset>
</path>
<path id="run.classpath">
@@ -261,20 +262,6 @@
<target name="init" depends="module-build.init,resolve-icu,jar-memory,jar-highlighter,jar-analyzers-common,jar-queryparser,jar-facet"/>
- <target name="clean-javacc">
- <delete>
- <fileset dir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml" includes="*.java">
- <containsregexp expression="Generated.*By.*JavaCC"/>
- </fileset>
- </delete>
- </target>
-
- <target name="javacc" depends="init,javacc-check" if="javacc.present">
- <invoke-javacc target="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj"
- outputDir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml"
- />
- </target>
-
<target name="compile-test" depends="copy-alg-files-for-testing,module-build.compile-test"/>
<target name="copy-alg-files-for-testing" description="copy .alg files as resources for testing">
<copy todir="${build.dir}/classes/test/conf">
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/ivy.xml?rev=1361743&r1=1361742&r2=1361743&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/ivy.xml (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/ivy.xml Sun Jul 15 17:38:25 2012
@@ -21,6 +21,7 @@
<dependencies>
<dependency org="org.apache.commons" name="commons-compress" rev="1.2" transitive="false"/>
<dependency org="xerces" name="xercesImpl" rev="2.9.1" transitive="false"/>
+ <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.15" transitive="false"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>
</ivy-module>
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java?rev=1361743&r1=1361742&r2=1361743&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java Sun Jul 15 17:38:25 2012
@@ -19,51 +19,203 @@ package org.apache.lucene.benchmark.byTa
import java.io.IOException;
import java.io.Reader;
-import java.text.DateFormat;
-import java.text.ParseException;
+import java.io.StringReader;
+import java.util.Collections;
import java.util.Date;
+import java.util.HashSet;
+import java.util.Locale;
import java.util.Properties;
+import java.util.Set;
+
+import org.cyberneko.html.parsers.SAXParser;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
/**
- * HTML Parser that is based on Lucene's demo HTML parser.
+ * Simple HTML Parser extracting title, meta tags, and body text
+ * that is based on <a href="http://nekohtml.sourceforge.net/">NekoHTML</a>.
*/
-public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser {
+public class DemoHTMLParser implements HTMLParser {
+
+ /** The actual parser to read HTML documents */
+ public static final class Parser {
+
+ public final Properties metaTags = new Properties();
+ public final String title, body;
+
+ public Parser(Reader reader) throws IOException, SAXException {
+ this(new InputSource(reader));
+ }
+
+ public Parser(InputSource source) throws IOException, SAXException {
+ final SAXParser parser = new SAXParser();
+ parser.setFeature("http://xml.org/sax/features/namespaces", true);
+ parser.setFeature("http://cyberneko.org/html/features/balance-tags", true);
+ parser.setFeature("http://cyberneko.org/html/features/report-errors", false);
+ parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
+ parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
- public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
- org.apache.lucene.benchmark.byTask.feeds.demohtml.HTMLParser p = new org.apache.lucene.benchmark.byTask.feeds.demohtml.HTMLParser(reader);
+ final StringBuilder title = new StringBuilder(), body = new StringBuilder();
+ final DefaultHandler handler = new DefaultHandler() {
+ private int inBODY = 0, inHEAD = 0, inTITLE = 0, suppressed = 0;
+
+ @Override
+ public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
+ if (inHEAD > 0) {
+ if (equalsIgnoreTurkish("title", localName)) {
+ inTITLE++;
+ } else {
+ if (equalsIgnoreTurkish("meta", localName)) {
+ String name = atts.getValue("name");
+ if (name == null) {
+ name = atts.getValue("http-equiv");
+ }
+ final String val = atts.getValue("content");
+ if (name != null && val != null) {
+ metaTags.setProperty(name.toLowerCase(Locale.ROOT), val);
+ }
+ }
+ }
+ } else if (inBODY > 0) {
+ if (SUPPRESS_ELEMENTS.contains(localName)) {
+ suppressed++;
+ } else if (equalsIgnoreTurkish("img", localName)) {
+ // the original javacc-based parser preserved <IMG alt="..."/>
+ // attribute as body text in [] parenthesis:
+ final String alt = atts.getValue("alt");
+ if (alt != null) {
+ body.append('[').append(alt).append(']');
+ }
+ }
+ } else if (equalsIgnoreTurkish("body", localName)) {
+ inBODY++;
+ } else if (equalsIgnoreTurkish("head", localName)) {
+ inHEAD++;
+ } else if (equalsIgnoreTurkish("frameset", localName)) {
+ throw new SAXException("This parser does not support HTML framesets.");
+ }
+ }
+
+ @Override
+ public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
+ if (inBODY > 0) {
+ if (equalsIgnoreTurkish("body", localName)) {
+ inBODY--;
+ } else if (ENDLINE_ELEMENTS.contains(localName)) {
+ body.append('\n');
+ } else if (SUPPRESS_ELEMENTS.contains(localName)) {
+ suppressed--;
+ }
+ } else if (inHEAD > 0) {
+ if (equalsIgnoreTurkish("head", localName)) {
+ inHEAD--;
+ } else if (inTITLE > 0 && equalsIgnoreTurkish("title", localName)) {
+ inTITLE--;
+ }
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+ if (inBODY > 0 && suppressed == 0) {
+ body.append(ch, start, length);
+ } else if (inTITLE > 0) {
+ title.append(ch, start, length);
+ }
+ }
+
+ @Override
+ public InputSource resolveEntity(String publicId, String systemId) {
+ // disable network access caused by DTDs
+ return new InputSource(new StringReader(""));
+ }
+ };
+
+ parser.setContentHandler(handler);
+ parser.setErrorHandler(handler);
+ parser.parse(source);
+
+ // the javacc-based parser trimmed title (which should be done for HTML in all cases):
+ this.title = title.toString().trim();
+
+ // assign body text
+ this.body = body.toString();
+ }
+
+ // TODO: remove the Turkish workaround once this is fixed in NekoHTML:
+ // https://sourceforge.net/tracker/?func=detail&aid=3544334&group_id=195122&atid=952178
- // title
- if (title==null) {
- title = p.getTitle();
+ // BEGIN: workaround
+ static final String convertTurkish(String s) {
+ return s.replace('i', 'ı');
}
- // properties
- Properties props = p.getMetaTags();
- // body
- Reader r = p.getReader();
- char c[] = new char[1024];
- StringBuilder bodyBuf = new StringBuilder();
- int n;
- while ((n = r.read(c)) >= 0) {
- if (n>0) {
- bodyBuf.append(c,0,n);
+ static final boolean equalsIgnoreTurkish(String s1, String s2) {
+ final int len1 = s1.length(), len2 = s2.length();
+ if (len1 != len2)
+ return false;
+ for (int i = 0; i < len1; i++) {
+ char ch1 = s1.charAt(i), ch2 = s2.charAt(i);
+ if (ch1 == 'ı') ch1 = 'i';
+ if (ch2 == 'ı') ch2 = 'i';
+ if (ch1 != ch2)
+ return false;
}
+ return true;
}
- r.close();
- if (date == null && props.getProperty("date")!=null) {
- try {
- date = dateFormat.parse(props.getProperty("date").trim());
- } catch (ParseException e) {
- // do not fail test just because a date could not be parsed
- System.out.println("ignoring date parse exception (assigning 'now') for: "+props.getProperty("date"));
- date = new Date(); // now
+ // END: workaround
+
+ static final Set<String> createElementNameSet(String... names) {
+ final HashSet<String> set = new HashSet<String>();
+ for (final String name : names) {
+ set.add(name);
+ set.add(convertTurkish(name));
+ }
+ return Collections.unmodifiableSet(set);
+ }
+
+ /** HTML elements that cause a line break (they are block-elements) */
+ static final Set<String> ENDLINE_ELEMENTS = createElementNameSet(
+ "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
+ "pre", "hr", "blockquote", "address", "fieldset", "table", "form",
+ "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option"
+ );
+
+ /** HTML elements with contents that are ignored */
+ static final Set<String> SUPPRESS_ELEMENTS = createElementNameSet(
+ "style", "script"
+ );
+ }
+
+ @Override
+ public DocData parse(DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc) throws IOException {
+ try {
+ return parse(docData, name, date, new InputSource(reader), trecSrc);
+ } catch (SAXException saxe) {
+ throw new IOException("SAX exception occurred while parsing HTML document.", saxe);
+ }
+ }
+
+ public DocData parse(DocData docData, String name, Date date, InputSource source, TrecContentSource trecSrc) throws IOException, SAXException {
+ final Parser p = new Parser(source);
+
+ // properties
+ final Properties props = p.metaTags;
+ String dateStr = props.getProperty("date");
+ if (dateStr != null) {
+ final Date newDate = trecSrc.parseDate(dateStr);
+ if (newDate != null) {
+ date = newDate;
}
}
docData.clear();
docData.setName(name);
- docData.setBody(bodyBuf.toString());
- docData.setTitle(title);
+ docData.setBody(p.body);
+ docData.setTitle(p.title);
docData.setProps(props);
docData.setDate(date);
return docData;
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java?rev=1361743&r1=1361742&r2=1361743&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java Sun Jul 15 17:38:25 2012
@@ -19,7 +19,6 @@ package org.apache.lucene.benchmark.byTa
import java.io.IOException;
import java.io.Reader;
-import java.text.DateFormat;
import java.util.Date;
/**
@@ -34,13 +33,11 @@ public interface HTMLParser {
* @param docData result reused
* @param name name of the result doc data.
* @param date date of the result doc data. If null, attempt to set by parsed data.
- * @param title title of the result doc data. If null, attempt to set by parsed data.
* @param reader reader of html text to parse.
- * @param dateFormat date formatter to use for extracting the date.
+ * @param trecSrc the {@link TrecContentSource} used to parse dates.
* @return Parsed doc data.
* @throws IOException
- * @throws InterruptedException
*/
- public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
+ public DocData parse(DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc) throws IOException;
}
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java?rev=1361743&r1=1361742&r2=1361743&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java Sun Jul 15 17:38:25 2012
@@ -22,7 +22,6 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
-import java.io.Reader;
import java.text.DateFormat;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
@@ -33,8 +32,6 @@ import java.util.Locale;
import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
-import org.apache.lucene.benchmark.byTask.utils.StringBuilderReader;
-import org.apache.lucene.util.ThreadInterruptedException;
/**
* Implements a {@link ContentSource} over the TREC collection.
@@ -57,7 +54,7 @@ import org.apache.lucene.util.ThreadInte
*/
public class TrecContentSource extends ContentSource {
- private static final class DateFormatInfo {
+ static final class DateFormatInfo {
DateFormat[] dfs;
ParsePosition pos;
}
@@ -83,13 +80,10 @@ public class TrecContentSource extends C
};
private ThreadLocal<DateFormatInfo> dateFormats = new ThreadLocal<DateFormatInfo>();
- private ThreadLocal<StringBuilderReader> trecDocReader = new ThreadLocal<StringBuilderReader>();
private ThreadLocal<StringBuilder> trecDocBuffer = new ThreadLocal<StringBuilder>();
private File dataDir = null;
private ArrayList<File> inputFiles = new ArrayList<File>();
private int nextFile = 0;
- private int rawDocSize = 0;
-
// Use to synchronize threads on reading from the TREC documents.
private Object lock = new Object();
@@ -126,17 +120,6 @@ public class TrecContentSource extends C
return sb;
}
- Reader getTrecDocReader(StringBuilder docBuffer) {
- StringBuilderReader r = trecDocReader.get();
- if (r == null) {
- r = new StringBuilderReader(docBuffer);
- trecDocReader.set(r);
- } else {
- r.set(docBuffer);
- }
- return r;
- }
-
HTMLParser getHtmlParser() {
return htmlParser;
}
@@ -161,7 +144,7 @@ public class TrecContentSource extends C
continue;
}
- rawDocSize += line.length();
+ line.length();
if (lineStart!=null && line.startsWith(lineStart)) {
if (collectMatchLine) {
@@ -287,12 +270,8 @@ public class TrecContentSource extends C
// This code segment relies on HtmlParser being thread safe. When we get
// here, everything else is already private to that thread, so we're safe.
- try {
- docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
- addItem();
- } catch (InterruptedException ie) {
- throw new ThreadInterruptedException(ie);
- }
+ docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
+ addItem();
return docData;
}
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java?rev=1361743&r1=1361742&r2=1361743&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java Sun Jul 15 17:38:25 2012
@@ -80,7 +80,7 @@ public abstract class TrecDocParser {
* parsers to alter their behavior according to the file path type.
*/
public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException;
+ StringBuilder docBuf, ParsePathType pathType) throws IOException;
/**
* strip tags from <code>buf</code>: each tag is replaced by a single blank.
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java?rev=1361743&r1=1361742&r2=1361743&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java Sun Jul 15 17:38:25 2012
@@ -37,7 +37,7 @@ public class TrecFBISParser extends Trec
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+ StringBuilder docBuf, ParsePathType pathType) throws IOException {
int mark = 0; // that much is skipped
// optionally skip some of the text, set date, title
Date date = null;
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java?rev=1361743&r1=1361742&r2=1361743&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java Sun Jul 15 17:38:25 2012
@@ -41,7 +41,7 @@ public class TrecFR94Parser extends Trec
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+ StringBuilder docBuf, ParsePathType pathType) throws IOException {
int mark = 0; // that much is skipped
// optionally skip some of the text, set date (no title?)
Date date = null;
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java?rev=1361743&r1=1361742&r2=1361743&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java Sun Jul 15 17:38:25 2012
@@ -33,7 +33,7 @@ public class TrecFTParser extends TrecDo
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+ StringBuilder docBuf, ParsePathType pathType) throws IOException {
int mark = 0; // that much is skipped
// date...
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java?rev=1361743&r1=1361742&r2=1361743&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java Sun Jul 15 17:38:25 2012
@@ -18,7 +18,7 @@ package org.apache.lucene.benchmark.byTa
*/
import java.io.IOException;
-import java.io.Reader;
+import java.io.StringReader;
import java.util.Date;
/**
@@ -31,29 +31,24 @@ public class TrecGov2Parser extends Trec
private static final String DOCHDR = "<DOCHDR>";
private static final String TERMINATING_DOCHDR = "</DOCHDR>";
- private static final int TERMINATING_DOCHDR_LENGTH = TERMINATING_DOCHDR.length();
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
- // Set up a (per-thread) reused Reader over the read content, reset it to re-read from docBuf
- Reader r = trecSrc.getTrecDocReader(docBuf);
-
- // skip some of the text, optionally set date
+ StringBuilder docBuf, ParsePathType pathType) throws IOException {
+ // skip some of the non-html text, optionally set date
Date date = null;
- int h1 = docBuf.indexOf(DOCHDR);
- if (h1>=0) {
- int h2 = docBuf.indexOf(TERMINATING_DOCHDR,h1);
- String dateStr = extract(docBuf, DATE, DATE_END, h2, null);
+ int start = 0;
+ final int h1 = docBuf.indexOf(DOCHDR);
+ if (h1 >= 0) {
+ final int h2 = docBuf.indexOf(TERMINATING_DOCHDR, h1);
+ final String dateStr = extract(docBuf, DATE, DATE_END, h2, null);
if (dateStr != null) {
date = trecSrc.parseDate(dateStr);
}
- r.mark(h2+TERMINATING_DOCHDR_LENGTH);
+ start = h2 + TERMINATING_DOCHDR.length();
}
-
- r.reset();
- HTMLParser htmlParser = trecSrc.getHtmlParser();
- return htmlParser.parse(docData, name, date, null, r, null);
+ final String html = docBuf.substring(start);
+ return trecSrc.getHtmlParser().parse(docData, name, date, new StringReader(html), trecSrc);
}
}
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java?rev=1361743&r1=1361742&r2=1361743&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java Sun Jul 15 17:38:25 2012
@@ -36,7 +36,7 @@ public class TrecLATimesParser extends T
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+ StringBuilder docBuf, ParsePathType pathType) throws IOException {
int mark = 0; // that much is skipped
// date...
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java?rev=1361743&r1=1361742&r2=1361743&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java Sun Jul 15 17:38:25 2012
@@ -26,7 +26,7 @@ public class TrecParserByPath extends Tr
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+ StringBuilder docBuf, ParsePathType pathType) throws IOException {
return pathType2parser.get(pathType).parse(docData, name, trecSrc, docBuf, pathType);
}
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java?rev=1361743&r1=1361742&r2=1361743&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java Sun Jul 15 17:38:25 2012
@@ -166,6 +166,7 @@ public class TrecContentSourceTest exten
"<title>\r\n" +
"TEST-001 title\r\n" +
"</title>\r\n" +
+ "<meta name=\"date\" content=\"Tue, 09 Dec 2003 22:39:08 GMT\">" +
"</head>\r\n" +
"\r\n" +
"<body>\r\n" +
@@ -183,7 +184,7 @@ public class TrecContentSourceTest exten
dd = source.getNextDocData(dd);
assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source
- .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
+ .parseDate("Tue, 09 Dec 2003 22:39:08 GMT"));
assertNoMoreDataException(source);
}
@@ -331,6 +332,7 @@ public class TrecContentSourceTest exten
dd = source.getNextDocData(dd);
assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", source
.parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+ source.close();
// Don't test that NoMoreDataException is thrown, since the forever flag is
// turned on.