You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/04/06 21:13:08 UTC
svn commit: r160319 - incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
Author: cutting
Date: Wed Apr 6 12:13:07 2005
New Revision: 160319
URL: http://svn.apache.org/viewcvs?view=rev&rev=160319
Log:
Changed to use NekoHTML's DOMParser instead of its DOMFragmentParser.
For some reason, the DOMFragmentParser can be very slow with large
documents, while the DOMParser has no problems with them. Also added
a main() that permits easier debugging.
Modified:
incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
Modified: incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?view=diff&r1=160318&r2=160319
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Wed Apr 6 12:13:07 2005
@@ -21,8 +21,7 @@
import java.util.logging.*;
import java.net.URL;
import java.net.MalformedURLException;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
+import java.io.*;
import java.util.regex.*;
import org.cyberneko.html.parsers.*;
@@ -96,7 +95,7 @@
NutchConf.get().get("parser.character.encoding.default", "windows-1252");
public Parse getParse(Content content) throws ParseException {
- DOMFragmentParser parser = new DOMFragmentParser();
+ DOMParser parser = new DOMParser();
// some plugins, e.g., creativecommons, need to examine html comments
try {
@@ -124,9 +123,7 @@
throw new ParseException("Content-Type not text/html: " + contentType);
// parse the content
- HTMLDocumentImpl impl = new HTMLDocumentImpl();
- impl.setErrorChecking(false);
- DocumentFragment root = impl.createDocumentFragment();
+ DocumentFragment root;
try {
byte[] contentInOctets = content.getContent();
InputSource input =
@@ -167,7 +164,13 @@
LOG.fine(base + ": falling back to " + defaultCharEncoding);
}
- parser.parse(input, root);
+ LOG.fine("Parsing...");
+ parser.parse(input);
+
+ // convert Document to DocumentFragment
+ Document doc = parser.getDocument();
+ root = doc.createDocumentFragment();
+ root.appendChild(doc.getDocumentElement());
} catch (IOException e) {
throw new ParseException(e);
} catch (DOMException e) {
@@ -182,9 +185,11 @@
// check meta directives
if (!robotsMeta.getNoIndex()) { // okay to index
StringBuffer sb = new StringBuffer();
+ LOG.fine("Getting text...");
DOMContentUtils.getText(sb, root); // extract text
text = sb.toString();
sb.setLength(0);
+ LOG.fine("Getting title...");
DOMContentUtils.getTitle(sb, root); // extract title
title = sb.toString().trim();
}
@@ -192,6 +197,7 @@
if (!robotsMeta.getNoFollow()) { // okay to follow links
ArrayList l = new ArrayList(); // extract outlinks
URL baseTag = DOMContentUtils.getBase(root);
+ LOG.fine("Getting links...");
DOMContentUtils.getOutlinks(baseTag!=null?baseTag:base, l, root);
outlinks = (Outlink[])l.toArray(new Outlink[l.size()]);
LOG.fine("found "+outlinks.length+" outlinks in "+content.getUrl());
@@ -209,5 +215,21 @@
// run filters on parse
return HtmlParseFilters.filter(content, parse, root);
+ }
+
+ public static void main(String[] args) throws Exception {
+ LOG.setLevel(Level.FINE);
+ String name = args[0];
+ String url = "file:"+name;
+ File file = new File(name);
+ byte[] bytes = new byte[(int)file.length()];
+ DataInputStream in = new DataInputStream(new FileInputStream(file));
+ in.readFully(bytes);
+ Parse parse = new HtmlParser().getParse(new Content(url,url,
+ bytes,"text/html",
+ new Properties()));
+ System.out.println("text length: "+parse.getText().length());
+ System.out.println("links: "+parse.getData().getOutlinks().length);
+
}
}