You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2017/12/18 15:50:02 UTC
[nutch] 18/23: NUTCH-2478 HTML parser should resolve base URL - finally fix parse-tika: href attribute of base element
dropped in DOM, need to call tikamd.get("Content-Location"); port HTML
parser test from parse-html to parse-tika; add method to DomUtil which
prints DocumentFragment
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 8f692d13d45642f8b447d47af796f06487afeec2
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Fri Dec 15 21:35:27 2017 +0100
NUTCH-2478 HTML parser should resolve base URL <base href=...>
- finally fix parse-tika:
- href attribute of base element dropped in DOM
- need to call tikamd.get("Content-Location")
- port HTML parser test from parse-html to parse-tika
- add method to DomUtil which prints DocumentFragment
---
src/java/org/apache/nutch/util/DomUtil.java | 9 +++++++++
.../java/org/apache/nutch/parse/html/HtmlParser.java | 13 ++++++++-----
.../org/apache/nutch/parse/html/TestHtmlParser.java | 2 +-
.../java/org/apache/nutch/parse/tika/TikaParser.java | 18 +++++++++++-------
.../test/org/apache/nutch/tika}/TestHtmlParser.java | 10 +++++-----
5 files changed, 34 insertions(+), 18 deletions(-)
diff --git a/src/java/org/apache/nutch/util/DomUtil.java b/src/java/org/apache/nutch/util/DomUtil.java
index e93477a..b4f0eac 100644
--- a/src/java/org/apache/nutch/util/DomUtil.java
+++ b/src/java/org/apache/nutch/util/DomUtil.java
@@ -31,7 +31,9 @@ import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.xerces.parsers.DOMParser;
+import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
@@ -103,4 +105,11 @@ public class DomUtil {
LOG.error("Error: ", ex);
}
}
+
+ public static void saveDom(OutputStream os, DocumentFragment doc) {
+ NodeList docChildren = doc.getChildNodes();
+ for (int i = 0; i < docChildren.getLength(); i++) {
+ saveDom(os, (Element) docChildren.item(i));
+ }
+ }
}
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index e940eb1..9ed9fa4 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -207,11 +207,14 @@ public class HtmlParser implements Parser {
if (!metaTags.getNoFollow()) { // okay to follow links
ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
- URL baseTag = null;
- try {
- baseTag = new URL(base, utils.getBase(root));
- } catch (MalformedURLException e) {
- baseTag = base;
+ URL baseTag = base;
+ String baseTagHref = utils.getBase(root);
+ if (baseTagHref != null) {
+ try {
+ baseTag = new URL(base, baseTagHref);
+ } catch (MalformedURLException e) {
+ baseTag = base;
+ }
}
if (LOG.isTraceEnabled()) {
LOG.trace("Getting links...");
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
index 8fe94e6..a4c8206 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
+++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -81,7 +81,7 @@ public class TestHtmlParser {
"\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };
private static final String resolveBaseUrlTestContent = //
- "<html>\\n<head>\n" + //
+ "<html>\n<head>\n" + //
" <title>Test Resolve Base URLs (NUTCH-2478)</title>\n" + //
" <base href=\"//www.example.com/\">\n" + //
"</head>\n<body>\n" + //
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index 1173504..ea864be 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -52,6 +52,7 @@ import org.apache.tika.sax.TeeContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
import org.xml.sax.ContentHandler;
/**
@@ -170,21 +171,24 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
if (!metaTags.getNoFollow()) { // okay to follow links
ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
- URL baseTag = null;
- try {
- baseTag = new URL(base, utils.getBase(root));
- } catch (MalformedURLException e) {
- baseTag = base;
+ URL baseTag = base;
+ String baseTagHref = tikamd.get("Content-Location");
+ if (baseTagHref != null) {
+ try {
+ baseTag = new URL(base, baseTagHref);
+ } catch (MalformedURLException e) {
+ LOG.trace("Invalid <base href=\"{}\">", baseTagHref);
+ }
}
if (LOG.isTraceEnabled()) {
- LOG.trace("Getting links...");
+ LOG.trace("Getting links (base URL = {}) ...", baseTag);
}
// pre-1233 outlink extraction
//utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
// Get outlinks from Tika
List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
- utils.getOutlinks(baseTag, l, root);
+ utils.getOutlinks(baseTag, l, tikaExtractedOutlinks);
outlinks = l.toArray(new Outlink[l.size()]);
if (LOG.isTraceEnabled()) {
LOG.trace("found " + outlinks.length + " outlinks in "
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestHtmlParser.java
similarity index 96%
copy from src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
copy to src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestHtmlParser.java
index 8fe94e6..d2bc816 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestHtmlParser.java
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.nutch.parse.html;
+package org.apache.nutch.tika;
import java.lang.invoke.MethodHandles;
import java.nio.charset.Charset;
@@ -23,7 +23,7 @@ import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.html.HtmlParser;
+import org.apache.nutch.parse.tika.TikaParser;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.Parser;
@@ -81,7 +81,7 @@ public class TestHtmlParser {
"\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };
private static final String resolveBaseUrlTestContent = //
- "<html>\\n<head>\n" + //
+ "<html>\n<head>\n" + //
" <title>Test Resolve Base URLs (NUTCH-2478)</title>\n" + //
" <base href=\"//www.example.com/\">\n" + //
"</head>\n<body>\n" + //
@@ -93,8 +93,8 @@ public class TestHtmlParser {
public TestHtmlParser() {
conf = NutchConfiguration.create();
- conf.set("plugin.includes", "parse-html");
- parser = new HtmlParser();
+ conf.set("plugin.includes", "parse-tika");
+ parser = new TikaParser();
parser.setConf(conf);
}
--
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.