You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2017/12/18 15:50:02 UTC

[nutch] 18/23: NUTCH-2478 HTML parser should resolve base URL <base href=...> - finally fix parse-tika: - href attribute of base element dropped in DOM - need to call tikamd.get("Content-Location") - port HTML parser test from parse-html to parse-tika - add method to DomUtil which prints DocumentFragment

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 8f692d13d45642f8b447d47af796f06487afeec2
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Fri Dec 15 21:35:27 2017 +0100

    NUTCH-2478 HTML parser should resolve base URL <base href=...>
    - finally fix parse-tika:
      - href attribute of base element dropped in DOM
      - need to call tikamd.get("Content-Location")
    - port HTML parser test from parse-html to parse-tika
    - add method to DomUtil which prints DocumentFragment
---
 src/java/org/apache/nutch/util/DomUtil.java            |  9 +++++++++
 .../java/org/apache/nutch/parse/html/HtmlParser.java   | 13 ++++++++-----
 .../org/apache/nutch/parse/html/TestHtmlParser.java    |  2 +-
 .../java/org/apache/nutch/parse/tika/TikaParser.java   | 18 +++++++++++-------
 .../test/org/apache/nutch/tika}/TestHtmlParser.java    | 10 +++++-----
 5 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/src/java/org/apache/nutch/util/DomUtil.java b/src/java/org/apache/nutch/util/DomUtil.java
index e93477a..b4f0eac 100644
--- a/src/java/org/apache/nutch/util/DomUtil.java
+++ b/src/java/org/apache/nutch/util/DomUtil.java
@@ -31,7 +31,9 @@ import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;
 
 import org.apache.xerces.parsers.DOMParser;
+import org.w3c.dom.DocumentFragment;
 import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 
@@ -103,4 +105,11 @@ public class DomUtil {
       LOG.error("Error: ", ex);
     }
   }
+
+  public static void saveDom(OutputStream os, DocumentFragment doc) {
+    NodeList docChildren = doc.getChildNodes();
+    for (int i = 0; i < docChildren.getLength(); i++) {
+      saveDom(os, (Element) docChildren.item(i));
+    }
+  }
 }
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index e940eb1..9ed9fa4 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -207,11 +207,14 @@ public class HtmlParser implements Parser {
 
     if (!metaTags.getNoFollow()) { // okay to follow links
       ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
-      URL baseTag = null;
-      try {
-        baseTag = new URL(base, utils.getBase(root));
-      } catch (MalformedURLException e) {
-        baseTag = base;
+      URL baseTag = base;
+      String baseTagHref = utils.getBase(root);
+      if (baseTagHref != null) {
+        try {
+          baseTag = new URL(base, baseTagHref);
+        } catch (MalformedURLException e) {
+          baseTag = base;
+        }
       }
       if (LOG.isTraceEnabled()) {
         LOG.trace("Getting links...");
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
index 8fe94e6..a4c8206 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
+++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -81,7 +81,7 @@ public class TestHtmlParser {
           "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };
 
   private static final String resolveBaseUrlTestContent = //
-      "<html>\\n<head>\n" + //
+      "<html>\n<head>\n" + //
       "  <title>Test Resolve Base URLs (NUTCH-2478)</title>\n" + //
       "  <base href=\"//www.example.com/\">\n" + //
       "</head>\n<body>\n" + //
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index 1173504..ea864be 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -52,6 +52,7 @@ import org.apache.tika.sax.TeeContentHandler;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
 import org.xml.sax.ContentHandler;
 
 /**
@@ -170,21 +171,24 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
 
     if (!metaTags.getNoFollow()) { // okay to follow links
       ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
-      URL baseTag = null;
-      try {
-        baseTag = new URL(base, utils.getBase(root));
-      } catch (MalformedURLException e) {
-        baseTag = base;
+      URL baseTag = base;
+      String baseTagHref = tikamd.get("Content-Location");
+      if (baseTagHref != null) {
+        try {
+          baseTag = new URL(base, baseTagHref);
+        } catch (MalformedURLException e) {
+          LOG.trace("Invalid <base href=\"{}\">", baseTagHref);
+        }
       }
       if (LOG.isTraceEnabled()) {
-        LOG.trace("Getting links...");
+        LOG.trace("Getting links (base URL = {}) ...", baseTag);
       }
       
       // pre-1233 outlink extraction
       //utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
       // Get outlinks from Tika
       List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
-      utils.getOutlinks(baseTag, l, root);
+      utils.getOutlinks(baseTag, l, tikaExtractedOutlinks);
       outlinks = l.toArray(new Outlink[l.size()]);
       if (LOG.isTraceEnabled()) {
         LOG.trace("found " + outlinks.length + " outlinks in "
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestHtmlParser.java
similarity index 96%
copy from src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
copy to src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestHtmlParser.java
index 8fe94e6..d2bc816 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestHtmlParser.java
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.nutch.parse.html;
+package org.apache.nutch.tika;
 
 import java.lang.invoke.MethodHandles;
 import java.nio.charset.Charset;
@@ -23,7 +23,7 @@ import java.nio.charset.StandardCharsets;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.html.HtmlParser;
+import org.apache.nutch.parse.tika.TikaParser;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.Parser;
@@ -81,7 +81,7 @@ public class TestHtmlParser {
           "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };
 
   private static final String resolveBaseUrlTestContent = //
-      "<html>\\n<head>\n" + //
+      "<html>\n<head>\n" + //
       "  <title>Test Resolve Base URLs (NUTCH-2478)</title>\n" + //
       "  <base href=\"//www.example.com/\">\n" + //
       "</head>\n<body>\n" + //
@@ -93,8 +93,8 @@ public class TestHtmlParser {
 
   public TestHtmlParser() {
     conf = NutchConfiguration.create();
-    conf.set("plugin.includes", "parse-html");
-    parser = new HtmlParser();
+    conf.set("plugin.includes", "parse-tika");
+    parser = new TikaParser();
     parser.setConf(conf);
   }
 

-- 
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.