You are viewing a plain-text version of this content; the hyperlink to the canonical version was not preserved in this copy.
Posted to commits@nutch.apache.org by sn...@apache.org on 2020/04/30 08:28:24 UTC

[nutch] branch master updated: NUTCH-2772 Debugging parse filter to show serialized DOM tree

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new caea3a0  NUTCH-2772 Debugging parse filter to show serialized DOM tree
     new 3665345  Merge pull request #500 from sebastian-nagel/NUTCH-2772-parsefilter-debug
caea3a0 is described below

commit caea3a051aceb947d17ccfaa080f6bd864802a4d
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Thu Feb 27 17:14:06 2020 +0100

    NUTCH-2772 Debugging parse filter to show serialized DOM tree
---
 build.xml                                          |  3 +
 default.properties                                 |  1 +
 src/java/org/apache/nutch/util/DomUtil.java        | 24 +++++---
 src/plugin/build.xml                               |  2 +
 src/plugin/parsefilter-debug/build.xml             | 22 +++++++
 src/plugin/parsefilter-debug/ivy.xml               | 37 ++++++++++++
 src/plugin/parsefilter-debug/plugin.xml            | 41 +++++++++++++
 .../nutch/parsefilter/debug/DebugParseFilter.java  | 68 ++++++++++++++++++++++
 .../nutch/parsefilter/debug/package-info.java      | 23 ++++++++
 9 files changed, 213 insertions(+), 8 deletions(-)

diff --git a/build.xml b/build.xml
index ae0f111..b54e713 100644
--- a/build.xml
+++ b/build.xml
@@ -210,6 +210,7 @@
       <packageset dir="${plugins.dir}/parse-swf/src/java"/>
       <packageset dir="${plugins.dir}/parse-tika/src/java"/>
       <packageset dir="${plugins.dir}/parse-zip/src/java"/>
+      <packageset dir="${plugins.dir}/parsefilter-debug/src/java"/>
       <packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/>
       <packageset dir="${plugins.dir}/parsefilter-regex/src/java"/>
       <packageset dir="${plugins.dir}/protocol-file/src/java"/>
@@ -719,6 +720,7 @@
       <packageset dir="${plugins.dir}/parse-swf/src/java"/>
       <packageset dir="${plugins.dir}/parse-tika/src/java"/>
       <packageset dir="${plugins.dir}/parse-zip/src/java"/>
+      <packageset dir="${plugins.dir}/parsefilter-debug/src/java"/>
       <packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/>
       <packageset dir="${plugins.dir}/parsefilter-regex/src/java"/>
       <packageset dir="${plugins.dir}/protocol-file/src/java"/>
@@ -1131,6 +1133,7 @@
         <source path="${plugins.dir}/parse-tika/src/test/" />
         <source path="${plugins.dir}/parse-zip/src/java/" />
         <source path="${plugins.dir}/parse-zip/src/test/" />
+        <source path="${plugins.dir}/parsefilter-debug/src/java/" />
         <source path="${plugins.dir}/parsefilter-naivebayes/src/java/" />
         <source path="${plugins.dir}/parsefilter-regex/src/java/" />
         <source path="${plugins.dir}/parsefilter-regex/src/test/" />
diff --git a/default.properties b/default.properties
index 668f938..1537a01 100644
--- a/default.properties
+++ b/default.properties
@@ -153,6 +153,7 @@ plugins.parse=\
 # Parse Filter Plugins
 #
 plugins.parsefilter=\
+   org.apache.nutch.parsefilter.debug*:\
    org.apache.nutch.parse.headings*:\
    org.apache.nutch.parsefilter.naivebayes*:\
    org.apache.nutch.parsefilter.regex*:\
diff --git a/src/java/org/apache/nutch/util/DomUtil.java b/src/java/org/apache/nutch/util/DomUtil.java
index 2461286..d0bfafd 100644
--- a/src/java/org/apache/nutch/util/DomUtil.java
+++ b/src/java/org/apache/nutch/util/DomUtil.java
@@ -22,7 +22,9 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.UnsupportedEncodingException;
 import java.lang.invoke.MethodHandles;
+import java.nio.charset.StandardCharsets;
 
+import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerConfigurationException;
 import javax.xml.transform.TransformerException;
@@ -33,6 +35,7 @@ import javax.xml.transform.stream.StreamResult;
 import org.apache.xerces.parsers.DOMParser;
 import org.w3c.dom.DocumentFragment;
 import org.w3c.dom.Element;
+import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
@@ -91,16 +94,12 @@ public class DomUtil {
     try {
       transformer = transFactory.newTransformer();
       transformer.setOutputProperty("indent", "yes");
+      transformer.setOutputProperty(OutputKeys.ENCODING,
+          StandardCharsets.UTF_8.name());
       StreamResult result = new StreamResult(os);
       transformer.transform(source, result);
       os.flush();
-    } catch (UnsupportedEncodingException e1) {
-      LOG.error("Error: ", e1);
-    } catch (IOException e1) {
-      LOG.error("Error: ", e1);
-    } catch (TransformerConfigurationException e2) {
-      LOG.error("Error: ", e2);
-    } catch (TransformerException ex) {
+    } catch (IOException | TransformerException ex) {
       LOG.error("Error: ", ex);
     }
   }
@@ -108,7 +107,16 @@ public class DomUtil {
   public static void saveDom(OutputStream os, DocumentFragment doc) {
     NodeList docChildren = doc.getChildNodes();
     for (int i = 0; i < docChildren.getLength(); i++) {
-      saveDom(os, (Element) docChildren.item(i));
+      Node child = docChildren.item(i);
+      if (child instanceof Element) {
+        saveDom(os, (Element) child);
+      } else {
+        try {
+          os.write(child.toString().getBytes(StandardCharsets.UTF_8));
+        } catch (IOException ex) {
+          LOG.error("Error: ", ex);
+        }
+      }
     }
   }
 }
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index b0882a5..581a37a 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -68,6 +68,7 @@
     <ant dir="parse-swf" target="deploy"/>
     <ant dir="parse-tika" target="deploy"/>
     <ant dir="parse-zip" target="deploy"/>
+    <ant dir="parsefilter-debug" target="deploy"/>
     <ant dir="parsefilter-naivebayes" target="deploy"/>
     <ant dir="parsefilter-regex" target="deploy"/>
     <ant dir="protocol-file" target="deploy"/>
@@ -214,6 +215,7 @@
     <ant dir="parse-swf" target="clean"/>
     <ant dir="parse-tika" target="clean"/>
     <ant dir="parse-zip" target="clean"/>
+    <ant dir="parsefilter-debug" target="clean" />
     <ant dir="parsefilter-naivebayes" target="clean" />
     <ant dir="parsefilter-regex" target="clean"/>
     <ant dir="protocol-file" target="clean"/>
diff --git a/src/plugin/parsefilter-debug/build.xml b/src/plugin/parsefilter-debug/build.xml
new file mode 100644
index 0000000..1f175e4
--- /dev/null
+++ b/src/plugin/parsefilter-debug/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parsefilter-debug" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>
diff --git a/src/plugin/parsefilter-debug/ivy.xml b/src/plugin/parsefilter-debug/ivy.xml
new file mode 100644
index 0000000..dac80e6
--- /dev/null
+++ b/src/plugin/parsefilter-debug/ivy.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="https://nutch.apache.org/"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+</ivy-module>
diff --git a/src/plugin/parsefilter-debug/plugin.xml b/src/plugin/parsefilter-debug/plugin.xml
new file mode 100644
index 0000000..bc4a574
--- /dev/null
+++ b/src/plugin/parsefilter-debug/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parsefilter-debug"
+   name="Debugging Parse Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parsefilter-debug.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.htmlparsefilter.regex"
+        name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="DebugParseFilter" 
+                      class="org.apache.nutch.parsefilter.debug.DebugParseFilter">
+      </implementation>
+   </extension>
+
+</plugin>
diff --git a/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/DebugParseFilter.java b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/DebugParseFilter.java
new file mode 100644
index 0000000..691f894
--- /dev/null
+++ b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/DebugParseFilter.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parsefilter.debug;
+
+import java.io.ByteArrayOutputStream;
+import java.io.OutputStreamWriter;
+import java.lang.invoke.MethodHandles;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.DomUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.DocumentFragment;
+
+/**
+ * Adds serialized DOM to parse data, useful for debugging, to understand how
+ * the parser implementation interprets a document (not only HTML).
+ */
+public class DebugParseFilter implements HtmlParseFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  private Configuration conf;
+
+  @Override
+  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    DomUtil.saveDom(baos, doc);
+    Parse parse = parseResult.get(content.getUrl());
+    String dom = new String(baos.toByteArray(), StandardCharsets.UTF_8);
+    LOG.debug(dom);
+    parse.getData().getParseMeta().set("DOM", dom);
+    return parseResult;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+}
diff --git a/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/package-info.java b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/package-info.java
new file mode 100644
index 0000000..bbc24dd
--- /dev/null
+++ b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Adds serialized DOM to parse data, useful for debugging, to understand how
+ * the parser implementation interprets a document (not only HTML).
+ */
+package org.apache.nutch.parsefilter.debug;
+