You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2020/04/30 08:28:24 UTC
[nutch] branch master updated: NUTCH-2772 Debugging parse filter to
show serialized DOM tree
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new caea3a0 NUTCH-2772 Debugging parse filter to show serialized DOM tree
new 3665345 Merge pull request #500 from sebastian-nagel/NUTCH-2772-parsefilter-debug
caea3a0 is described below
commit caea3a051aceb947d17ccfaa080f6bd864802a4d
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Thu Feb 27 17:14:06 2020 +0100
NUTCH-2772 Debugging parse filter to show serialized DOM tree
---
build.xml | 3 +
default.properties | 1 +
src/java/org/apache/nutch/util/DomUtil.java | 24 +++++---
src/plugin/build.xml | 2 +
src/plugin/parsefilter-debug/build.xml | 22 +++++++
src/plugin/parsefilter-debug/ivy.xml | 37 ++++++++++++
src/plugin/parsefilter-debug/plugin.xml | 41 +++++++++++++
.../nutch/parsefilter/debug/DebugParseFilter.java | 68 ++++++++++++++++++++++
.../nutch/parsefilter/debug/package-info.java | 23 ++++++++
9 files changed, 213 insertions(+), 8 deletions(-)
diff --git a/build.xml b/build.xml
index ae0f111..b54e713 100644
--- a/build.xml
+++ b/build.xml
@@ -210,6 +210,7 @@
<packageset dir="${plugins.dir}/parse-swf/src/java"/>
<packageset dir="${plugins.dir}/parse-tika/src/java"/>
<packageset dir="${plugins.dir}/parse-zip/src/java"/>
+ <packageset dir="${plugins.dir}/parsefilter-debug/src/java"/>
<packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/>
<packageset dir="${plugins.dir}/parsefilter-regex/src/java"/>
<packageset dir="${plugins.dir}/protocol-file/src/java"/>
@@ -719,6 +720,7 @@
<packageset dir="${plugins.dir}/parse-swf/src/java"/>
<packageset dir="${plugins.dir}/parse-tika/src/java"/>
<packageset dir="${plugins.dir}/parse-zip/src/java"/>
+ <packageset dir="${plugins.dir}/parsefilter-debug/src/java"/>
<packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/>
<packageset dir="${plugins.dir}/parsefilter-regex/src/java"/>
<packageset dir="${plugins.dir}/protocol-file/src/java"/>
@@ -1131,6 +1133,7 @@
<source path="${plugins.dir}/parse-tika/src/test/" />
<source path="${plugins.dir}/parse-zip/src/java/" />
<source path="${plugins.dir}/parse-zip/src/test/" />
+ <source path="${plugins.dir}/parsefilter-debug/src/java/" />
<source path="${plugins.dir}/parsefilter-naivebayes/src/java/" />
<source path="${plugins.dir}/parsefilter-regex/src/java/" />
<source path="${plugins.dir}/parsefilter-regex/src/test/" />
diff --git a/default.properties b/default.properties
index 668f938..1537a01 100644
--- a/default.properties
+++ b/default.properties
@@ -153,6 +153,7 @@ plugins.parse=\
# Parse Filter Plugins
#
plugins.parsefilter=\
+ org.apache.nutch.parsefilter.debug*:\
org.apache.nutch.parse.headings*:\
org.apache.nutch.parsefilter.naivebayes*:\
org.apache.nutch.parsefilter.regex*:\
diff --git a/src/java/org/apache/nutch/util/DomUtil.java b/src/java/org/apache/nutch/util/DomUtil.java
index 2461286..d0bfafd 100644
--- a/src/java/org/apache/nutch/util/DomUtil.java
+++ b/src/java/org/apache/nutch/util/DomUtil.java
@@ -22,7 +22,9 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.lang.invoke.MethodHandles;
+import java.nio.charset.StandardCharsets;
+import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
@@ -33,6 +35,7 @@ import javax.xml.transform.stream.StreamResult;
import org.apache.xerces.parsers.DOMParser;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
+import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
@@ -91,16 +94,12 @@ public class DomUtil {
try {
transformer = transFactory.newTransformer();
transformer.setOutputProperty("indent", "yes");
+ transformer.setOutputProperty(OutputKeys.ENCODING,
+ StandardCharsets.UTF_8.name());
StreamResult result = new StreamResult(os);
transformer.transform(source, result);
os.flush();
- } catch (UnsupportedEncodingException e1) {
- LOG.error("Error: ", e1);
- } catch (IOException e1) {
- LOG.error("Error: ", e1);
- } catch (TransformerConfigurationException e2) {
- LOG.error("Error: ", e2);
- } catch (TransformerException ex) {
+ } catch (IOException | TransformerException ex) {
LOG.error("Error: ", ex);
}
}
@@ -108,7 +107,16 @@ public class DomUtil {
public static void saveDom(OutputStream os, DocumentFragment doc) {
NodeList docChildren = doc.getChildNodes();
for (int i = 0; i < docChildren.getLength(); i++) {
- saveDom(os, (Element) docChildren.item(i));
+ Node child = docChildren.item(i);
+ if (child instanceof Element) {
+ saveDom(os, (Element) child);
+ } else {
+ try {
+ os.write(child.toString().getBytes(StandardCharsets.UTF_8));
+ } catch (IOException ex) {
+ LOG.error("Error: ", ex);
+ }
+ }
}
}
}
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index b0882a5..581a37a 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -68,6 +68,7 @@
<ant dir="parse-swf" target="deploy"/>
<ant dir="parse-tika" target="deploy"/>
<ant dir="parse-zip" target="deploy"/>
+ <ant dir="parsefilter-debug" target="deploy"/>
<ant dir="parsefilter-naivebayes" target="deploy"/>
<ant dir="parsefilter-regex" target="deploy"/>
<ant dir="protocol-file" target="deploy"/>
@@ -214,6 +215,7 @@
<ant dir="parse-swf" target="clean"/>
<ant dir="parse-tika" target="clean"/>
<ant dir="parse-zip" target="clean"/>
+ <ant dir="parsefilter-debug" target="clean" />
<ant dir="parsefilter-naivebayes" target="clean" />
<ant dir="parsefilter-regex" target="clean"/>
<ant dir="protocol-file" target="clean"/>
diff --git a/src/plugin/parsefilter-debug/build.xml b/src/plugin/parsefilter-debug/build.xml
new file mode 100644
index 0000000..1f175e4
--- /dev/null
+++ b/src/plugin/parsefilter-debug/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parsefilter-debug" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
diff --git a/src/plugin/parsefilter-debug/ivy.xml b/src/plugin/parsefilter-debug/ivy.xml
new file mode 100644
index 0000000..dac80e6
--- /dev/null
+++ b/src/plugin/parsefilter-debug/ivy.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="https://nutch.apache.org/"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+</ivy-module>
diff --git a/src/plugin/parsefilter-debug/plugin.xml b/src/plugin/parsefilter-debug/plugin.xml
new file mode 100644
index 0000000..bc4a574
--- /dev/null
+++ b/src/plugin/parsefilter-debug/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="parsefilter-debug"
+ name="Debugging Parse Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="parsefilter-debug.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+   <!-- Registers DebugParseFilter at the HtmlParseFilter extension point;
+        the extension id is named after this plugin (parsefilter-debug). -->
+   <extension id="org.apache.nutch.htmlparsefilter.debug"
+              name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="DebugParseFilter"
+                      class="org.apache.nutch.parsefilter.debug.DebugParseFilter">
+      </implementation>
+   </extension>
+
+</plugin>
diff --git a/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/DebugParseFilter.java b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/DebugParseFilter.java
new file mode 100644
index 0000000..691f894
--- /dev/null
+++ b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/DebugParseFilter.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parsefilter.debug;
+
+import java.io.ByteArrayOutputStream;
+import java.io.OutputStreamWriter;
+import java.lang.invoke.MethodHandles;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.DomUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.DocumentFragment;
+
+/**
+ * Adds serialized DOM to parse data, useful for debugging, to understand how
+ * the parser implementation interprets a document (not only HTML).
+ *
+ * <p>
+ * The serialized DOM tree is logged at debug level and stored in the parse
+ * metadata of the document under the key <code>DOM</code>.
+ * </p>
+ */
+public class DebugParseFilter implements HtmlParseFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  private Configuration conf;
+
+  /**
+   * Serializes the DOM tree of a parsed document and attaches it to the parse
+   * metadata of that document.
+   *
+   * @param content
+   *          fetched content, its URL is used to look up the parse
+   * @param parseResult
+   *          parse results holding the parse to be annotated
+   * @param metaTags
+   *          HTML meta tags (not used by this filter)
+   * @param doc
+   *          DOM tree to be serialized
+   * @return the passed parse result, left unchanged if it holds no parse for
+   *         the URL of the content
+   */
+  @Override
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+    Parse parse = parseResult.get(content.getUrl());
+    if (parse == null) {
+      // a debugging aid must not break parsing: without a parse for this URL
+      // there is nothing the DOM could be attached to
+      LOG.warn("No parse found for {}, serialized DOM not added",
+          content.getUrl());
+      return parseResult;
+    }
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    DomUtil.saveDom(baos, doc);
+    // decode the serialized bytes as UTF-8
+    String dom = new String(baos.toByteArray(), StandardCharsets.UTF_8);
+    LOG.debug(dom);
+    parse.getData().getParseMeta().set("DOM", dom);
+    return parseResult;
+  }
+
+  /**
+   * Stores the configuration handed over to this filter.
+   *
+   * @param conf
+   *          configuration to keep
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * @return the configuration previously set via
+   *         {@link #setConf(Configuration)}, or null if none was set
+   */
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+}
diff --git a/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/package-info.java b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/package-info.java
new file mode 100644
index 0000000..bbc24dd
--- /dev/null
+++ b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Adds serialized DOM to parse data, useful for debugging, to understand how
+ * the parser implementation interprets a document (not only HTML).
+ */
+package org.apache.nutch.parsefilter.debug;
+