You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2015/01/30 10:06:23 UTC
svn commit: r1655966 - in /nutch/trunk: CHANGES.txt
src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
Author: jnioche
Date: Fri Jan 30 09:06:23 2015
New Revision: 1655966
URL: http://svn.apache.org/r1655966
Log:
NUTCH-1918 TikaParser specifies a default namespace when generating DOM
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1655966&r1=1655965&r2=1655966&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jan 30 09:06:23 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1918 TikaParser specifies a default namespace when generating DOM (jnioche)
+
* NUTCH-1889 Store all values from Tika metadata in Nutch metadata (jnioche)
* NUTCH-865 Format source code in unique style (lewismc)
Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java?rev=1655966&r1=1655965&r2=1655966&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java Fri Jan 30 09:06:23 2015
@@ -60,6 +60,12 @@ class DOMBuilder implements ContentHandl
protected Stack<Element> m_elemStack = new Stack<Element>();
/**
+ * Elements recorded with this namespace will be converted to Nodes without a
+ * namespace.
+ */
+ private String defaultNamespaceURI = null;
+
+ /**
* DOMBuilder instance constructor... it will add the DOM nodes to the
* document fragment.
*
@@ -269,7 +275,7 @@ class DOMBuilder implements ContentHandl
// Note that the namespace-aware call must be used to correctly
// construct a Level 2 DOM, even for non-namespaced nodes.
- if ((null == ns) || (ns.length() == 0))
+ if ((null == ns) || (ns.length() == 0) || ns.equals(defaultNamespaceURI))
elem = m_doc.createElementNS(null, name);
else
elem = m_doc.createElementNS(ns, name);
@@ -775,4 +781,12 @@ class DOMBuilder implements ContentHandl
public void setUpperCaseElementNames(boolean upperCaseElementNames) {
this.upperCaseElementNames = upperCaseElementNames;
}
+
+ public String getDefaultNamespaceURI() {
+ return defaultNamespaceURI;
+ }
+
+ public void setDefaultNamespaceURI(String defaultNamespaceURI) {
+ this.defaultNamespaceURI = defaultNamespaceURI;
+ }
}
Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1655966&r1=1655965&r2=1655966&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Fri Jan 30 09:06:23 2015
@@ -42,6 +42,7 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlMapper;
+import org.apache.tika.sax.XHTMLContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
@@ -96,6 +97,8 @@ public class TikaParser implements org.a
DocumentFragment root = doc.createDocumentFragment();
DOMBuilder domhandler = new DOMBuilder(doc, root);
domhandler.setUpperCaseElementNames(upperCaseElementNames);
+ domhandler.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
+
ParseContext context = new ParseContext();
if (HTMLMapper != null)
context.set(HtmlMapper.class, HTMLMapper);