You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2015/01/30 10:06:23 UTC

svn commit: r1655966 - in /nutch/trunk: CHANGES.txt src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Author: jnioche
Date: Fri Jan 30 09:06:23 2015
New Revision: 1655966

URL: http://svn.apache.org/r1655966
Log:
NUTCH-1918 TikaParser specifies a default namespace when generating DOM

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
    nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1655966&r1=1655965&r2=1655966&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jan 30 09:06:23 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1918 TikaParser specifies a default namespace when generating DOM (jnioche)
+
 * NUTCH-1889 Store all values from Tika metadata in Nutch metadata (jnioche)
 
 * NUTCH-865 Format source code in unique style (lewismc)

Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java?rev=1655966&r1=1655965&r2=1655966&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java Fri Jan 30 09:06:23 2015
@@ -60,6 +60,12 @@ class DOMBuilder implements ContentHandl
   protected Stack<Element> m_elemStack = new Stack<Element>();
 
   /**
+  * Elements recorded with this namespace URI will be converted to Nodes
+  * without a namespace
+  */
+  private String defaultNamespaceURI = null;
+
+  /**
    * DOMBuilder instance constructor... it will add the DOM nodes to the
    * document fragment.
    * 
@@ -269,7 +275,7 @@ class DOMBuilder implements ContentHandl
 
     // Note that the namespace-aware call must be used to correctly
     // construct a Level 2 DOM, even for non-namespaced nodes.
-    if ((null == ns) || (ns.length() == 0))
+    if ((null == ns) || (ns.length() == 0) || ns.equals(defaultNamespaceURI))
       elem = m_doc.createElementNS(null, name);
     else
       elem = m_doc.createElementNS(ns, name);
@@ -775,4 +781,12 @@ class DOMBuilder implements ContentHandl
   public void setUpperCaseElementNames(boolean upperCaseElementNames) {
     this.upperCaseElementNames = upperCaseElementNames;
   }
+ 
+  public String getDefaultNamespaceURI() {
+    return defaultNamespaceURI;
+  }
+
+  public void setDefaultNamespaceURI(String defaultNamespaceURI) {
+    this.defaultNamespaceURI = defaultNamespaceURI;
+  }
 }

Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1655966&r1=1655965&r2=1655966&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Fri Jan 30 09:06:23 2015
@@ -42,6 +42,7 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.html.HtmlMapper;
+import org.apache.tika.sax.XHTMLContentHandler;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.DocumentFragment;
@@ -96,6 +97,8 @@ public class TikaParser implements org.a
     DocumentFragment root = doc.createDocumentFragment();
     DOMBuilder domhandler = new DOMBuilder(doc, root);
     domhandler.setUpperCaseElementNames(upperCaseElementNames);
+    domhandler.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
+
     ParseContext context = new ParseContext();
     if (HTMLMapper != null)
       context.set(HtmlMapper.class, HTMLMapper);