You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2014/12/11 12:40:40 UTC

svn commit: r1644604 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Author: jnioche
Date: Thu Dec 11 11:40:40 2014
New Revision: 1644604

URL: http://svn.apache.org/r1644604
Log:
NUTCH-1592 TikaParser can uppercase the element names while generating the DOM

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
    nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1644604&r1=1644603&r2=1644604&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Dec 11 11:40:40 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1592 TikaParser can uppercase the element names while generating the DOM (jnioche)
+
 * NUTCH-1877 Suffix URL filter to ignore query string by default (markus via snagel)
 
 * NUTCH-1890 Major Typo in Documentation for Integrating Nutch and Solr (Boadu Akoto Charles Jnr, mattmann)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1644604&r1=1644603&r2=1644604&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Thu Dec 11 11:40:40 2014
@@ -1200,6 +1200,15 @@
 </property>
 -->
 
+<property>
+  <name>tika.uppercase.element.names</name>
+  <value>true</value>
+  <description>Determines whether TikaParser should uppercase the element names while generating the DOM
+  for a page, as done by Neko (used by default by parse-html); see NUTCH-1592.
+  </description>
+</property>
+
+
 <!-- urlfilter plugin properties -->
 
 <property>

Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java?rev=1644604&r1=1644603&r2=1644604&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java Thu Dec 11 11:40:40 2014
@@ -47,6 +47,7 @@ import org.xml.sax.ext.LexicalHandler;
 class DOMBuilder
         implements ContentHandler, LexicalHandler
 {
+    private boolean upperCaseElementNames = true;
 
   /** Root document          */
   public Document m_doc;
@@ -265,7 +266,10 @@ class DOMBuilder
   {
 
     Element elem;
-
+    
+    if (upperCaseElementNames)
+        name = name.toUpperCase();
+    
 	// Note that the namespace-aware call must be used to correctly
 	// construct a Level 2 DOM, even for non-namespaced nodes.
     if ((null == ns) || (ns.length() == 0))
@@ -737,4 +741,12 @@ class DOMBuilder
    *        parameter entity, the name will begin with '%'.
    */
   public void skippedEntity(String name) throws org.xml.sax.SAXException{}
+  
+  public boolean isUpperCaseElementNames() {
+      return upperCaseElementNames;
+  }
+
+  public void setUpperCaseElementNames(boolean upperCaseElementNames) {
+      this.upperCaseElementNames = upperCaseElementNames;
+  }
 }

Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1644604&r1=1644603&r2=1644604&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Thu Dec 11 11:40:40 2014
@@ -61,6 +61,7 @@ public class TikaParser implements org.a
 	private HtmlParseFilters htmlParseFilters;
 	private String cachingPolicy;
 	private HtmlMapper HTMLMapper;
+	private boolean upperCaseElementNames = true;
 
 	@SuppressWarnings("deprecation")
 	public ParseResult getParse(Content content) {
@@ -95,6 +96,7 @@ public class TikaParser implements org.a
 		doc.setErrorChecking(false);
 		DocumentFragment root = doc.createDocumentFragment();
 		DOMBuilder domhandler = new DOMBuilder(doc, root);
+		domhandler.setUpperCaseElementNames(upperCaseElementNames);
 		ParseContext context = new ParseContext();
 		if (HTMLMapper != null)
 			context.set(HtmlMapper.class, HTMLMapper);
@@ -242,7 +244,8 @@ public class TikaParser implements org.a
 		this.utils = new DOMContentUtils(conf);
 		this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
 				Nutch.CACHING_FORBIDDEN_CONTENT);
-
+		this.upperCaseElementNames = getConf().getBoolean(
+				"tika.uppercase.element.names", true);
 	}
 
 	public Configuration getConf() {