You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2014/12/11 12:40:40 UTC
svn commit: r1644604 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml
src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
Author: jnioche
Date: Thu Dec 11 11:40:40 2014
New Revision: 1644604
URL: http://svn.apache.org/r1644604
Log:
NUTCH-1592 TikaParser can uppercase the element names while generating the DOM
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1644604&r1=1644603&r2=1644604&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Dec 11 11:40:40 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1592 TikaParser can uppercase the element names while generating the DOM (jnioche)
+
* NUTCH-1877 Suffix URL filter to ignore query string by default (markus via snagel)
* NUTCH-1890 Major Typo in Documentation for Integrating Nutch and Solr (Boadu Akoto Charles Jnr, mattmann)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1644604&r1=1644603&r2=1644604&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Thu Dec 11 11:40:40 2014
@@ -1200,6 +1200,15 @@
</property>
-->
+<property>
+ <name>tika.uppercase.element.names</name>
+ <value>true</value>
+ <description>Determines whether TikaParser should uppercase element names while generating the DOM
+ for a page, as done by Neko (which parse-html uses by default); see NUTCH-1592.
+ </description>
+</property>
+
+
<!-- urlfilter plugin properties -->
<property>
Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java?rev=1644604&r1=1644603&r2=1644604&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java Thu Dec 11 11:40:40 2014
@@ -47,6 +47,7 @@ import org.xml.sax.ext.LexicalHandler;
class DOMBuilder
implements ContentHandler, LexicalHandler
{
+ private boolean upperCaseElementNames = true;
/** Root document */
public Document m_doc;
@@ -265,7 +266,10 @@ class DOMBuilder
{
Element elem;
-
+
+ if (upperCaseElementNames)
+ name = name.toUpperCase();
+
// Note that the namespace-aware call must be used to correctly
// construct a Level 2 DOM, even for non-namespaced nodes.
if ((null == ns) || (ns.length() == 0))
@@ -737,4 +741,12 @@ class DOMBuilder
* parameter entity, the name will begin with '%'.
*/
public void skippedEntity(String name) throws org.xml.sax.SAXException{}
+
+ public boolean isUpperCaseElementNames() {
+ return upperCaseElementNames;
+ }
+
+ public void setUpperCaseElementNames(boolean upperCaseElementNames) {
+ this.upperCaseElementNames = upperCaseElementNames;
+ }
}
Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1644604&r1=1644603&r2=1644604&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Thu Dec 11 11:40:40 2014
@@ -61,6 +61,7 @@ public class TikaParser implements org.a
private HtmlParseFilters htmlParseFilters;
private String cachingPolicy;
private HtmlMapper HTMLMapper;
+ private boolean upperCaseElementNames = true;
@SuppressWarnings("deprecation")
public ParseResult getParse(Content content) {
@@ -95,6 +96,7 @@ public class TikaParser implements org.a
doc.setErrorChecking(false);
DocumentFragment root = doc.createDocumentFragment();
DOMBuilder domhandler = new DOMBuilder(doc, root);
+ domhandler.setUpperCaseElementNames(upperCaseElementNames);
ParseContext context = new ParseContext();
if (HTMLMapper != null)
context.set(HtmlMapper.class, HTMLMapper);
@@ -242,7 +244,8 @@ public class TikaParser implements org.a
this.utils = new DOMContentUtils(conf);
this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
Nutch.CACHING_FORBIDDEN_CONTENT);
-
+ this.upperCaseElementNames = getConf().getBoolean(
+ "tika.uppercase.element.names", true);
}
public Configuration getConf() {