You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2014/11/07 10:59:07 UTC

svn commit: r1637325 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Author: jnioche
Date: Fri Nov  7 09:59:06 2014
New Revision: 1637325

URL: http://svn.apache.org/r1637325
Log:
NUTCH-1887 Specify HTMLMapper to use in TikaParser

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1637325&r1=1637324&r2=1637325&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Nov  7 09:59:06 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1887 Specify HTMLMapper to use in TikaParser (jnioche)
+
 * NUTCH-1884 NullPointerException in parsechecker and indexchecker with symlinks in file URL (Mengying Wang, snagel)
 
 * NUTCH-1825 protocol-http may hang for certain web pages (Phu Kieu via snagel)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1637325&r1=1637324&r2=1637325&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Nov  7 09:59:06 2014
@@ -1190,6 +1190,16 @@
   </description>
 </property>
 
+<!--
+<property>
+  <name>tika.htmlmapper.classname</name>
+  <value>org.apache.tika.parser.html.IdentityHtmlMapper</value>
+  <description>Class name of the Tika HtmlMapper implementation to use. Influences which elements are included in the DOM and hence
+  the behaviour of the HtmlParseFilters.
+  </description>
+</property>
+-->
+
 <!-- urlfilter plugin properties -->
 
 <property>

Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1637325&r1=1637324&r2=1637325&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Fri Nov  7 09:59:06 2014
@@ -22,6 +22,7 @@ import java.net.URL;
 import java.util.ArrayList;
 import java.util.Map;
 
+import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.html.dom.HTMLDocumentImpl;
 import org.apache.nutch.metadata.Nutch;
@@ -40,6 +41,7 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.HtmlMapper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.DocumentFragment;
@@ -58,9 +60,10 @@ public class TikaParser implements org.a
 	private DOMContentUtils utils;
 	private HtmlParseFilters htmlParseFilters;
 	private String cachingPolicy;
+	private HtmlMapper HTMLMapper;
 
 	@SuppressWarnings("deprecation")
-  public ParseResult getParse(Content content) {
+	public ParseResult getParse(Content content) {
 		String mimeType = content.getContentType();
 
 		URL base;
@@ -93,11 +96,14 @@ public class TikaParser implements org.a
 		DocumentFragment root = doc.createDocumentFragment();
 		DOMBuilder domhandler = new DOMBuilder(doc, root);
 		ParseContext context = new ParseContext();
+		if (HTMLMapper != null)
+			context.set(HtmlMapper.class, HTMLMapper);
 		tikamd.set(Metadata.CONTENT_TYPE, mimeType);
 		try {
-		  parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd,context);
+			parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd,
+					context);
 		} catch (Exception e) {
-			LOG.error("Error parsing "+content.getUrl(),e);
+			LOG.error("Error parsing " + content.getUrl(), e);
 			return new ParseStatus(ParseStatus.FAILED, e.getMessage())
 					.getEmptyParseResult(content.getUrl(), getConf());
 		}
@@ -168,18 +174,18 @@ public class TikaParser implements org.a
 			status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
 					Integer.toString(metaTags.getRefreshTime()) });
 		}
-		ParseData parseData = new ParseData(status, title, outlinks, content
-				.getMetadata(), nutchMetadata);
-		ParseResult parseResult = ParseResult.createParseResult(content
-				.getUrl(), new ParseImpl(text, parseData));
+		ParseData parseData = new ParseData(status, title, outlinks,
+				content.getMetadata(), nutchMetadata);
+		ParseResult parseResult = ParseResult.createParseResult(
+				content.getUrl(), new ParseImpl(text, parseData));
 
 		// run filters on parse
 		ParseResult filteredParse = this.htmlParseFilters.filter(content,
 				parseResult, metaTags, root);
 		if (metaTags.getNoCache()) { // not okay to cache
 			for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
-				entry.getValue().getData().getParseMeta().set(
-						Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
+				entry.getValue().getData().getParseMeta()
+						.set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
 		}
 		return filteredParse;
 	}
@@ -189,7 +195,7 @@ public class TikaParser implements org.a
 		this.tikaConfig = null;
 
 		// do we want a custom Tika configuration file
-		// deprecated since Tika 0.7 which is based on 
+		// deprecated since Tika 0.7 which is based on
 		// a service provider based configuration
 		String customConfFile = conf.get("tika.config.file");
 		if (customConfFile != null) {
@@ -212,6 +218,26 @@ public class TikaParser implements org.a
 			}
 		}
 
+		// use a custom htmlmapper
+		String htmlmapperClassName = conf.get("tika.htmlmapper.classname");
+		if (StringUtils.isNotBlank(htmlmapperClassName)) {
+			try {
+				Class HTMLMapperClass = Class.forName(htmlmapperClassName);
+				boolean interfaceOK = HtmlMapper.class
+						.isAssignableFrom(HTMLMapperClass);
+				if (!interfaceOK) {
+					throw new RuntimeException("Class " + htmlmapperClassName
+							+ " does not implement HtmlMapper");
+				}
+				HTMLMapper = (HtmlMapper) HTMLMapperClass.newInstance();
+			} catch (Exception e) {
+				LOG.error("Can't generate instance for class "
+						+ htmlmapperClassName);
+				throw new RuntimeException("Can't generate instance for class "
+						+ htmlmapperClassName);
+			}
+		}
+
 		this.htmlParseFilters = new HtmlParseFilters(getConf());
 		this.utils = new DOMContentUtils(conf);
 		this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",