You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2014/11/07 10:59:07 UTC
svn commit: r1637325 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml
src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
Author: jnioche
Date: Fri Nov 7 09:59:06 2014
New Revision: 1637325
URL: http://svn.apache.org/r1637325
Log:
NUTCH-1887 Specify HTMLMapper to use in TikaParser
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1637325&r1=1637324&r2=1637325&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Nov 7 09:59:06 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1887 Specify HTMLMapper to use in TikaParser (jnioche)
+
* NUTCH-1884 NullPointerException in parsechecker and indexchecker with symlinks in file URL (Mengying Wang, snagel)
* NUTCH-1825 protocol-http may hang for certain web pages (Phu Kieu via snagel)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1637325&r1=1637324&r2=1637325&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Nov 7 09:59:06 2014
@@ -1190,6 +1190,16 @@
</description>
</property>
+<!--
+<property>
+ <name>tika.htmlmapper.classname</name>
+ <value>org.apache.tika.parser.html.IdentityHtmlMapper</value>
+ <description>Fully qualified class name of the Tika HtmlMapper implementation to use. Determines which elements are included in the DOM and hence
+ the behaviour of the HtmlParseFilters.
+ </description>
+</property>
+-->
+
<!-- urlfilter plugin properties -->
<property>
Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1637325&r1=1637324&r2=1637325&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Fri Nov 7 09:59:06 2014
@@ -22,6 +22,7 @@ import java.net.URL;
import java.util.ArrayList;
import java.util.Map;
+import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.nutch.metadata.Nutch;
@@ -40,6 +41,7 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.HtmlMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
@@ -58,9 +60,10 @@ public class TikaParser implements org.a
private DOMContentUtils utils;
private HtmlParseFilters htmlParseFilters;
private String cachingPolicy;
+ private HtmlMapper HTMLMapper;
@SuppressWarnings("deprecation")
- public ParseResult getParse(Content content) {
+ public ParseResult getParse(Content content) {
String mimeType = content.getContentType();
URL base;
@@ -93,11 +96,14 @@ public class TikaParser implements org.a
DocumentFragment root = doc.createDocumentFragment();
DOMBuilder domhandler = new DOMBuilder(doc, root);
ParseContext context = new ParseContext();
+ if (HTMLMapper != null)
+ context.set(HtmlMapper.class, HTMLMapper);
tikamd.set(Metadata.CONTENT_TYPE, mimeType);
try {
- parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd,context);
+ parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd,
+ context);
} catch (Exception e) {
- LOG.error("Error parsing "+content.getUrl(),e);
+ LOG.error("Error parsing " + content.getUrl(), e);
return new ParseStatus(ParseStatus.FAILED, e.getMessage())
.getEmptyParseResult(content.getUrl(), getConf());
}
@@ -168,18 +174,18 @@ public class TikaParser implements org.a
status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
Integer.toString(metaTags.getRefreshTime()) });
}
- ParseData parseData = new ParseData(status, title, outlinks, content
- .getMetadata(), nutchMetadata);
- ParseResult parseResult = ParseResult.createParseResult(content
- .getUrl(), new ParseImpl(text, parseData));
+ ParseData parseData = new ParseData(status, title, outlinks,
+ content.getMetadata(), nutchMetadata);
+ ParseResult parseResult = ParseResult.createParseResult(
+ content.getUrl(), new ParseImpl(text, parseData));
// run filters on parse
ParseResult filteredParse = this.htmlParseFilters.filter(content,
parseResult, metaTags, root);
if (metaTags.getNoCache()) { // not okay to cache
for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
- entry.getValue().getData().getParseMeta().set(
- Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
+ entry.getValue().getData().getParseMeta()
+ .set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
}
return filteredParse;
}
@@ -189,7 +195,7 @@ public class TikaParser implements org.a
this.tikaConfig = null;
// do we want a custom Tika configuration file
- // deprecated since Tika 0.7 which is based on
+ // deprecated since Tika 0.7 which is based on
// a service provider based configuration
String customConfFile = conf.get("tika.config.file");
if (customConfFile != null) {
@@ -212,6 +218,26 @@ public class TikaParser implements org.a
}
}
+ // use a custom htmlmapper
+ String htmlmapperClassName = conf.get("tika.htmlmapper.classname");
+ if (StringUtils.isNotBlank(htmlmapperClassName)) {
+ try {
+ Class HTMLMapperClass = Class.forName(htmlmapperClassName);
+ boolean interfaceOK = HtmlMapper.class
+ .isAssignableFrom(HTMLMapperClass);
+ if (!interfaceOK) {
+ throw new RuntimeException("Class " + htmlmapperClassName
+ + " does not implement HtmlMapper");
+ }
+ HTMLMapper = (HtmlMapper) HTMLMapperClass.newInstance();
+ } catch (Exception e) {
+ LOG.error("Can't generate instance for class "
+ + htmlmapperClassName);
+ throw new RuntimeException("Can't generate instance for class "
+ + htmlmapperClassName);
+ }
+ }
+
this.htmlParseFilters = new HtmlParseFilters(getConf());
this.utils = new DOMContentUtils(conf);
this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",