You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2019/01/21 15:36:29 UTC
[nutch] branch master updated: NUTCH-2682 Upgrade to Tika 1.20 -
upgrade to Tika dependencies to version 1.20 - plugin parse-tika: add
exclusions of transitive dependencies already provided as Nutch core
dependencies - upgrade Nutch core dependencies to match versions required
by Tika 1.20 - apply code formatting template to TikaParser class and
replace deprecated method calls
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 784aa5f NUTCH-2682 Upgrade to Tika 1.20 - upgrade to Tika dependencies to version 1.20 - plugin parse-tika: add exclusions of transitive dependencies already provided as Nutch core dependencies - upgrade Nutch core dependencies to match versions required by Tika 1.20 - apply code formatting template to TikaParser class and replace deprecated method calls
new 6934d52 Merge pull request #424 from sebastian-nagel/NUTCH-2682-upgrade-tika
784aa5f is described below
commit 784aa5f8a5210cdd129a583c1dccdffaad5f9807
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Fri Jan 4 17:40:17 2019 +0100
NUTCH-2682 Upgrade to Tika 1.20
- upgrade to Tika dependencies to version 1.20
- plugin parse-tika: add exclusions of transitive dependencies
already provided as Nutch core dependencies
- upgrade Nutch core dependencies to match versions required
by Tika 1.20
- apply code formatting template to TikaParser class and replace
deprecated method calls
---
ivy/ivy.xml | 26 +++----
src/plugin/parse-tika/howto_upgrade_tika.txt | 19 ++++++
src/plugin/parse-tika/ivy.xml | 16 ++++-
src/plugin/parse-tika/plugin.xml | 79 +++++++++++-----------
.../org/apache/nutch/parse/tika/TikaParser.java | 78 +++++++++++----------
5 files changed, 129 insertions(+), 89 deletions(-)
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index f1e4a80..52826bb 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -43,11 +43,11 @@
<exclude org="com.sun.jmx" name="jmxri" />
</dependency-->
- <dependency org="org.apache.commons" name="commons-lang3" rev="3.7" conf="*->default" />
- <dependency org="org.apache.commons" name="commons-collections4" rev="4.1" conf="*->master" />
- <dependency org="org.apache.httpcomponents" name="httpclient" rev="4.5.5" conf="*->master" />
+ <dependency org="org.apache.commons" name="commons-lang3" rev="3.8.1" conf="*->default" />
+ <dependency org="org.apache.commons" name="commons-collections4" rev="4.2" conf="*->master" />
+ <dependency org="org.apache.httpcomponents" name="httpclient" rev="4.5.6" conf="*->master" />
<dependency org="commons-codec" name="commons-codec" rev="1.11" conf="*->default" />
- <dependency org="org.apache.commons" name="commons-compress" rev="1.16.1" conf="*->default" />
+ <dependency org="org.apache.commons" name="commons-compress" rev="1.18" conf="*->default" />
<dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" />
<dependency org="com.tdunning" name="t-digest" rev="3.2" />
@@ -65,7 +65,7 @@
<dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.7.4" conf="*->default"/>
<!-- End of Hadoop Dependencies -->
- <dependency org="org.apache.tika" name="tika-core" rev="1.19.1" />
+ <dependency org="org.apache.tika" name="tika-core" rev="1.20" />
<dependency org="com.ibm.icu" name="icu4j" rev="61.1" />
<dependency org="xerces" name="xercesImpl" rev="2.11.0" />
@@ -78,14 +78,14 @@
<dependency org="com.martinkl.warc" name="warc-hadoop" rev="0.1.0" />
<!--dependency org="org.apache.cxf" name="cxf" rev="3.0.4" conf="*->default"/-->
- <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.1.15" conf="*->default"/>
- <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.1.15" conf="*->default"/>
- <dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.1.15" conf="*->default"/>
- <dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.1.15" conf="*->default"/>
- <dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.1.15" conf="test->default"/>
- <dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.9.5" conf="*->default"/>
- <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.9.5" conf="*->default"/>
- <dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.9.5" conf="*->default"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.2.7" conf="*->default"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.2.7" conf="*->default"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.2.7" conf="*->default"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.2.7" conf="*->default"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.2.7" conf="test->default"/>
+ <dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.9.7" conf="*->default"/>
+ <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.9.7" conf="*->default"/>
+ <dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.9.7" conf="*->default"/>
<!-- WARC artifacts needed -->
<dependency org="org.netpreserve.commons" name="webarchive-commons" rev="1.1.5" conf="*->default">
diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.txt
index f8bbae1..fbf7207 100644
--- a/src/plugin/parse-tika/howto_upgrade_tika.txt
+++ b/src/plugin/parse-tika/howto_upgrade_tika.txt
@@ -15,4 +15,23 @@
<!-- end of dependencies of Tika (tika-parsers) -->
with the output of the command above.
+4. (Optionally) remove overlapping dependencies between parse-tika and Nutch core dependencies:
+ - check for libs present both in
+ build/lib
+ and
+ build/plugins/parse-tika/
+ (possibly with different versions)
+ - duplicated libs can be added to the exclusions of transitive dependencies in
+ src/plugin/parse-tika/ivy.xml
+ - but make sure that the library versions in ivy/ivy.xml correspond to
+ those required by Tika
+
+5. Remove the locally "installed" dependencies in src/plugin/parse-tika/lib/:
+
+ $ rm -rf lib/
+
+6. Build Nutch and run all unit tests:
+
+ $ cd ../../../
+ $ ant clean runtime test
diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml
index 53c7775..df06f14 100644
--- a/src/plugin/parse-tika/ivy.xml
+++ b/src/plugin/parse-tika/ivy.xml
@@ -36,14 +36,24 @@
</publications>
<dependencies>
- <dependency org="org.apache.tika" name="tika-parsers" rev="1.19.1" conf="*->default">
+ <dependency org="org.apache.tika" name="tika-parsers" rev="1.20" conf="*->default">
+ <!-- exclusions of dependencies in Nutch core (ivy/ivy.xml) -->
<exclude org="org.apache.tika" name="tika-core" />
<exclude org="org.apache.httpcomponents" name="httpclient" />
<exclude org="org.apache.httpcomponents" name="httpcore" />
- <exclude org="org.slf4j" name="slf4j-log4j12" />
- <exclude org="org.slf4j" name="slf4j-api" />
<exclude org="commons-lang" name="commons-lang" />
+ <exclude org="org.apache.commons" name="commons-lang3" />
+ <exclude org="org.apache.commons" name="commons-codec" />
+ <exclude org="commons-codec" name="commons-codec" /><!-- older versions are published with org=commons-codec -->
+ <exclude org="org.apache.commons" name="commons-collections4" />
+ <exclude org="org.apache.commons" name="commons-compress" />
+ <exclude org="org.apache.cxf" name="cxf-core" />
+ <exclude org="org.apache.cxf" name="cxf-rt-transports-http" />
+ <exclude org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" />
+ <exclude org="com.fasterxml.jackson.core" name="jackson-databind" />
<exclude org="com.google.protobuf" name="protobuf-java" />
+ <exclude org="org.slf4j" name="slf4j-log4j12" />
+ <exclude org="org.slf4j" name="slf4j-api" />
</dependency>
</dependencies>
diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml
index 7dbe180..b89f41e 100644
--- a/src/plugin/parse-tika/plugin.xml
+++ b/src/plugin/parse-tika/plugin.xml
@@ -26,10 +26,9 @@
<export name="*"/>
</library>
<!-- dependencies of Tika (tika-parsers) -->
- <library name="activation-1.1.1.jar"/>
<library name="apache-mime4j-core-0.8.2.jar"/>
<library name="apache-mime4j-dom-0.8.2.jar"/>
- <library name="asm-6.2.jar"/>
+ <library name="asm-7.0.jar"/>
<library name="bcmail-jdk15on-1.60.jar"/>
<library name="bcpkix-jdk15on-1.60.jar"/>
<library name="bcprov-jdk15on-1.60.jar"/>
@@ -37,22 +36,22 @@
<library name="bzip2-0.9.1.jar"/>
<library name="c3p0-0.9.1.1.jar"/>
<library name="cdm-4.5.5.jar"/>
- <library name="commons-codec-1.11.jar"/>
<library name="commons-collections4-4.2.jar"/>
<library name="commons-compress-1.18.jar"/>
- <library name="commons-csv-1.5.jar"/>
+ <library name="commons-csv-1.6.jar"/>
<library name="commons-exec-1.3.jar"/>
<library name="commons-io-2.6.jar"/>
- <library name="commons-logging-1.2.jar"/>
- <library name="curvesapi-1.04.jar"/>
- <library name="cxf-core-3.2.6.jar"/>
- <library name="cxf-rt-frontend-jaxrs-3.2.6.jar"/>
- <library name="cxf-rt-rs-client-3.2.6.jar"/>
- <library name="cxf-rt-transports-http-3.2.6.jar"/>
+ <library name="commons-lang3-3.8.1.jar"/>
+ <library name="commons-math3-3.6.1.jar"/>
+ <library name="curvesapi-1.05.jar"/>
+ <library name="cxf-core-3.2.7.jar"/>
+ <library name="cxf-rt-frontend-jaxrs-3.2.7.jar"/>
+ <library name="cxf-rt-rs-client-3.2.7.jar"/>
+ <library name="cxf-rt-transports-http-3.2.7.jar"/>
<library name="dec-0.1.2.jar"/>
<library name="ehcache-core-2.6.2.jar"/>
- <library name="FastInfoset-1.2.13.jar"/>
- <library name="fontbox-2.0.12.jar"/>
+ <library name="FastInfoset-1.2.15.jar"/>
+ <library name="fontbox-2.0.13.jar"/>
<library name="geoapi-3.0.1.jar"/>
<library name="grib-4.5.5.jar"/>
<library name="gson-2.8.5.jar"/>
@@ -60,19 +59,19 @@
<library name="httpmime-4.5.6.jar"/>
<library name="httpservices-4.5.5.jar"/>
<library name="isoparser-1.1.22.jar"/>
- <library name="istack-commons-runtime-3.0.5.jar"/>
+ <library name="istack-commons-runtime-3.0.7.jar"/>
<library name="jackcess-2.1.12.jar"/>
<library name="jackcess-encrypt-2.1.4.jar"/>
- <library name="jackson-annotations-2.9.6.jar"/>
- <library name="jackson-core-2.9.6.jar"/>
- <library name="jackson-databind-2.9.6.jar"/>
+ <library name="jackson-annotations-2.9.7.jar"/>
+ <library name="jackson-core-2.9.7.jar"/>
+ <library name="jackson-databind-2.9.7.jar"/>
<library name="jai-imageio-core-1.4.0.jar"/>
<library name="java-libpst-0.8.1.jar"/>
- <library name="javax.annotation-api-1.3.jar"/>
- <library name="javax.ws.rs-api-2.1.jar"/>
- <library name="jaxb-api-2.3.0.jar"/>
- <library name="jaxb-core-2.3.0.1.jar"/>
- <library name="jaxb-runtime-2.3.0.1.jar"/>
+ <library name="javax.activation-1.2.0.jar"/>
+ <library name="javax.annotation-api-1.3.2.jar"/>
+ <library name="javax.ws.rs-api-2.1.1.jar"/>
+ <library name="jaxb-api-2.3.1.jar"/>
+ <library name="jaxb-runtime-2.3.1.jar"/>
<library name="jbig2-imageio-3.0.2.jar"/>
<library name="jcip-annotations-1.0.jar"/>
<library name="jcl-over-slf4j-1.7.25.jar"/>
@@ -81,7 +80,7 @@
<library name="jempbox-1.8.16.jar"/>
<library name="jhighlight-1.0.3.jar"/>
<library name="jmatio-1.5.jar"/>
- <library name="jna-4.3.0.jar"/>
+ <library name="jna-5.1.0.jar"/>
<library name="joda-time-2.2.jar"/>
<library name="json-simple-1.1.1.jar"/>
<library name="jsoup-1.11.3.jar"/>
@@ -92,16 +91,18 @@
<library name="netcdf4-4.5.5.jar"/>
<library name="openjson-1.0.10.jar"/>
<library name="opennlp-tools-1.9.0.jar"/>
- <library name="parso-2.0.9.jar"/>
- <library name="pdfbox-2.0.12.jar"/>
- <library name="pdfbox-tools-2.0.12.jar"/>
- <library name="poi-4.0.0.jar"/>
- <library name="poi-ooxml-4.0.0.jar"/>
- <library name="poi-ooxml-schemas-4.0.0.jar"/>
- <library name="poi-scratchpad-4.0.0.jar"/>
+ <library name="parso-2.0.10.jar"/>
+ <library name="pdfbox-2.0.13.jar"/>
+ <library name="pdfbox-tools-2.0.13.jar"/>
+ <library name="poi-4.0.1.jar"/>
+ <library name="poi-ooxml-4.0.1.jar"/>
+ <library name="poi-ooxml-schemas-4.0.1.jar"/>
+ <library name="poi-scratchpad-4.0.1.jar"/>
+ <library name="procyon-compilertools-0.5.32.jar"/>
+ <library name="procyon-core-0.5.32.jar"/>
<library name="quartz-2.2.0.jar"/>
- <library name="rome-1.5.1.jar"/>
- <library name="rome-utils-1.5.1.jar"/>
+ <library name="rome-1.12.0.jar"/>
+ <library name="rome-utils-1.12.0.jar"/>
<library name="sentiment-analysis-parser-0.1.jar"/>
<library name="sis-feature-0.8.jar"/>
<library name="sis-metadata-0.8.jar"/>
@@ -109,19 +110,19 @@
<library name="sis-referencing-0.8.jar"/>
<library name="sis-storage-0.8.jar"/>
<library name="sis-utility-0.8.jar"/>
- <library name="stax2-api-4.1.jar"/>
- <library name="stax-ex-1.7.8.jar"/>
+ <library name="stax2-api-3.1.4.jar"/>
+ <library name="stax-ex-1.8.jar"/>
<library name="tagsoup-1.2.1.jar"/>
- <library name="tika-parsers-1.19.1.jar"/>
- <library name="txw2-2.3.0.1.jar"/>
+ <library name="tika-parsers-1.20.jar"/>
+ <library name="txw2-2.3.1.jar"/>
<library name="udunits-4.5.5.jar"/>
- <library name="uimafit-core-2.2.0.jar"/>
- <library name="uimaj-core-2.9.0.jar"/>
+ <library name="uimafit-core-2.4.0.jar"/>
+ <library name="uimaj-core-3.0.1.jar"/>
<library name="unit-api-1.0.jar"/>
<library name="vorbis-java-core-0.8.jar"/>
<library name="vorbis-java-tika-0.8.jar"/>
- <library name="woodstox-core-5.1.0.jar"/>
- <library name="xmlbeans-3.0.1.jar"/>
+ <library name="woodstox-core-5.0.3.jar"/>
+ <library name="xmlbeans-3.0.2.jar"/>
<library name="xmlschema-core-2.2.3.jar"/>
<library name="xmpcore-5.1.3.jar"/>
<library name="xz-1.8.jar"/>
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index e346940..7440333 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -42,6 +42,7 @@ import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlMapper;
@@ -70,6 +71,8 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
private String cachingPolicy;
private HtmlMapper HTMLMapper;
private boolean upperCaseElementNames = true;
+ private String boilerpipeExtractorName;
+ private boolean useBoilerpipe;
public ParseResult getParse(Content content) {
HTMLDocumentImpl doc = new HTMLDocumentImpl();
@@ -83,59 +86,59 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
ParseResult getParse(Content content, HTMLDocumentImpl doc,
DocumentFragment root) {
String mimeType = content.getContentType();
-
- boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe");
- String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
URL base;
try {
base = new URL(content.getBaseUrl());
} catch (MalformedURLException e) {
- return new ParseStatus(e)
- .getEmptyParseResult(content.getUrl(), getConf());
+ return new ParseStatus(e).getEmptyParseResult(content.getUrl(),
+ getConf());
}
// get the right parser using the mime type as a clue
- Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
- byte[] raw = content.getContent();
-
+ CompositeParser compositeParser = (CompositeParser) tikaConfig.getParser();
+ Parser parser = compositeParser.getParsers().get(MediaType.parse(mimeType));
if (parser == null) {
String message = "Can't retrieve Tika parser for mime-type " + mimeType;
LOG.error(message);
- return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult(
- content.getUrl(), getConf());
+ return new ParseStatus(ParseStatus.FAILED, message)
+ .getEmptyParseResult(content.getUrl(), getConf());
}
- LOG.debug("Using Tika parser " + parser.getClass().getName()
- + " for mime-type " + mimeType);
+ LOG.debug("Using Tika parser {} for mime-type {}.",
+ parser.getClass().getName(), mimeType);
+ byte[] raw = content.getContent();
Metadata tikamd = new Metadata();
ContentHandler domHandler;
-
+
// Check whether to use Tika's BoilerplateContentHandler
if (useBoilerpipe) {
- BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler)new DOMBuilder(doc, root),
- BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
+ BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(
+ (ContentHandler) new DOMBuilder(doc, root),
+ BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
bpHandler.setIncludeMarkup(true);
- domHandler = (ContentHandler)bpHandler;
+ domHandler = (ContentHandler) bpHandler;
} else {
DOMBuilder domBuilder = new DOMBuilder(doc, root);
domBuilder.setUpperCaseElementNames(upperCaseElementNames);
domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
- domHandler = (ContentHandler)domBuilder;
+ domHandler = (ContentHandler) domBuilder;
}
LinkContentHandler linkContentHandler = new LinkContentHandler();
ParseContext context = new ParseContext();
- TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler);
-
+ TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler,
+ linkContentHandler);
+
if (HTMLMapper != null)
context.set(HtmlMapper.class, HTMLMapper);
tikamd.set(Metadata.CONTENT_TYPE, mimeType);
try {
- parser.parse(new ByteArrayInputStream(raw), (ContentHandler)teeContentHandler, tikamd, context);
+ parser.parse(new ByteArrayInputStream(raw),
+ (ContentHandler) teeContentHandler, tikamd, context);
} catch (Exception e) {
LOG.error("Error parsing " + content.getUrl(), e);
return new ParseStatus(ParseStatus.FAILED, e.getMessage())
@@ -186,16 +189,16 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
if (LOG.isTraceEnabled()) {
LOG.trace("Getting links (base URL = {}) ...", baseTag);
}
-
+
// pre-1233 outlink extraction
- //utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
+ // utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
// Get outlinks from Tika
List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
utils.getOutlinks(baseTag, l, tikaExtractedOutlinks);
outlinks = l.toArray(new Outlink[l.size()]);
if (LOG.isTraceEnabled()) {
- LOG.trace("found " + outlinks.length + " outlinks in "
- + content.getUrl());
+ LOG.trace(
+ "found " + outlinks.length + " outlinks in " + content.getUrl());
}
}
@@ -251,7 +254,8 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
// see if a Tika config file can be found in the job file
URL customTikaConfig = conf.getResource(customConfFile);
if (customTikaConfig != null)
- tikaConfig = new TikaConfig(customTikaConfig, this.getClass().getClassLoader());
+ tikaConfig = new TikaConfig(customTikaConfig,
+ this.getClass().getClassLoader());
} catch (Exception e1) {
String message = "Problem loading custom Tika configuration from "
+ customConfFile;
@@ -277,20 +281,26 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
throw new RuntimeException("Class " + htmlmapperClassName
+ " does not implement HtmlMapper");
}
- HTMLMapper = (HtmlMapper) HTMLMapperClass.getConstructor().newInstance();
+ HTMLMapper = (HtmlMapper) HTMLMapperClass.getConstructor()
+ .newInstance();
} catch (Exception e) {
- LOG.error("Can't generate instance for class " + htmlmapperClassName);
- throw new RuntimeException("Can't generate instance for class "
- + htmlmapperClassName);
+ String message = "Can't generate instance for class "
+ + htmlmapperClassName;
+ LOG.error(message);
+ throw new RuntimeException(message);
}
}
- this.htmlParseFilters = new HtmlParseFilters(getConf());
- this.utils = new DOMContentUtils(conf);
- this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
+ htmlParseFilters = new HtmlParseFilters(getConf());
+ utils = new DOMContentUtils(conf);
+ cachingPolicy = getConf().get("parser.caching.forbidden.policy",
Nutch.CACHING_FORBIDDEN_CONTENT);
- this.upperCaseElementNames = getConf().getBoolean(
- "tika.uppercase.element.names", true);
+ upperCaseElementNames = getConf().getBoolean("tika.uppercase.element.names",
+ true);
+ useBoilerpipe = getConf().get("tika.extractor", "none")
+ .equals("boilerpipe");
+ boilerpipeExtractorName = getConf()
+ .get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
}
public Configuration getConf() {