You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2019/08/26 15:05:11 UTC

[nutch] branch master updated: NUTCH-2726 Upgrade to Tika 1.22 - fall back to default Tika config if custom config file is not found - warn if loading a parser fails (reports potential plugin class loader issues) - improve ant build file to download plugin dependencies (src/plugin/parse-tika/build-ivy.xml) - complete exclusions of dependencies provided also in Nutch core - force same version of xml-apis to be used by tika-core and tika-parsers: otherwise Tika parsers may fail with a linkage error because different [...]

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 09b7142  NUTCH-2726 Upgrade to Tika 1.22 - fall back to default Tika config if custom config file is not found - warn if loading a parser fails (reports potential plugin class loader issues) - improve ant build file to download plugin dependencies   (src/plugin/parse-tika/build-ivy.xml) - complete exclusions of dependencies provided also in Nutch core - force same version of xml-apis to be used by tika-core and tika-parsers:   otherwise Tika parsers may fail with a linkage error b [...]
     new da2b51b  Merge pull request #459 from sebastian-nagel/NUTCH-2726-tika-1.22
09b7142 is described below

commit 09b7142fd41893ecfd6b4e466be5077430b0a709
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Thu Aug 8 13:34:43 2019 +0200

    NUTCH-2726 Upgrade to Tika 1.22
    - fall back to default Tika config if custom config file is not found
    - warn if loading a parser fails (reports potential plugin class loader issues)
    - improve ant build file to download plugin dependencies
      (src/plugin/parse-tika/build-ivy.xml)
    - complete exclusions of dependencies provided also in Nutch core
    - force same version of xml-apis to be used by tika-core and tika-parsers:
      otherwise Tika parsers may fail with a linkage error because different
      implementations of org.xml.sax.ContentHandler are used
---
 conf/tika-config.xml.template                      |   2 +-
 ivy/ivy.xml                                        |  16 ++--
 src/plugin/parse-tika/build-ivy.xml                |  11 ++-
 src/plugin/parse-tika/howto_upgrade_tika.txt       |   2 +-
 src/plugin/parse-tika/ivy.xml                      |   7 +-
 src/plugin/parse-tika/plugin.xml                   | 106 +++++++++++----------
 .../org/apache/nutch/parse/tika/TikaParser.java    |   6 +-
 7 files changed, 86 insertions(+), 64 deletions(-)

diff --git a/conf/tika-config.xml.template b/conf/tika-config.xml.template
index 30af37d..571a606 100644
--- a/conf/tika-config.xml.template
+++ b/conf/tika-config.xml.template
@@ -16,5 +16,5 @@
   limitations under the License.
 -->
 <properties>
-    <service-loader initializableProblemHandler="ignore"/>
+    <service-loader initializableProblemHandler="ignore" loadErrorHandler="warn" />
 </properties>
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 43bfdd4..a50441f 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -65,11 +65,12 @@
 		<dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.7.4" conf="*->default"/>
 		<!-- End of Hadoop Dependencies -->
 
-		<dependency org="org.apache.tika" name="tika-core" rev="1.20" />
-		<dependency org="com.ibm.icu" name="icu4j" rev="61.1" />
+		<dependency org="org.apache.tika" name="tika-core" rev="1.22" />
+
+		<dependency org="xml-apis" name="xml-apis" rev="1.4.01"/><!-- force this version as it is required by Tika -->
+		<dependency org="xerces" name="xercesImpl" rev="2.12.0" />
 
-		<dependency org="xerces" name="xercesImpl" rev="2.11.0" />
-		<dependency org="xerces" name="xmlParserAPIs" rev="2.6.2" />
+		<dependency org="com.ibm.icu" name="icu4j" rev="61.1" />
 
 		<dependency org="com.google.guava" name="guava" rev="25.0-jre" />
 
@@ -83,9 +84,10 @@
 		<dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.2.7" conf="*->default"/>
 		<dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.2.7" conf="*->default"/>
 		<dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.2.7" conf="test->default"/>
-		<dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.9.7" conf="*->default"/>
-		<dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.9.7" conf="*->default"/>
-		<dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.9.7" conf="*->default"/>
+		<dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.9.9" conf="*->default"/>
+		<dependency org="com.fasterxml.jackson.core" name="jackson-annotations" rev="2.9.9" conf="*->default"/>
+		<dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.9.9" conf="*->default"/>
+		<dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.9.9" conf="*->default"/>
 
 		<!-- WARC artifacts needed -->
 		<dependency org="org.netpreserve.commons" name="webarchive-commons" rev="1.1.5" conf="*->default">
diff --git a/src/plugin/parse-tika/build-ivy.xml b/src/plugin/parse-tika/build-ivy.xml
index e4984d8..a8a0fe9 100644
--- a/src/plugin/parse-tika/build-ivy.xml
+++ b/src/plugin/parse-tika/build-ivy.xml
@@ -17,14 +17,21 @@
 -->
 <project name="parse-tika" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
 
-    <property name="ivy.install.version" value="2.1.0" />
+    <property name="ivy.install.version" value="2.4.0" />
     <condition property="ivy.home" value="${env.IVY_HOME}">
       <isset property="env.IVY_HOME" />
     </condition>
     <property name="ivy.home" value="${user.home}/.ant" />
     <property name="ivy.checksums" value="" />
     <property name="ivy.jar.dir" value="${ivy.home}/lib" />
-    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy-${ivy.install.version}.jar" />
+    <!-- define packaging.type=jar to work around the failing dependency download of
+           javax.ws.rs-api.jar
+         required by Tika (1.19 and higher), cf.
+           https://github.com/eclipse-ee4j/jaxrs-api/issues/572
+           https://github.com/jax-rs/api/pull/576
+    -->
+    <property name="packaging.type" value="jar"/>
 
     <target name="download-ivy" unless="offline">
 
diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.txt
index fbf7207..aa4147c 100644
--- a/src/plugin/parse-tika/howto_upgrade_tika.txt
+++ b/src/plugin/parse-tika/howto_upgrade_tika.txt
@@ -1,4 +1,4 @@
-1. Upgrade Tika depencency (tika-core) in ivy/ivy.xml
+1. Upgrade Tika dependency (tika-core) in ivy/ivy.xml
 
 2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml
 
diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml
index df06f14..08d0f12 100644
--- a/src/plugin/parse-tika/ivy.xml
+++ b/src/plugin/parse-tika/ivy.xml
@@ -36,8 +36,8 @@
   </publications>
 
   <dependencies>
-    <dependency org="org.apache.tika" name="tika-parsers" rev="1.20" conf="*->default">
-      <!-- exclusions of dependencies in Nutch core (ivy/ivy.xml) -->
+    <dependency org="org.apache.tika" name="tika-parsers" rev="1.22" conf="*->default">
+      <!-- exclusions of dependencies provided in Nutch core (ivy/ivy.xml) -->
       <exclude org="org.apache.tika" name="tika-core" />
       <exclude org="org.apache.httpcomponents" name="httpclient" />
       <exclude org="org.apache.httpcomponents" name="httpcore" />
@@ -50,10 +50,13 @@
       <exclude org="org.apache.cxf" name="cxf-core" />
       <exclude org="org.apache.cxf" name="cxf-rt-transports-http" />
       <exclude org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" />
+      <exclude org="com.fasterxml.jackson.core" name="jackson-core" />
       <exclude org="com.fasterxml.jackson.core" name="jackson-databind" />
+      <exclude org="com.fasterxml.jackson.core" name="jackson-annotations" />
       <exclude org="com.google.protobuf" name="protobuf-java" />
       <exclude org="org.slf4j" name="slf4j-log4j12" />
       <exclude org="org.slf4j" name="slf4j-api" />
+      <exclude org="xml-apis" name="xml-apis" /><!-- must be provided in core as it is used also by tika-core -->
     </dependency>
   </dependencies>
   
diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml
index b89f41e..18dad6c 100644
--- a/src/plugin/parse-tika/plugin.xml
+++ b/src/plugin/parse-tika/plugin.xml
@@ -26,83 +26,91 @@
          <export name="*"/>
       </library>
       <!-- dependencies of Tika (tika-parsers) -->
-      <library name="apache-mime4j-core-0.8.2.jar"/>
-      <library name="apache-mime4j-dom-0.8.2.jar"/>
-      <library name="asm-7.0.jar"/>
-      <library name="bcmail-jdk15on-1.60.jar"/>
-      <library name="bcpkix-jdk15on-1.60.jar"/>
-      <library name="bcprov-jdk15on-1.60.jar"/>
+      <library name="animal-sniffer-annotations-1.17.jar"/>
+      <library name="ant-1.10.5.jar"/>
+      <library name="ant-launcher-1.10.5.jar"/>
+      <library name="apache-mime4j-core-0.8.3.jar"/>
+      <library name="apache-mime4j-dom-0.8.3.jar"/>
+      <library name="asm-7.2-beta.jar"/>
+      <library name="bcmail-jdk15on-1.62.jar"/>
+      <library name="bcpkix-jdk15on-1.62.jar"/>
+      <library name="bcprov-jdk15on-1.62.jar"/>
       <library name="boilerpipe-1.1.0.jar"/>
       <library name="bzip2-0.9.1.jar"/>
-      <library name="c3p0-0.9.1.1.jar"/>
+      <library name="c3p0-0.9.5.4.jar"/>
       <library name="cdm-4.5.5.jar"/>
-      <library name="commons-collections4-4.2.jar"/>
-      <library name="commons-compress-1.18.jar"/>
-      <library name="commons-csv-1.6.jar"/>
+      <library name="checker-qual-2.8.1.jar"/>
+      <library name="codemodel-2.3.2.jar"/>
+      <library name="commons-csv-1.7.jar"/>
       <library name="commons-exec-1.3.jar"/>
       <library name="commons-io-2.6.jar"/>
-      <library name="commons-lang3-3.8.1.jar"/>
+      <library name="commons-logging-1.2.jar"/>
       <library name="commons-math3-3.6.1.jar"/>
       <library name="curvesapi-1.05.jar"/>
-      <library name="cxf-core-3.2.7.jar"/>
-      <library name="cxf-rt-frontend-jaxrs-3.2.7.jar"/>
-      <library name="cxf-rt-rs-client-3.2.7.jar"/>
-      <library name="cxf-rt-transports-http-3.2.7.jar"/>
+      <library name="cxf-rt-rs-client-3.3.2.jar"/>
+      <library name="cxf-rt-security-3.3.2.jar"/>
       <library name="dec-0.1.2.jar"/>
+      <library name="dtd-parser-1.4.1.jar"/>
       <library name="ehcache-core-2.6.2.jar"/>
-      <library name="FastInfoset-1.2.15.jar"/>
-      <library name="fontbox-2.0.13.jar"/>
+      <library name="error_prone_annotations-2.3.2.jar"/>
+      <library name="failureaccess-1.0.1.jar"/>
+      <library name="FastInfoset-1.2.16.jar"/>
+      <library name="fontbox-2.0.16.jar"/>
       <library name="geoapi-3.0.1.jar"/>
       <library name="grib-4.5.5.jar"/>
       <library name="gson-2.8.5.jar"/>
-      <library name="guava-17.0.jar"/>
-      <library name="httpmime-4.5.6.jar"/>
+      <library name="guava-28.0-jre.jar"/>
+      <library name="httpmime-4.5.9.jar"/>
       <library name="httpservices-4.5.5.jar"/>
       <library name="isoparser-1.1.22.jar"/>
-      <library name="istack-commons-runtime-3.0.7.jar"/>
-      <library name="jackcess-2.1.12.jar"/>
-      <library name="jackcess-encrypt-2.1.4.jar"/>
-      <library name="jackson-annotations-2.9.7.jar"/>
-      <library name="jackson-core-2.9.7.jar"/>
-      <library name="jackson-databind-2.9.7.jar"/>
+      <library name="istack-commons-runtime-3.0.8.jar"/>
+      <library name="istack-commons-tools-3.0.8.jar"/>
+      <library name="j2objc-annotations-1.3.jar"/>
+      <library name="jackcess-3.0.1.jar"/>
+      <library name="jackcess-encrypt-3.0.0.jar"/>
       <library name="jai-imageio-core-1.4.0.jar"/>
+      <library name="jakarta.activation-1.2.1.jar"/>
+      <library name="jakarta.activation-api-1.2.1.jar"/>
+      <library name="jakarta.ws.rs-api-2.1.5.jar"/>
+      <library name="jakarta.xml.bind-api-2.3.2.jar"/>
       <library name="java-libpst-0.8.1.jar"/>
-      <library name="javax.activation-1.2.0.jar"/>
       <library name="javax.annotation-api-1.3.2.jar"/>
-      <library name="javax.ws.rs-api-2.1.1.jar"/>
-      <library name="jaxb-api-2.3.1.jar"/>
-      <library name="jaxb-runtime-2.3.1.jar"/>
+      <library name="jaxb-runtime-2.3.2.jar"/>
+      <library name="jaxb-xjc-2.3.2.jar"/>
       <library name="jbig2-imageio-3.0.2.jar"/>
       <library name="jcip-annotations-1.0.jar"/>
-      <library name="jcl-over-slf4j-1.7.25.jar"/>
+      <library name="jcl-over-slf4j-1.7.26.jar"/>
       <library name="jcommander-1.35.jar"/>
       <library name="jdom2-2.0.6.jar"/>
       <library name="jempbox-1.8.16.jar"/>
       <library name="jhighlight-1.0.3.jar"/>
       <library name="jmatio-1.5.jar"/>
-      <library name="jna-5.1.0.jar"/>
+      <library name="jna-5.3.1.jar"/>
       <library name="joda-time-2.2.jar"/>
       <library name="json-simple-1.1.1.jar"/>
-      <library name="jsoup-1.11.3.jar"/>
-      <library name="jul-to-slf4j-1.7.25.jar"/>
+      <library name="jsoup-1.12.1.jar"/>
+      <library name="jsr305-3.0.2.jar"/>
+      <library name="jul-to-slf4j-1.7.26.jar"/>
       <library name="juniversalchardet-1.0.3.jar"/>
-      <library name="junrar-2.0.0.jar"/>
+      <library name="junrar-4.0.0.jar"/>
+      <library name="listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar"/>
+      <library name="mchange-commons-java-0.2.15.jar"/>
       <library name="metadata-extractor-2.11.0.jar"/>
       <library name="netcdf4-4.5.5.jar"/>
-      <library name="openjson-1.0.10.jar"/>
-      <library name="opennlp-tools-1.9.0.jar"/>
-      <library name="parso-2.0.10.jar"/>
-      <library name="pdfbox-2.0.13.jar"/>
-      <library name="pdfbox-tools-2.0.13.jar"/>
+      <library name="openjson-1.0.11.jar"/>
+      <library name="opennlp-tools-1.9.1.jar"/>
+      <library name="parso-2.0.11.jar"/>
+      <library name="pdfbox-2.0.16.jar"/>
+      <library name="pdfbox-tools-2.0.16.jar"/>
       <library name="poi-4.0.1.jar"/>
       <library name="poi-ooxml-4.0.1.jar"/>
       <library name="poi-ooxml-schemas-4.0.1.jar"/>
       <library name="poi-scratchpad-4.0.1.jar"/>
-      <library name="procyon-compilertools-0.5.32.jar"/>
-      <library name="procyon-core-0.5.32.jar"/>
       <library name="quartz-2.2.0.jar"/>
-      <library name="rome-1.12.0.jar"/>
-      <library name="rome-utils-1.12.0.jar"/>
+      <library name="relaxng-datatype-2.3.2.jar"/>
+      <library name="rngom-2.3.2.jar"/>
+      <library name="rome-1.12.1.jar"/>
+      <library name="rome-utils-1.12.1.jar"/>
       <library name="sentiment-analysis-parser-0.1.jar"/>
       <library name="sis-feature-0.8.jar"/>
       <library name="sis-metadata-0.8.jar"/>
@@ -111,20 +119,20 @@
       <library name="sis-storage-0.8.jar"/>
       <library name="sis-utility-0.8.jar"/>
       <library name="stax2-api-3.1.4.jar"/>
-      <library name="stax-ex-1.8.jar"/>
+      <library name="stax-ex-1.8.1.jar"/>
       <library name="tagsoup-1.2.1.jar"/>
-      <library name="tika-parsers-1.20.jar"/>
-      <library name="txw2-2.3.1.jar"/>
+      <library name="tika-parsers-1.22.jar"/>
+      <library name="txw2-2.3.2.jar"/>
       <library name="udunits-4.5.5.jar"/>
-      <library name="uimafit-core-2.4.0.jar"/>
-      <library name="uimaj-core-3.0.1.jar"/>
       <library name="unit-api-1.0.jar"/>
       <library name="vorbis-java-core-0.8.jar"/>
       <library name="vorbis-java-tika-0.8.jar"/>
       <library name="woodstox-core-5.0.3.jar"/>
+      <library name="xercesImpl-2.12.0.jar"/>
       <library name="xmlbeans-3.0.2.jar"/>
-      <library name="xmlschema-core-2.2.3.jar"/>
+      <library name="xmlschema-core-2.2.4.jar"/>
       <library name="xmpcore-5.1.3.jar"/>
+      <library name="xsom-2.3.2.jar"/>
       <library name="xz-1.8.jar"/>
       <!-- end of dependencies of Tika (tika-parsers) -->
    </runtime>
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index 40aa265..3a48c98 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -257,15 +257,17 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
       try {
         // see if a Tika config file can be found in the job file
         URL customTikaConfig = conf.getResource(customConfFile);
-        if (customTikaConfig != null)
+        if (customTikaConfig != null) {
           tikaConfig = new TikaConfig(customTikaConfig,
               this.getClass().getClassLoader());
+        }
       } catch (Exception e1) {
         String message = "Problem loading custom Tika configuration from "
             + customConfFile;
         LOG.error(message, e1);
       }
-    } else {
+    }
+    if (tikaConfig == null) {
       try {
         tikaConfig = new TikaConfig(this.getClass().getClassLoader());
       } catch (Exception e2) {