You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2014/09/07 22:54:54 UTC

svn commit: r1623227 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/licenses/ solr/ solr/contrib/ solr/contrib/extraction/ solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ solr/licenses/

Author: uschindler
Date: Sun Sep  7 20:54:53 2014
New Revision: 1623227

URL: http://svn.apache.org/r1623227
Log:
Merged revision(s) 1623225 from lucene/dev/trunk:
SOLR-6488: Upgrade Solr Cell to TIKA 1.6

Added:
    lucene/dev/branches/branch_4x/lucene/licenses/commons-compress-1.8.1.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/lucene/licenses/commons-compress-1.8.1.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/aspectjrt-1.8.0.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/aspectjrt-1.8.0.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/commons-compress-1.8.1.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/commons-compress-1.8.1.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/fontbox-1.8.6.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/fontbox-1.8.6.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/isoparser-1.0.2.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/isoparser-1.0.2.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/java-libpst-0.8.1.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/java-libpst-0.8.1.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/java-libpst-LICENSE-ASL.txt
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/java-libpst-LICENSE-ASL.txt
    lucene/dev/branches/branch_4x/solr/licenses/java-libpst-NOTICE.txt
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/java-libpst-NOTICE.txt
    lucene/dev/branches/branch_4x/solr/licenses/jempbox-1.8.6.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/jempbox-1.8.6.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/jmatio-1.0.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/jmatio-1.0.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/jmatio-LICENSE-BSD.txt
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/jmatio-LICENSE-BSD.txt
    lucene/dev/branches/branch_4x/solr/licenses/jmatio-NOTICE.txt
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/jmatio-NOTICE.txt
    lucene/dev/branches/branch_4x/solr/licenses/pdfbox-1.8.6.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/pdfbox-1.8.6.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/poi-3.11-beta2.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/poi-3.11-beta2.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/poi-ooxml-3.11-beta2.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/poi-ooxml-3.11-beta2.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/poi-ooxml-schemas-3.11-beta2.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/poi-ooxml-schemas-3.11-beta2.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/poi-scratchpad-3.11-beta2.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/poi-scratchpad-3.11-beta2.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/rome-1.0.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/rome-1.0.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/tika-core-1.6.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/tika-core-1.6.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/tika-parsers-1.6.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/tika-parsers-1.6.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/tika-xmp-1.6.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/tika-xmp-1.6.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/vorbis-java-core-0.6.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/vorbis-java-core-0.6.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/vorbis-java-tika-0.6.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/vorbis-java-tika-0.6.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/xz-1.5.jar.sha1
      - copied unchanged from r1623225, lucene/dev/trunk/solr/licenses/xz-1.5.jar.sha1
Removed:
    lucene/dev/branches/branch_4x/lucene/licenses/commons-compress-1.7.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/aspectjrt-1.6.11.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/commons-compress-1.7.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/fontbox-1.8.4.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/isoparser-1.0-RC-1.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/jempbox-1.8.4.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/netcdf-4.2-min.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/netcdf-LICENSE-MIT.txt
    lucene/dev/branches/branch_4x/solr/licenses/netcdf-NOTICE.txt
    lucene/dev/branches/branch_4x/solr/licenses/pdfbox-1.8.4.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/poi-3.10.1.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/poi-ooxml-3.10.1.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/poi-ooxml-schemas-3.10.1.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/poi-scratchpad-3.10.1.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/rome-0.9.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/tika-core-1.5.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/tika-parsers-1.5.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/tika-xmp-1.5.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/vorbis-java-core-0.1.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/vorbis-java-tika-0.1.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/xz-1.4.jar.sha1
Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/ivy-versions.properties   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/licenses/   (props changed)
    lucene/dev/branches/branch_4x/solr/   (props changed)
    lucene/dev/branches/branch_4x/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/solr/NOTICE.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/solr/contrib/   (props changed)
    lucene/dev/branches/branch_4x/solr/contrib/extraction/ivy.xml
    lucene/dev/branches/branch_4x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
    lucene/dev/branches/branch_4x/solr/licenses/   (props changed)

Modified: lucene/dev/branches/branch_4x/lucene/ivy-versions.properties
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/ivy-versions.properties?rev=1623227&r1=1623226&r2=1623227&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/ivy-versions.properties (original)
+++ lucene/dev/branches/branch_4x/lucene/ivy-versions.properties Sun Sep  7 20:54:53 2014
@@ -34,8 +34,9 @@ com.google.inject.guice.version = 3.0
 /com.google.protobuf/protobuf-java = 2.5.0
 /com.googlecode.concurrentlinkedhashmap/concurrentlinkedhashmap-lru = 1.2
 /com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
-/com.googlecode.mp4parser/isoparser = 1.0-RC-1
+/com.googlecode.mp4parser/isoparser = 1.0.2
 /com.ibm.icu/icu4j = 53.1
+/com.pff/java-libpst = 0.8.1
 /com.spatial4j/spatial4j = 0.4.1
 
 com.sun.jersey.version = 1.9
@@ -64,7 +65,6 @@ com.sun.jersey.version = 1.9
 /commons-logging/commons-logging = 1.1.3
 /de.l3s.boilerpipe/boilerpipe = 1.1.0
 /dom4j/dom4j = 1.6.1
-/edu.ucar/netcdf = 4.2-min
 /hsqldb/hsqldb = 1.8.0.10
 /io.netty/netty = 3.7.0.Final
 /jakarta-regexp/jakarta-regexp = 1.4
@@ -81,11 +81,12 @@ com.sun.jersey.version = 1.9
 /net.arnx/jsonic = 1.2.7
 /net.sf.saxon/Saxon-HE = 9.5.1-4
 /net.sourceforge.argparse4j/argparse4j = 0.4.3
+/net.sourceforge.jmatio/jmatio = 1.0
 /net.sourceforge.nekohtml/nekohtml = 1.9.17
 /org.antlr/antlr-runtime = 3.5
 /org.apache.ant/ant = 1.8.2
 /org.apache.avro/avro = 1.7.5
-/org.apache.commons/commons-compress = 1.7
+/org.apache.commons/commons-compress = 1.8.1
 /org.apache.derby/derby = 10.9.1.0
 
 org.apache.hadoop.version = 2.2.0
@@ -125,18 +126,18 @@ org.apache.james.apache.mime4j.version =
 /org.apache.mahout/mahout-math = 0.6
 /org.apache.mrunit/mrunit = 1.0.0
 
-org.apache.pdfbox.version = 1.8.4
+org.apache.pdfbox.version = 1.8.6
 /org.apache.pdfbox/fontbox = ${org.apache.pdfbox.version}
 /org.apache.pdfbox/jempbox = ${org.apache.pdfbox.version}
 /org.apache.pdfbox/pdfbox = ${org.apache.pdfbox.version}
 
-org.apache.poi.version = 3.10.1
+org.apache.poi.version = 3.11-beta2
 /org.apache.poi/poi = ${org.apache.poi.version}
 /org.apache.poi/poi-ooxml = ${org.apache.poi.version}
 /org.apache.poi/poi-ooxml-schemas = ${org.apache.poi.version}
 /org.apache.poi/poi-scratchpad = ${org.apache.poi.version}
 
-org.apache.tika.version = 1.5
+org.apache.tika.version = 1.6
 /org.apache.tika/tika-core = ${org.apache.tika.version}
 /org.apache.tika/tika-parsers = ${org.apache.tika.version}
 /org.apache.tika/tika-xmp = ${org.apache.tika.version}
@@ -152,7 +153,7 @@ org.apache.uima.version = 2.3.1
 /org.apache.velocity/velocity-tools = 2.0
 /org.apache.xmlbeans/xmlbeans = 2.6.0
 /org.apache.zookeeper/zookeeper = 3.4.6
-/org.aspectj/aspectjrt = 1.6.11
+/org.aspectj/aspectjrt = 1.8.0
 
 org.bouncycastle.version = 1.45
 /org.bouncycastle/bcmail-jdk15 = ${org.bouncycastle.version}
@@ -191,7 +192,7 @@ org.eclipse.jetty.version = 8.1.10.v2013
 /org.eclipse.jetty/jetty-webapp = ${org.eclipse.jetty.version}
 /org.eclipse.jetty/jetty-xml = ${org.eclipse.jetty.version}
 
-org.gagravarr.vorbis.java.version = 0.1
+org.gagravarr.vorbis.java.version = 0.6
 /org.gagravarr/vorbis-java-core = ${org.gagravarr.vorbis.java.version}
 /org.gagravarr/vorbis-java-tika = ${org.gagravarr.vorbis.java.version}
 
@@ -230,7 +231,7 @@ org.slf4j.version = 1.7.6
 /org.slf4j/slf4j-api = ${org.slf4j.version}
 /org.slf4j/slf4j-log4j12 = ${org.slf4j.version}
 
-/org.tukaani/xz = 1.4
+/org.tukaani/xz = 1.5
 /org.xerial.snappy/snappy-java = 1.0.5
-/rome/rome = 0.9
+/rome/rome = 1.0
 /xerces/xercesImpl = 2.9.1

Modified: lucene/dev/branches/branch_4x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/CHANGES.txt?rev=1623227&r1=1623226&r2=1623227&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/solr/CHANGES.txt Sun Sep  7 20:54:53 2014
@@ -23,7 +23,7 @@ Consult the LUCENE_CHANGES.txt file for 
 
 Versions of Major Components
 ---------------------
-Apache Tika 1.5 (with upgraded Apache POI 3.10.1)
+Apache Tika 1.6
 Carrot2 3.9.0
 Velocity 1.7 and Velocity Tools 2.0
 Apache UIMA 2.3.1
@@ -115,6 +115,12 @@ Other Changes
 * SOLR-5322: core discovery can fail w/NPE and no explanation if a non-readable directory exists
   (Said Chavkin, Erick Erickson)
 
+* SOLR-6488: Update to Apache Tika 1.6. This adds support for parsing Outlook PST and Matlab
+  MAT files. Parsing for NetCDF files was removed because of license issues; if you need support
+  for this format, download the parser JAR yourself and add it to the contrib/extraction/lib folder:
+  http://www.unidata.ucar.edu/software/thredds/current/netcdf-java/
+  (Uwe Schindler)
+
 
 ==================  4.10.0 =================
 

Modified: lucene/dev/branches/branch_4x/solr/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/NOTICE.txt?rev=1623227&r1=1623226&r2=1623227&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/NOTICE.txt (original)
+++ lucene/dev/branches/branch_4x/solr/NOTICE.txt Sun Sep  7 20:54:53 2014
@@ -73,6 +73,9 @@ License: Common Development and Distribu
 The HdfsDirectory and BlockDirectory were derived from
 the Apache Blur incubating project and are Apache License 2.0.
 
+ASM (Java bytecode manipulation and analysis framework): http://asm.ow2.org/
+Copyright (c) 2000-2005 INRIA, France Telecom
+
 =========================================================================
 ==  Apache Lucene Notice                                               ==
 =========================================================================
@@ -336,8 +339,6 @@ Copyright (c) 2003-2005, www.fontbox.org
 
 Copyright (c) 1995-2005 International Business Machines Corporation and others
 
-Copyright (c) 2000-2005 INRIA, France Telecom
-
 Copyright 2001-2005 (C) MetaStuff, Ltd. All Rights Reserved.
 
 Copyright 2004 Sun Microsystems, Inc. (Rome JAR)
@@ -350,6 +351,12 @@ Copyright 2012 Kohei Taketa juniversalch
 
 Lasse Collin and others, XZ for Java (http://tukaani.org/xz/java.html)
 
+java-libpst is a pure java library for the reading of Outlook PST and OST files.
+https://github.com/rjohnsondev/java-libpst
+
+JMatIO is a Java library to read, write, and manipulate Matlab binary MAT-files.
+http://www.sourceforge.net/projects/jmatio
+
 =========================================================================
 ==  Language Detection Notices                                         ==
 =========================================================================

Modified: lucene/dev/branches/branch_4x/solr/contrib/extraction/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/contrib/extraction/ivy.xml?rev=1623227&r1=1623226&r2=1623227&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/contrib/extraction/ivy.xml (original)
+++ lucene/dev/branches/branch_4x/solr/contrib/extraction/ivy.xml Sun Sep  7 20:54:53 2014
@@ -29,10 +29,10 @@
     <dependency org="org.apache.tika" name="tika-xmp" rev="${/org.apache.tika/tika-xmp}" conf="compile->*"/>
     <!-- Tika dependencies - see http://tika.apache.org/1.3/gettingstarted.html#Using_Tika_as_a_Maven_dependency -->
     <!-- When upgrading Tika, upgrade dependencies versions and add any new ones
-         (except slf4j-api, commons-codec, commons-logging, geronimo-stax-api_1.0_spec) -->
+         (except slf4j-api, commons-codec, commons-logging, commons-httpclient, geronimo-stax-api_1.0_spec, jcip-annotations, xml-apis, asm)
+         WARNING: Don't add netcdf / unidataCommon (partially LGPL code) -->
     <dependency org="org.gagravarr" name="vorbis-java-tika" rev="${/org.gagravarr/vorbis-java-tika}" conf="compile->*"/>
     <dependency org="org.gagravarr" name="vorbis-java-core" rev="${/org.gagravarr/vorbis-java-core}" conf="compile->*"/>
-    <dependency org="edu.ucar" name="netcdf" rev="${/edu.ucar/netcdf}" conf="compile->*"/>
     <dependency org="org.apache.james" name="apache-mime4j-core" rev="${/org.apache.james/apache-mime4j-core}" conf="compile->*"/>
     <dependency org="org.apache.james" name="apache-mime4j-dom" rev="${/org.apache.james/apache-mime4j-dom}" conf="compile->*"/>
     <dependency org="org.apache.commons" name="commons-compress" rev="${/org.apache.commons/commons-compress}" conf="compile->*"/>
@@ -46,7 +46,6 @@
     <dependency org="org.apache.poi" name="poi-ooxml" rev="${/org.apache.poi/poi-ooxml}" conf="compile->*"/>
     <dependency org="org.apache.poi" name="poi-ooxml-schemas" rev="${/org.apache.poi/poi-ooxml-schemas}" conf="compile->*"/>
     <dependency org="org.apache.xmlbeans" name="xmlbeans" rev="${/org.apache.xmlbeans/xmlbeans}" conf="compile->*"/>
-    <dependency org="dom4j" name="dom4j" rev="${/dom4j/dom4j}" conf="compile->*"/>
     <dependency org="org.ccil.cowan.tagsoup" name="tagsoup" rev="${/org.ccil.cowan.tagsoup/tagsoup}" conf="compile->*"/>
     <dependency org="com.googlecode.mp4parser" name="isoparser" rev="${/com.googlecode.mp4parser/isoparser}" conf="compile->*"/>
     <dependency org="org.aspectj" name="aspectjrt" rev="${/org.aspectj/aspectjrt}" conf="compile->*"/>
@@ -58,6 +57,8 @@
     <dependency org="org.tukaani" name="xz" rev="${/org.tukaani/xz}" conf="compile->*"/>
     <dependency org="com.adobe.xmp" name="xmpcore" rev="${/com.adobe.xmp/xmpcore}" conf="compile->*"/>
     <dependency org="com.uwyn" name="jhighlight" rev="${/com.uwyn/jhighlight}" conf="compile->*"/>
+    <dependency org="com.pff" name="java-libpst" rev="${/com.pff/java-libpst}" conf="compile->*"/>
+    <dependency org="net.sourceforge.jmatio" name="jmatio" rev="${/net.sourceforge.jmatio/jmatio}" conf="compile->*"/>
 
     <!-- Other ExtractingRequestHandler dependencies -->
     <dependency org="com.ibm.icu" name="icu4j" rev="${/com.ibm.icu/icu4j}" conf="compile->*"/>

Modified: lucene/dev/branches/branch_4x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java?rev=1623227&r1=1623226&r2=1623227&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java (original)
+++ lucene/dev/branches/branch_4x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java Sun Sep  7 20:54:53 2014
@@ -264,6 +264,7 @@ public class ExtractingRequestHandlerTes
               "fmap.language", "extractedLanguage",
               "literal.extractionLiteral", "one",
               "literal.extractionLiteral", "two",
+              "fmap.X-Parsed-By", "ignored_parser",
               "fmap.Last-Modified", "extractedDate"
       );
       // TODO: original author did not specify why an exception should be thrown... how to fix?
@@ -279,6 +280,7 @@ public class ExtractingRequestHandlerTes
             "literal.id", "three",
             "fmap.language", "extractedLanguage",
             "literal.extractionLiteral", "one",
+            "fmap.X-Parsed-By", "ignored_parser",
             "fmap.Last-Modified", "extractedDate"
     );
     assertU(commit());
@@ -297,6 +299,7 @@ public class ExtractingRequestHandlerTes
             "fmap.Author", "extractedAuthor",
             "literal.id", "one",
             "fmap.language", "extractedLanguage",
+            "fmap.X-Parsed-By", "ignored_parser",
             "fmap.content", "extractedContent",
             ExtractingParams.STREAM_TYPE, "text/plain"
     );
@@ -316,6 +319,7 @@ public class ExtractingRequestHandlerTes
             "fmap.Author", "extractedAuthor",
             "literal.id", "one",
             "fmap.language", "extractedLanguage",
+            "fmap.X-Parsed-By", "ignored_parser",
             "fmap.content", "extractedContent",
             ExtractingParams.RESOURCE_NAME, "extraction/version_control.txt"
     );