You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by gs...@apache.org on 2010/08/05 15:45:58 UTC

svn commit: r982617 - in /lucene/solr/branches/branch-1.4: ./ contrib/extraction/lib/ contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ src/java/org/apache/solr/core/

Author: gsingers
Date: Thu Aug  5 13:45:56 2010
New Revision: 982617

URL: http://svn.apache.org/viewvc?rev=982617&view=rev
Log:
SOLR-1902: upgrade Tika

Added:
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcmail-jdk15-1.45.jar   (with props)
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcprov-jdk15-1.45.jar   (with props)
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/fontbox-1.1.0.jar   (with props)
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.1.jar   (with props)
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/icu4j-4_2_1.jar   (with props)
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/jempbox-1.1.0.jar   (with props)
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/metadata-extractor-2.4.0-beta-1.jar   (with props)
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/pdfbox-1.1.0.jar   (with props)
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-3.6.jar   (with props)
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-3.6.jar   (with props)
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-schemas-3.6.jar   (with props)
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-scratchpad-3.6.jar   (with props)
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/tagsoup-1.2.jar   (with props)
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar   (with props)
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar   (with props)
Removed:
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcmail-jdk14-136.jar
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcprov-jdk14-136.jar
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/fontbox-0.1.0.jar
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.jar
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/icu4j-3.8.jar
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/jempbox-0.2.0.jar
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/pdfbox-0.7.3.jar
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-3.5-beta6.jar
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-3.5-beta6.jar
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-scratchpad-3.5-beta6.jar
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-core-0.4.jar
    lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-parsers-0.4.jar
Modified:
    lucene/solr/branches/branch-1.4/CHANGES.txt
    lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
    lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
    lucene/solr/branches/branch-1.4/src/java/org/apache/solr/core/SolrResourceLoader.java

Modified: lucene/solr/branches/branch-1.4/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/CHANGES.txt?rev=982617&r1=982616&r2=982617&view=diff
==============================================================================
--- lucene/solr/branches/branch-1.4/CHANGES.txt (original)
+++ lucene/solr/branches/branch-1.4/CHANGES.txt Thu Aug  5 13:45:56 2010
@@ -25,6 +25,32 @@ default Locale, or Charset.  It is recom
 you set the following system properties:
   -Duser.language=en -Duser.country=US
 
+================== 1.4.2-dev ==================
+Upgrading from Solr 1.4
+-----------------------
+
+This is a bug fix release - no changes are required when upgrading from Solr 1.4.
+However, a reindex is needed for some of the analysis fixes to take effect.
+
+Versions of Major Components
+----------------------------
+Apache Lucene 2.9.3
+Apache Tika 0.8-SNAPSHOT
+Carrot2 3.1.0
+
+Lucene Information
+----------------
+
+Since Solr is built on top of Lucene, many people add customizations to Solr
+that are dependent on Lucene.  Please see http://lucene.apache.org/java/2_9_3/,
+especially http://lucene.apache.org/java/2_9_3/changes/Changes.html for more
+information on the version of Lucene used in Solr.
+
+Bug Fixes
+----------------------
+
+* SOLR-1902: Upgraded Tika to 0.8-SNAPSHOT (Tommaso Teofili, gsingers)
+
 ================== Release 1.4.1 ==================
 Release Date:  See http://lucene.apache.org/solr for the official release date.
 

Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcmail-jdk15-1.45.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcmail-jdk15-1.45.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcmail-jdk15-1.45.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcprov-jdk15-1.45.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcprov-jdk15-1.45.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcprov-jdk15-1.45.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/fontbox-1.1.0.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/fontbox-1.1.0.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/fontbox-1.1.0.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.1.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.1.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/icu4j-4_2_1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/icu4j-4_2_1.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/icu4j-4_2_1.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/jempbox-1.1.0.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/jempbox-1.1.0.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/jempbox-1.1.0.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/metadata-extractor-2.4.0-beta-1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/metadata-extractor-2.4.0-beta-1.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/metadata-extractor-2.4.0-beta-1.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/pdfbox-1.1.0.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/pdfbox-1.1.0.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/pdfbox-1.1.0.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-3.6.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-3.6.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-3.6.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-3.6.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-3.6.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-3.6.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-schemas-3.6.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-schemas-3.6.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-schemas-3.6.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-scratchpad-3.6.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-scratchpad-3.6.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-scratchpad-3.6.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/tagsoup-1.2.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/tagsoup-1.2.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/tagsoup-1.2.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=982617&r1=982616&r2=982617&view=diff
==============================================================================
--- lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Thu Aug  5 13:45:56 2010
@@ -43,6 +43,8 @@ import org.apache.xml.serialize.XMLSeria
 import org.apache.xml.serialize.TextSerializer;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
+import org.apache.tika.mime.MediaType;
+
 
 import java.io.IOException;
 import java.io.InputStream;
@@ -134,7 +136,9 @@ public class ExtractingDocumentLoader ex
     String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
     if (streamType != null) {
       //Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
-      parser = config.getParser(streamType.trim().toLowerCase());
+      MediaType mt = MediaType.parse(streamType.trim().toLowerCase());
+      parser = config.getParser(mt);
+
     } else {
       parser = autoDetectParser;
     }

Modified: lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java?rev=982617&r1=982616&r2=982617&view=diff
==============================================================================
--- lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java (original)
+++ lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java Thu Aug  5 13:45:56 2010
@@ -16,7 +16,6 @@
  */
 package org.apache.solr.handler.extraction;
 
-
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.util.DateUtil;
@@ -31,15 +30,15 @@ import org.apache.tika.config.TikaConfig
 import org.apache.tika.exception.TikaException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
+import java.io.IOException;
+import org.apache.tika.mime.MimeTypeException;
 import java.io.File;
 import java.util.Collection;
 import java.util.HashSet;
 
-
 /**
- * Handler for rich documents like PDF or Word or any other file format that Tika handles that need the text to be extracted
- * first from the document.
+ * Handler for rich documents like PDF or Word or any other file format that Tika handles that need
+ * the text to be extracted first from the document.
  * <p/>
  */
 public class ExtractingRequestHandler extends ContentStreamHandlerBase implements SolrCoreAware {
@@ -47,14 +46,14 @@ public class ExtractingRequestHandler ex
   private transient static Logger log = LoggerFactory.getLogger(ExtractingRequestHandler.class);
 
   public static final String CONFIG_LOCATION = "tika.config";
+
   public static final String DATE_FORMATS = "date.formats";
 
   protected TikaConfig config;
 
-
   protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
-  protected SolrContentHandlerFactory factory;
 
+  protected SolrContentHandlerFactory factory;
 
   @Override
   public void init(NamedList args) {
@@ -63,7 +62,7 @@ public class ExtractingRequestHandler ex
 
   public void inform(SolrCore core) {
     if (initArgs != null) {
-      //if relative,then relative to config dir, otherwise, absolute path
+      // if relative, then relative to config dir; otherwise, absolute path
       String tikaConfigLoc = (String) initArgs.get(CONFIG_LOCATION);
       if (tikaConfigLoc != null) {
         File configFile = new File(tikaConfigLoc);
@@ -75,12 +74,6 @@ public class ExtractingRequestHandler ex
         } catch (Exception e) {
           throw new SolrException(ErrorCode.SERVER_ERROR, e);
         }
-      } else {
-        try {
-          config = TikaConfig.getDefaultConfig();
-        } catch (TikaException e) {
-          throw new SolrException(ErrorCode.SERVER_ERROR, e);
-        }
       }
       NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
       if (configDateFormats != null && configDateFormats.size() > 0) {
@@ -91,12 +84,16 @@ public class ExtractingRequestHandler ex
           dateFormats.add(format);
         }
       }
-    } else {
+    }
+    if (config == null) {
       try {
-        config = TikaConfig.getDefaultConfig();
-      } catch (TikaException e) {
+        config = getDefaultConfig(core.getResourceLoader().getClassLoader());
+      } catch (MimeTypeException e) {
+        throw new SolrException(ErrorCode.SERVER_ERROR, e);
+      } catch (IOException e) {
         throw new SolrException(ErrorCode.SERVER_ERROR, e);
       }
+
     }
     factory = createFactory();
   }
@@ -105,7 +102,6 @@ public class ExtractingRequestHandler ex
     return new SolrContentHandlerFactory(dateFormats);
   }
 
-
   protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
     return new ExtractingDocumentLoader(req, processor, config, factory);
   }
@@ -130,6 +126,9 @@ public class ExtractingRequestHandler ex
   public String getSource() {
     return "$URL:$";
   }
-}
-
+  
+  private TikaConfig getDefaultConfig(ClassLoader classLoader) throws MimeTypeException, IOException {
+    return new TikaConfig(classLoader);
+  }
 
+}

Modified: lucene/solr/branches/branch-1.4/src/java/org/apache/solr/core/SolrResourceLoader.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/src/java/org/apache/solr/core/SolrResourceLoader.java?rev=982617&r1=982616&r2=982617&view=diff
==============================================================================
--- lucene/solr/branches/branch-1.4/src/java/org/apache/solr/core/SolrResourceLoader.java (original)
+++ lucene/solr/branches/branch-1.4/src/java/org/apache/solr/core/SolrResourceLoader.java Thu Aug  5 13:45:56 2010
@@ -661,4 +661,13 @@ public class SolrResourceLoader implemen
     }
     throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, builder.toString() );
   }
+  /**
+   * EXPERT
+   * <p/>
+   * The underlying class loader.  Most applications will not need to use this.
+   * @return The {@link ClassLoader}
+   */
+  public ClassLoader getClassLoader() {
+      return classLoader;
+    }
 }