You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gs...@apache.org on 2010/05/10 16:36:54 UTC

svn commit: r942753 - in /lucene/dev/trunk/solr: ./ contrib/extraction/ contrib/extraction/lib/ contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ src/java/org/apache/solr/core/

Author: gsingers
Date: Mon May 10 14:36:54 2010
New Revision: 942753

URL: http://svn.apache.org/viewvc?rev=942753&view=rev
Log:
SOLR-1902: fix Tika extraction issue

Added:
    lucene/dev/trunk/solr/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar   (with props)
    lucene/dev/trunk/solr/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar   (with props)
Removed:
    lucene/dev/trunk/solr/contrib/extraction/lib/tika-core-0.7.jar
    lucene/dev/trunk/solr/contrib/extraction/lib/tika-parsers-0.7.jar
Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt
    lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
    lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
    lucene/dev/trunk/solr/src/java/org/apache/solr/core/SolrResourceLoader.java

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=942753&r1=942752&r2=942753&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Mon May 10 14:36:54 2010
@@ -285,6 +285,8 @@ Bug Fixes
 * SOLR-1706: fixed WordDelimiterFilter for certain combinations of options
   where it would output incorrect tokens. (Robert Muir, Chris Male)
 
+* SOLR-1902: Exposed SolrResourceLoader's class loader for use by Tika. (gsingers)
+
 Other Changes
 ----------------------
 

Modified: lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt?rev=942753&r1=942752&r2=942753&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt Mon May 10 14:36:54 2010
@@ -17,21 +17,23 @@ You will need Solr up and running.  Then
 to your Solr Home lib directory.  See http://wiki.apache.org/solr/ExtractingRequestHandler for more details on hooking it in
  and configuring.
 
+ Tika Dependency
+ ---------------
+
+Current Version: Tika 0.8-SNAPSHOT (rev 942725)
+
 $Id:$
 
 ================== Release 1.5-dev ==================
 
 
-* SOLR-1567: Upgrade to Tika 0.5, which upgrades many of the underlying libraries (PDFBox, for example) too (gsingers)
 
 * SOLR-1756: The date.format setting causes ClassCastException when enabled and the config code that
   parses this setting does not properly use the same iterator instance. (Christoph Brill, Mark Miller)
 
-* SOLR-1738: Upgrade to Tika 0.6 (gsingers)
-
 * SOLR-18913: Add ICU4j to libs and add tests for Arabic extraction (Robert Muir via gsingers)
 
-* SOLR-1819: Upgraded to Tika 0.7 (gsingers)
+* SOLR-1902: Upgraded to Tika 0.8-SNAPSHOT to incorporate passing in Solr's custom ClassLoader (gsingers)
 
 ================== Release 1.4.0 ==================
 

Added: lucene/dev/trunk/solr/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar?rev=942753&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/solr/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/trunk/solr/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar?rev=942753&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/solr/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=942753&r1=942752&r2=942753&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Mon May 10 14:36:54 2010
@@ -37,6 +37,7 @@ import org.apache.tika.sax.xpath.Matcher
 import org.apache.tika.sax.xpath.MatchingContentHandler;
 import org.apache.tika.sax.xpath.XPathParser;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MediaType;
 import org.apache.xml.serialize.OutputFormat;
 import org.apache.xml.serialize.BaseMarkupSerializer;
 import org.apache.xml.serialize.XMLSerializer;
@@ -134,7 +135,8 @@ public class ExtractingDocumentLoader ex
     String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
     if (streamType != null) {
       //Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
-      parser = config.getParser(streamType.trim().toLowerCase());
+      MediaType mt = MediaType.parse(streamType.trim().toLowerCase());
+      parser = config.getParser(mt);
     } else {
       parser = autoDetectParser;
     }

Modified: lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java?rev=942753&r1=942752&r2=942753&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java Mon May 10 14:36:54 2010
@@ -29,10 +29,12 @@ import org.apache.solr.handler.ContentSt
 import org.apache.solr.handler.ContentStreamLoader;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MimeTypeException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.File;
+import java.io.IOException;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -77,8 +79,6 @@ public class ExtractingRequestHandler ex
         } catch (Exception e) {
           throw new SolrException(ErrorCode.SERVER_ERROR, e);
         }
-      } else {
-        config = TikaConfig.getDefaultConfig();
       }
       NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
       if (configDateFormats != null && configDateFormats.size() > 0) {
@@ -90,12 +90,23 @@ public class ExtractingRequestHandler ex
           dateFormats.add(format);
         }
       }
-    } else {
-      config = TikaConfig.getDefaultConfig();
+    }
+    if (config == null) {
+      try {
+        config = getDefaultConfig(core.getResourceLoader().getClassLoader());
+      } catch (MimeTypeException e) {
+        throw new SolrException(ErrorCode.SERVER_ERROR, e);
+      } catch (IOException e) {
+        throw new SolrException(ErrorCode.SERVER_ERROR, e);
+      }
     }
     factory = createFactory();
   }
 
+  private TikaConfig getDefaultConfig(ClassLoader classLoader) throws MimeTypeException, IOException {
+    return new TikaConfig(classLoader);
+  }
+
   protected SolrContentHandlerFactory createFactory() {
     return new SolrContentHandlerFactory(dateFormats);
   }

Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/core/SolrResourceLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/core/SolrResourceLoader.java?rev=942753&r1=942752&r2=942753&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/core/SolrResourceLoader.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/core/SolrResourceLoader.java Mon May 10 14:36:54 2010
@@ -214,6 +214,16 @@ public class SolrResourceLoader implemen
     return coreProperties;
   }
 
+  /**
+   * EXPERT
+   * <p>
+   * The underlying class loader.  Most applications will not need to use this.
+   * @return The {@link ClassLoader}
+   */
+  public ClassLoader getClassLoader() {
+    return classLoader;
+  }
+
   /** Opens a schema resource by its name.
    * Override this method to customize loading schema resources.
    *@return the stream for the named schema