You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by gs...@apache.org on 2010/08/05 15:45:58 UTC
svn commit: r982617 - in /lucene/solr/branches/branch-1.4: ./
contrib/extraction/lib/
contrib/extraction/src/main/java/org/apache/solr/handler/extraction/
src/java/org/apache/solr/core/
Author: gsingers
Date: Thu Aug 5 13:45:56 2010
New Revision: 982617
URL: http://svn.apache.org/viewvc?rev=982617&view=rev
Log:
SOLR-1902: upgrade Tika
Added:
lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcmail-jdk15-1.45.jar (with props)
lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcprov-jdk15-1.45.jar (with props)
lucene/solr/branches/branch-1.4/contrib/extraction/lib/fontbox-1.1.0.jar (with props)
lucene/solr/branches/branch-1.4/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.1.jar (with props)
lucene/solr/branches/branch-1.4/contrib/extraction/lib/icu4j-4_2_1.jar (with props)
lucene/solr/branches/branch-1.4/contrib/extraction/lib/jempbox-1.1.0.jar (with props)
lucene/solr/branches/branch-1.4/contrib/extraction/lib/metadata-extractor-2.4.0-beta-1.jar (with props)
lucene/solr/branches/branch-1.4/contrib/extraction/lib/pdfbox-1.1.0.jar (with props)
lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-3.6.jar (with props)
lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-3.6.jar (with props)
lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-schemas-3.6.jar (with props)
lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-scratchpad-3.6.jar (with props)
lucene/solr/branches/branch-1.4/contrib/extraction/lib/tagsoup-1.2.jar (with props)
lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar (with props)
lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar (with props)
Removed:
lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcmail-jdk14-136.jar
lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcprov-jdk14-136.jar
lucene/solr/branches/branch-1.4/contrib/extraction/lib/fontbox-0.1.0.jar
lucene/solr/branches/branch-1.4/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.jar
lucene/solr/branches/branch-1.4/contrib/extraction/lib/icu4j-3.8.jar
lucene/solr/branches/branch-1.4/contrib/extraction/lib/jempbox-0.2.0.jar
lucene/solr/branches/branch-1.4/contrib/extraction/lib/pdfbox-0.7.3.jar
lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-3.5-beta6.jar
lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-3.5-beta6.jar
lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-scratchpad-3.5-beta6.jar
lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-core-0.4.jar
lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-parsers-0.4.jar
Modified:
lucene/solr/branches/branch-1.4/CHANGES.txt
lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
lucene/solr/branches/branch-1.4/src/java/org/apache/solr/core/SolrResourceLoader.java
Modified: lucene/solr/branches/branch-1.4/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/CHANGES.txt?rev=982617&r1=982616&r2=982617&view=diff
==============================================================================
--- lucene/solr/branches/branch-1.4/CHANGES.txt (original)
+++ lucene/solr/branches/branch-1.4/CHANGES.txt Thu Aug 5 13:45:56 2010
@@ -25,6 +25,32 @@ default Locale, or Charset. It is recom
you set the following system properties:
-Duser.language=en -Duser.country=US
+================== 1.4.2-dev ==================
+Upgrading from Solr 1.4
+-----------------------
+
+This is a bug fix release - no changes are required when upgrading from Solr 1.4.
+However, a reindex is needed for some of the analysis fixes to take effect.
+
+Versions of Major Components
+----------------------------
+Apache Lucene 2.9.3
+Apache Tika 0.8-SNAPSHOT
+Carrot2 3.1.0
+
+Lucene Information
+----------------
+
+Since Solr is built on top of Lucene, many people add customizations to Solr
+that are dependent on Lucene. Please see http://lucene.apache.org/java/2_9_3/,
+especially http://lucene.apache.org/java/2_9_3/changes/Changes.html for more
+information on the version of Lucene used in Solr.
+
+Bug Fixes
+----------------------
+
+* SOLR-1902: Upgraded Tika to 0.8-SNAPSHOT (Tommaso Teofili, gsingers)
+
================== Release 1.4.1 ==================
Release Date: See http://lucene.apache.org/solr for the official release date.
Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcmail-jdk15-1.45.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcmail-jdk15-1.45.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcmail-jdk15-1.45.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcprov-jdk15-1.45.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcprov-jdk15-1.45.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/bcprov-jdk15-1.45.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/fontbox-1.1.0.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/fontbox-1.1.0.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/fontbox-1.1.0.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.1.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/icu4j-4_2_1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/icu4j-4_2_1.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/icu4j-4_2_1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/jempbox-1.1.0.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/jempbox-1.1.0.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/jempbox-1.1.0.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/metadata-extractor-2.4.0-beta-1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/metadata-extractor-2.4.0-beta-1.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/metadata-extractor-2.4.0-beta-1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/pdfbox-1.1.0.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/pdfbox-1.1.0.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/pdfbox-1.1.0.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-3.6.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-3.6.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-3.6.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-3.6.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-3.6.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-3.6.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-schemas-3.6.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-schemas-3.6.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-ooxml-schemas-3.6.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-scratchpad-3.6.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-scratchpad-3.6.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/poi-scratchpad-3.6.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/tagsoup-1.2.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/tagsoup-1.2.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/tagsoup-1.2.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar?rev=982617&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/branch-1.4/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=982617&r1=982616&r2=982617&view=diff
==============================================================================
--- lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Thu Aug 5 13:45:56 2010
@@ -43,6 +43,8 @@ import org.apache.xml.serialize.XMLSeria
import org.apache.xml.serialize.TextSerializer;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.apache.tika.mime.MediaType;
+
import java.io.IOException;
import java.io.InputStream;
@@ -134,7 +136,9 @@ public class ExtractingDocumentLoader ex
String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
if (streamType != null) {
//Cache? Parsers are lightweight to construct and thread-safe, so I'm told
- parser = config.getParser(streamType.trim().toLowerCase());
+ MediaType mt = MediaType.parse(streamType.trim().toLowerCase());
+ parser = config.getParser(mt);
+
} else {
parser = autoDetectParser;
}
Modified: lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java?rev=982617&r1=982616&r2=982617&view=diff
==============================================================================
--- lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java (original)
+++ lucene/solr/branches/branch-1.4/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java Thu Aug 5 13:45:56 2010
@@ -16,7 +16,6 @@
*/
package org.apache.solr.handler.extraction;
-
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.util.DateUtil;
@@ -31,15 +30,15 @@ import org.apache.tika.config.TikaConfig
import org.apache.tika.exception.TikaException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
+import java.io.IOException;
+import org.apache.tika.mime.MimeTypeException;
import java.io.File;
import java.util.Collection;
import java.util.HashSet;
-
/**
- * Handler for rich documents like PDF or Word or any other file format that Tika handles that need the text to be extracted
- * first from the document.
+ * Handler for rich documents like PDF or Word or any other file format that Tika handles that need
+ * the text to be extracted first from the document.
* <p/>
*/
public class ExtractingRequestHandler extends ContentStreamHandlerBase implements SolrCoreAware {
@@ -47,14 +46,14 @@ public class ExtractingRequestHandler ex
private transient static Logger log = LoggerFactory.getLogger(ExtractingRequestHandler.class);
public static final String CONFIG_LOCATION = "tika.config";
+
public static final String DATE_FORMATS = "date.formats";
protected TikaConfig config;
-
protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
- protected SolrContentHandlerFactory factory;
+ protected SolrContentHandlerFactory factory;
@Override
public void init(NamedList args) {
@@ -63,7 +62,7 @@ public class ExtractingRequestHandler ex
public void inform(SolrCore core) {
if (initArgs != null) {
- //if relative,then relative to config dir, otherwise, absolute path
+ // if relative, then relative to config dir, otherwise, absolute path
String tikaConfigLoc = (String) initArgs.get(CONFIG_LOCATION);
if (tikaConfigLoc != null) {
File configFile = new File(tikaConfigLoc);
@@ -75,12 +74,6 @@ public class ExtractingRequestHandler ex
} catch (Exception e) {
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
- } else {
- try {
- config = TikaConfig.getDefaultConfig();
- } catch (TikaException e) {
- throw new SolrException(ErrorCode.SERVER_ERROR, e);
- }
}
NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
if (configDateFormats != null && configDateFormats.size() > 0) {
@@ -91,12 +84,16 @@ public class ExtractingRequestHandler ex
dateFormats.add(format);
}
}
- } else {
+ }
+ if (config == null) {
try {
- config = TikaConfig.getDefaultConfig();
- } catch (TikaException e) {
+ config = getDefaultConfig(core.getResourceLoader().getClassLoader());
+ } catch (MimeTypeException e) {
+ throw new SolrException(ErrorCode.SERVER_ERROR, e);
+ } catch (IOException e) {
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
+
}
factory = createFactory();
}
@@ -105,7 +102,6 @@ public class ExtractingRequestHandler ex
return new SolrContentHandlerFactory(dateFormats);
}
-
protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
return new ExtractingDocumentLoader(req, processor, config, factory);
}
@@ -130,6 +126,9 @@ public class ExtractingRequestHandler ex
public String getSource() {
return "$URL:$";
}
-}
-
+
+ private TikaConfig getDefaultConfig(ClassLoader classLoader) throws MimeTypeException, IOException {
+ return new TikaConfig(classLoader);
+ }
+}
Modified: lucene/solr/branches/branch-1.4/src/java/org/apache/solr/core/SolrResourceLoader.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/src/java/org/apache/solr/core/SolrResourceLoader.java?rev=982617&r1=982616&r2=982617&view=diff
==============================================================================
--- lucene/solr/branches/branch-1.4/src/java/org/apache/solr/core/SolrResourceLoader.java (original)
+++ lucene/solr/branches/branch-1.4/src/java/org/apache/solr/core/SolrResourceLoader.java Thu Aug 5 13:45:56 2010
@@ -661,4 +661,13 @@ public class SolrResourceLoader implemen
}
throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, builder.toString() );
}
+ /**
+ * EXPERT
+ * <p/>
+ * The underlying class loader. Most applications will not need to use this.
+ * @return The {@link ClassLoader}
+ */
+ public ClassLoader getClassLoader() {
+ return classLoader;
+ }
}