You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ko...@apache.org on 2011/05/14 17:08:58 UTC
svn commit: r1103124 - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/backwards/ solr/ solr/contrib/extraction/
solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/
solr/contrib/extraction/src/test/java/org/apache/solr/handle...
Author: koji
Date: Sat May 14 15:08:57 2011
New Revision: 1103124
URL: http://svn.apache.org/viewvc?rev=1103124&view=rev
Log:
SOLR-2480: add ignoreTikaException flag
Added:
lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/
- copied from r1103120, lucene/dev/trunk/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/
lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
- copied unchanged from r1103120, lucene/dev/trunk/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/resources/password-is-solrcell.docx
- copied unchanged from r1103120, lucene/dev/trunk/solr/contrib/extraction/src/test/resources/password-is-solrcell.docx
Removed:
lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/backwards/ (props changed)
lucene/dev/branches/branch_3x/solr/ (props changed)
lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt
lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt?rev=1103124&r1=1103123&r2=1103124&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt Sat May 14 15:08:57 2011
@@ -22,11 +22,12 @@ to your Solr Home lib directory. See ht
Current Version: Tika 0.8 (released 11/07/2010)
-$Id:$
+$Id$
================== Release 3.2-dev ==================
-(No Changes)
+* SOLR-2480: Add ignoreTikaException flag so that users can ignore a TikaException and still
+ index the metadata. (Shinichiro Abe, koji)
================== Release 3.1-dev ==================
Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1103124&r1=1103123&r2=1103124&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Sat May 14 15:08:57 2011
@@ -16,20 +16,27 @@
*/
package org.apache.solr.handler.extraction;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.Locale;
+
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.NamedList;
+import org.apache.solr.handler.ContentStreamLoader;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.UpdateRequestProcessor;
-import org.apache.solr.handler.ContentStreamLoader;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -38,25 +45,28 @@ import org.apache.tika.sax.xpath.Matcher
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.apache.tika.exception.TikaException;
-import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.BaseMarkupSerializer;
-import org.apache.xml.serialize.XMLSerializer;
+import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.TextSerializer;
+import org.apache.xml.serialize.XMLSerializer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.mime.MediaType;
-
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
-
/**
* The class responsible for loading extracted content into Solr.
*
**/
public class ExtractingDocumentLoader extends ContentStreamLoader {
+
+ private static final Logger log = LoggerFactory.getLogger(ExtractingDocumentLoader.class);
+
/**
* Extract Only supported format
*/
@@ -74,6 +84,7 @@ public class ExtractingDocumentLoader ex
final IndexSchema schema;
final SolrParams params;
final UpdateRequestProcessor processor;
+ final boolean ignoreTikaException;
protected AutoDetectParser autoDetectParser;
private final AddUpdateCommand templateAdd;
@@ -106,6 +117,8 @@ public class ExtractingDocumentLoader ex
//this is lightweight
autoDetectParser = new AutoDetectParser(config);
this.factory = factory;
+
+ ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
}
@@ -191,9 +204,17 @@ public class ExtractingDocumentLoader ex
parsingHandler = new MatchingContentHandler(handler, matcher);
} //else leave it as is
- //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
- ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
- parser.parse(inputStream, parsingHandler, metadata, context);
+ try{
+ //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
+ ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+ parser.parse(inputStream, parsingHandler, metadata, context);
+ } catch (TikaException e) {
+ if(ignoreTikaException)
+ log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage())
+ .append(". metadata=").append(metadata.toString()).toString());
+ else
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+ }
if (extractOnly == false) {
addDoc(handler);
} else {
@@ -213,8 +234,6 @@ public class ExtractingDocumentLoader ex
}
} catch (SAXException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
- } catch (TikaException e) {
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
} finally {
IOUtils.closeQuietly(inputStream);
}
Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java?rev=1103124&r1=1103123&r2=1103124&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java Sat May 14 15:08:57 2011
@@ -28,6 +28,11 @@ public interface ExtractingParams {
*/
public static final String LOWERNAMES = "lowernames";
+ /**
+ * If true, ignore TikaException (give up on extracting the text but still index the metadata).
+ */
+ public static final String IGNORE_TIKA_EXCEPTION = "ignoreTikaException";
+
/**
* The param prefix for mapping Tika metadata to Solr fields.