You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ko...@apache.org on 2011/05/14 17:08:58 UTC

svn commit: r1103124 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/backwards/ solr/ solr/contrib/extraction/ solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ solr/contrib/extraction/src/test/java/org/apache/solr/handle...

Author: koji
Date: Sat May 14 15:08:57 2011
New Revision: 1103124

URL: http://svn.apache.org/viewvc?rev=1103124&view=rev
Log:
SOLR-2480: add ignoreTikaException flag

Added:
    lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/
      - copied from r1103120, lucene/dev/trunk/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/
    lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
      - copied unchanged from r1103120, lucene/dev/trunk/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
    lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/resources/password-is-solrcell.docx
      - copied unchanged from r1103120, lucene/dev/trunk/solr/contrib/extraction/src/test/resources/password-is-solrcell.docx
Removed:
    lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/backwards/   (props changed)
    lucene/dev/branches/branch_3x/solr/   (props changed)
    lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt
    lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
    lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java

Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt?rev=1103124&r1=1103123&r2=1103124&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt Sat May 14 15:08:57 2011
@@ -22,11 +22,12 @@ to your Solr Home lib directory.  See ht
 
 Current Version: Tika 0.8 (released 11/07/2010)
 
-$Id:$
+$Id$
 
 ================== Release 3.2-dev ==================
 
-(No Changes)
+* SOLR-2480: Add ignoreTikaException flag so that users can ignore a TikaException and still
+  index the document's metadata. (Shinichiro Abe, koji)
 
 ================== Release 3.1-dev ==================
 

Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1103124&r1=1103123&r2=1103124&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Sat May 14 15:08:57 2011
@@ -16,20 +16,27 @@
  */
 package org.apache.solr.handler.extraction;
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.Locale;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.params.UpdateParams;
 import org.apache.solr.common.util.ContentStream;
 import org.apache.solr.common.util.NamedList;
+import org.apache.solr.handler.ContentStreamLoader;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.update.AddUpdateCommand;
 import org.apache.solr.update.processor.UpdateRequestProcessor;
-import org.apache.solr.handler.ContentStreamLoader;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -38,25 +45,28 @@ import org.apache.tika.sax.xpath.Matcher
 import org.apache.tika.sax.xpath.MatchingContentHandler;
 import org.apache.tika.sax.xpath.XPathParser;
 import org.apache.tika.exception.TikaException;
-import org.apache.xml.serialize.OutputFormat;
 import org.apache.xml.serialize.BaseMarkupSerializer;
-import org.apache.xml.serialize.XMLSerializer;
+import org.apache.xml.serialize.OutputFormat;
 import org.apache.xml.serialize.TextSerializer;
+import org.apache.xml.serialize.XMLSerializer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.apache.tika.mime.MediaType;
 
-
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringWriter;
 
-
 /**
  * The class responsible for loading extracted content into Solr.
  *
  **/
 public class ExtractingDocumentLoader extends ContentStreamLoader {
+
+  private static final Logger log = LoggerFactory.getLogger(ExtractingDocumentLoader.class);
+
   /**
    * Extract Only supported format
    */
@@ -74,6 +84,7 @@ public class ExtractingDocumentLoader ex
   final IndexSchema schema;
   final SolrParams params;
   final UpdateRequestProcessor processor;
+  final boolean ignoreTikaException;
   protected AutoDetectParser autoDetectParser;
 
   private final AddUpdateCommand templateAdd;
@@ -106,6 +117,8 @@ public class ExtractingDocumentLoader ex
     //this is lightweight
     autoDetectParser = new AutoDetectParser(config);
     this.factory = factory;
+    
+    ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
   }
 
 
@@ -191,9 +204,17 @@ public class ExtractingDocumentLoader ex
           parsingHandler = new MatchingContentHandler(handler, matcher);
         } //else leave it as is
 
-        //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
-        ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
-        parser.parse(inputStream, parsingHandler, metadata, context);
+        try{
+          //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
+          ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+          parser.parse(inputStream, parsingHandler, metadata, context);
+        } catch (TikaException e) {
+          if(ignoreTikaException)
+            log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage())
+                .append(". metadata=").append(metadata.toString()).toString());
+          else
+            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+        }
         if (extractOnly == false) {
           addDoc(handler);
         } else {
@@ -213,8 +234,6 @@ public class ExtractingDocumentLoader ex
         }
       } catch (SAXException e) {
         throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
-      } catch (TikaException e) {
-        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
       } finally {
         IOUtils.closeQuietly(inputStream);
       }

Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java?rev=1103124&r1=1103123&r2=1103124&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java Sat May 14 15:08:57 2011
@@ -28,6 +28,11 @@ public interface ExtractingParams {
    */
   public static final String LOWERNAMES = "lowernames";
 
+  /**
+   * If true, ignore TikaException (give up on extracting text, but still index the metadata).
+   */
+  public static final String IGNORE_TIKA_EXCEPTION = "ignoreTikaException";
+
 
   /**
    * The param prefix for mapping Tika metadata to Solr fields.