You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2013/02/20 19:03:12 UTC

svn commit: r1448325 - in /tika/trunk: ./ tika-app/src/main/java/org/apache/tika/cli/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/resources/test-documents/

Author: mikemccand
Date: Wed Feb 20 18:03:11 2013
New Revision: 1448325

URL: http://svn.apache.org/r1448325
Log:
TIKA-1074: log certain exceptions and continue

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testException1.doc   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testException2.doc   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1448325&r1=1448324&r2=1448325&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed Feb 20 18:03:11 2013
@@ -17,6 +17,11 @@ Release 1.4 Current Development
   * Mime Types: Definitions extended to optionally include Link (URL) and
     UTI, along with details for several common formats (TIKA-1012 / TIKA-1083)
 
+  * Exceptions when parsing OLE10 embedded documents, when parsing
+    summary information from Office documents, and when saving
+    embedded documents in TikaCLI are now logged instead
+    of aborting extraction (TIKA-1074)
+
 Release 1.3 - 01/19/2013
 
   * Mimetype definitions added for more common programming languages,

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1448325&r1=1448324&r2=1448325&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Wed Feb 20 18:03:11 2013
@@ -38,16 +38,17 @@ import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
-import java.util.Map;
 import java.util.Map.Entry;
+import java.util.Map;
 import java.util.Set;
-
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.TransformerConfigurationException;
 import javax.xml.transform.sax.SAXTransformerFactory;
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.log4j.BasicConfigurator;
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
@@ -89,7 +90,6 @@ import org.apache.tika.xmp.XMPMetadata;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
-
 import com.google.gson.Gson;
 
 /**
@@ -98,6 +98,8 @@ import com.google.gson.Gson;
 public class TikaCLI {
     private File extractDir = new File(".");
 
+    private static final Log logger = LogFactory.getLog(TikaCLI.class);
+
     public static void main(String[] args) throws Exception {
         BasicConfigurator.configure(
                 new WriterAppender(new SimpleLayout(), System.err));
@@ -719,23 +721,31 @@ public class TikaCLI {
             }
             System.out.println("Extracting '"+name+"' ("+contentType+") to " + outputFile);
 
-            FileOutputStream os = new FileOutputStream(outputFile);
+            FileOutputStream os = null;
 
-            if (inputStream instanceof TikaInputStream) {
-                TikaInputStream tin = (TikaInputStream) inputStream;
+            try {
+                os = new FileOutputStream(outputFile);
 
-                if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof DirectoryEntry) {
-                    POIFSFileSystem fs = new POIFSFileSystem();
-                    copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
-                    fs.writeFilesystem(os);
+                if (inputStream instanceof TikaInputStream) {
+                    TikaInputStream tin = (TikaInputStream) inputStream;
+
+                    if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof DirectoryEntry) {
+                        POIFSFileSystem fs = new POIFSFileSystem();
+                        copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
+                        fs.writeFilesystem(os);
+                    } else {
+                        IOUtils.copy(inputStream, os);
+                    }
                 } else {
                     IOUtils.copy(inputStream, os);
                 }
-            } else {
-                IOUtils.copy(inputStream, os);
+            } catch (Throwable t) {
+                logger.warn("Ignoring unexpected exception trying to save embedded file " + name, t);
+            } finally {
+                if (os != null) {
+                    os.close();
+                }
             }
-
-            os.close();
         }
 
         protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1448325&r1=1448324&r2=1448325&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Wed Feb 20 18:03:11 2013
@@ -19,6 +19,8 @@ package org.apache.tika.parser.microsoft
 import java.io.FileNotFoundException;
 import java.io.IOException;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
@@ -48,6 +50,7 @@ abstract class AbstractPOIFSExtractor {
     private TikaConfig tikaConfig;
     private MimeTypes mimeTypes;
     private Detector detector;
+    private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class);
 
     protected AbstractPOIFSExtractor(ParseContext context) {
         EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
@@ -152,6 +155,8 @@ abstract class AbstractPOIFSExtractor {
                     embedded = TikaInputStream.get(data);
                 } catch (Ole10NativeException ex) {
                     // Not a valid OLE10Native record, skip it
+                } catch (Throwable t) {
+                    logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), t);
                 }
             } else if (type == POIFSDocumentType.COMP_OBJ) {
                 try {

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java?rev=1448325&r1=1448324&r2=1448325&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java Wed Feb 20 18:03:11 2013
@@ -20,6 +20,8 @@ import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.Date;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.poi.hpsf.CustomProperties;
 import org.apache.poi.hpsf.DocumentSummaryInformation;
 import org.apache.poi.hpsf.MarkUnsupportedException;
@@ -46,6 +48,8 @@ import org.apache.tika.metadata.TikaCore
  */
 class SummaryExtractor {
 
+    private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class);
+
     private static final String SUMMARY_INFORMATION =
         SummaryInformation.DEFAULT_STREAM_NAME;
 
@@ -91,6 +95,8 @@ class SummaryExtractor {
             throw new TikaException("Unexpected HPSF document", e);
         } catch (MarkUnsupportedException e) {
             throw new TikaException("Invalid DocumentInputStream", e);
+        } catch (Throwable t) {
+            logger.warn("Ignoring unexpected exception while parsing summary entry " + entryName, t);
         }
     }
 

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1448325&r1=1448324&r2=1448325&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Wed Feb 20 18:03:11 2013
@@ -19,6 +19,8 @@ package org.apache.tika.parser.microsoft
 import java.io.InputStream;
 import java.util.Locale;
 
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
@@ -314,4 +316,18 @@ public class WordParserTest extends Tika
        assertEquals("MyStringValue",        metadata.get("custom:MyCustomString"));
        assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate"));
     }
+
+    public void testExceptions1() throws Exception {
+      XMLResult xml;
+      Level logLevelStart = Logger.getRootLogger().getLevel();
+      Logger.getRootLogger().setLevel(Level.ERROR);
+      try {
+        xml = getXML("testException1.doc");
+        assertContains("total population", xml.xml);
+        xml = getXML("testException2.doc");
+        assertContains("electric charge", xml.xml);
+      } finally {
+        Logger.getRootLogger().setLevel(logLevelStart);
+      }
+    }
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testException1.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testException1.doc?rev=1448325&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testException1.doc
------------------------------------------------------------------------------
    svn:executable = *

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testException1.doc
------------------------------------------------------------------------------
    svn:mime-type = application/msword

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testException2.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testException2.doc?rev=1448325&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testException2.doc
------------------------------------------------------------------------------
    svn:executable = *

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testException2.doc
------------------------------------------------------------------------------
    svn:mime-type = application/msword