You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2013/02/20 19:03:12 UTC
svn commit: r1448325 - in /tika/trunk: ./
tika-app/src/main/java/org/apache/tika/cli/
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/
tika-parsers/src/test/java/org/apache/tika/parser/microsoft/
tika-parsers/src/test/resources/test-documents/
Author: mikemccand
Date: Wed Feb 20 18:03:11 2013
New Revision: 1448325
URL: http://svn.apache.org/r1448325
Log:
TIKA-1074: log certain exceptions and continue
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testException1.doc (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testException2.doc (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1448325&r1=1448324&r2=1448325&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed Feb 20 18:03:11 2013
@@ -17,6 +17,11 @@ Release 1.4 Current Development
* Mime Types: Definitions extended to optionally include Link (URL) and
UTI, along with details for several common formats (TIKA-1012 / TIKA-1083)
+ * Exceptions when parsing OLE10 embedded documents, when parsing
+ summary information from Office documents, and when saving
+ embedded documents in TikaCLI are now logged instead
+ of aborting extraction (TIKA-1074)
+
Release 1.3 - 01/19/2013
* Mimetype definitions added for more common programming languages,
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1448325&r1=1448324&r2=1448325&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Wed Feb 20 18:03:11 2013
@@ -38,16 +38,17 @@ import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
-import java.util.Map;
import java.util.Map.Entry;
+import java.util.Map;
import java.util.Set;
-
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
@@ -89,7 +90,6 @@ import org.apache.tika.xmp.XMPMetadata;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-
import com.google.gson.Gson;
/**
@@ -98,6 +98,8 @@ import com.google.gson.Gson;
public class TikaCLI {
private File extractDir = new File(".");
+ private static final Log logger = LogFactory.getLog(TikaCLI.class);
+
public static void main(String[] args) throws Exception {
BasicConfigurator.configure(
new WriterAppender(new SimpleLayout(), System.err));
@@ -719,23 +721,31 @@ public class TikaCLI {
}
System.out.println("Extracting '"+name+"' ("+contentType+") to " + outputFile);
- FileOutputStream os = new FileOutputStream(outputFile);
+ FileOutputStream os = null;
- if (inputStream instanceof TikaInputStream) {
- TikaInputStream tin = (TikaInputStream) inputStream;
+ try {
+ os = new FileOutputStream(outputFile);
- if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof DirectoryEntry) {
- POIFSFileSystem fs = new POIFSFileSystem();
- copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
- fs.writeFilesystem(os);
+ if (inputStream instanceof TikaInputStream) {
+ TikaInputStream tin = (TikaInputStream) inputStream;
+
+ if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof DirectoryEntry) {
+ POIFSFileSystem fs = new POIFSFileSystem();
+ copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
+ fs.writeFilesystem(os);
+ } else {
+ IOUtils.copy(inputStream, os);
+ }
} else {
IOUtils.copy(inputStream, os);
}
- } else {
- IOUtils.copy(inputStream, os);
+ } catch (Throwable t) {
+ logger.warn("Ignoring unexpected exception trying to save embedded file " + name, t);
+ } finally {
+ if (os != null) {
+ os.close();
+ }
}
-
- os.close();
}
protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1448325&r1=1448324&r2=1448325&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Wed Feb 20 18:03:11 2013
@@ -19,6 +19,8 @@ package org.apache.tika.parser.microsoft
import java.io.FileNotFoundException;
import java.io.IOException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
@@ -48,6 +50,7 @@ abstract class AbstractPOIFSExtractor {
private TikaConfig tikaConfig;
private MimeTypes mimeTypes;
private Detector detector;
+ private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class);
protected AbstractPOIFSExtractor(ParseContext context) {
EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
@@ -152,6 +155,8 @@ abstract class AbstractPOIFSExtractor {
embedded = TikaInputStream.get(data);
} catch (Ole10NativeException ex) {
// Not a valid OLE10Native record, skip it
+ } catch (Throwable t) {
+ logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), t);
}
} else if (type == POIFSDocumentType.COMP_OBJ) {
try {
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java?rev=1448325&r1=1448324&r2=1448325&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java Wed Feb 20 18:03:11 2013
@@ -20,6 +20,8 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Date;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.poi.hpsf.CustomProperties;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.MarkUnsupportedException;
@@ -46,6 +48,8 @@ import org.apache.tika.metadata.TikaCore
*/
class SummaryExtractor {
+ private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class);
+
private static final String SUMMARY_INFORMATION =
SummaryInformation.DEFAULT_STREAM_NAME;
@@ -91,6 +95,8 @@ class SummaryExtractor {
throw new TikaException("Unexpected HPSF document", e);
} catch (MarkUnsupportedException e) {
throw new TikaException("Invalid DocumentInputStream", e);
+ } catch (Throwable t) {
+ logger.warn("Ignoring unexpected exception while parsing summary entry " + entryName, t);
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1448325&r1=1448324&r2=1448325&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Wed Feb 20 18:03:11 2013
@@ -19,6 +19,8 @@ package org.apache.tika.parser.microsoft
import java.io.InputStream;
import java.util.Locale;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -314,4 +316,18 @@ public class WordParserTest extends Tika
assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate"));
}
+
+ public void testExceptions1() throws Exception {
+ XMLResult xml;
+ Level logLevelStart = Logger.getRootLogger().getLevel();
+ Logger.getRootLogger().setLevel(Level.ERROR);
+ try {
+ xml = getXML("testException1.doc");
+ assertContains("total population", xml.xml);
+ xml = getXML("testException2.doc");
+ assertContains("electric charge", xml.xml);
+ } finally {
+ Logger.getRootLogger().setLevel(logLevelStart);
+ }
+ }
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testException1.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testException1.doc?rev=1448325&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testException1.doc
------------------------------------------------------------------------------
svn:executable = *
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testException1.doc
------------------------------------------------------------------------------
svn:mime-type = application/msword
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testException2.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testException2.doc?rev=1448325&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testException2.doc
------------------------------------------------------------------------------
svn:executable = *
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testException2.doc
------------------------------------------------------------------------------
svn:mime-type = application/msword