You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2012/04/27 19:03:24 UTC
svn commit: r1331503 - in /tika/trunk:
tika-core/src/main/resources/org/apache/tika/mime/
tika-parsers/src/main/java/org/apache/tika/parser/iwork/
tika-parsers/src/test/java/org/apache/tika/parser/iwork/
tika-parsers/src/test/resources/test-documents/
Author: nick
Date: Fri Apr 27 17:03:23 2012
New Revision: 1331503
URL: http://svn.apache.org/viewvc?rev=1331503&view=rev
Log:
TIKA-903 Avoid breaking on Password Protected iWorks files. We can't parse them yet, though, as we don't know how the encryption works.
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesPwdProtected.pages (with props)
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1331503&r1=1331502&r2=1331503&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Fri Apr 27 17:03:23 2012
@@ -641,6 +641,10 @@
<sub-class-of type="application/vnd.apple.iwork" />
<glob pattern="*.numbers"/>
</mime-type>
+ <mime-type type="application/x-tika-iworks-protected">
+ <sub-class-of type="application/vnd.apple.iwork" />
+ <_comment>Password Protected iWorks File</_comment>
+ </mime-type>
<mime-type type="application/vnd.arastra.swi">
<glob pattern="*.swi"/>
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java?rev=1331503&r1=1331502&r2=1331503&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java Fri Apr 27 17:03:23 2012
@@ -26,6 +26,7 @@ import java.util.Set;
import javax.xml.namespace.QName;
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
@@ -71,7 +72,8 @@ public class IWorkPackageParser extends
public enum IWORKDocumentType {
KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation", MediaType.application("vnd.apple.keynote")),
NUMBERS("http://developer.apple.com/namespaces/ls", "document", MediaType.application("vnd.apple.numbers")),
- PAGES("http://developer.apple.com/namespaces/sl", "document", MediaType.application("vnd.apple.pages"));
+ PAGES("http://developer.apple.com/namespaces/sl", "document", MediaType.application("vnd.apple.pages")),
+ ENCRYPTED(null, null, MediaType.application("x-tika-iworks-protected"));
private final String namespace;
private final String part;
@@ -122,13 +124,26 @@ public class IWorkPackageParser extends
private static IWORKDocumentType detectType(InputStream stream) {
QName qname = new XmlRootExtractor().extractRootElement(stream);
- String uri = qname.getNamespaceURI();
- String local = qname.getLocalPart();
-
- for (IWORKDocumentType type : values()) {
- if(type.getNamespace().equals(uri) &&
- type.getPart().equals(local)) {
- return type;
+ if (qname != null) {
+ String uri = qname.getNamespaceURI();
+ String local = qname.getLocalPart();
+
+ for (IWORKDocumentType type : values()) {
+ if(type.getNamespace().equals(uri) &&
+ type.getPart().equals(local)) {
+ return type;
+ }
+ }
+ } else {
+ // There was a problem with extracting the root type
+ // Password Protected iWorks files are funny, but we can usually
+ // spot them because they encrypt part of the zip stream
+ try {
+ stream.read();
+ } catch(UnsupportedZipFeatureException e) {
+ // Compression field was likely encrypted
+ return ENCRYPTED;
+ } catch(Exception ignored) {
}
}
return null;
@@ -180,16 +195,22 @@ public class IWorkPackageParser extends
case PAGES:
contentHandler = new PagesContentHandler(xhtml, metadata);
break;
+ case ENCRYPTED:
+ // We can't do anything for the file right now
+ contentHandler = null;
+ break;
default:
throw new TikaException("Unhandled iWorks file " + type);
}
metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
xhtml.startDocument();
- context.getSAXParser().parse(
- new CloseShieldInputStream(entryStream),
- new OfflineContentHandler(contentHandler)
- );
+ if (contentHandler != null) {
+ context.getSAXParser().parse(
+ new CloseShieldInputStream(entryStream),
+ new OfflineContentHandler(contentHandler)
+ );
+ }
xhtml.endDocument();
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java?rev=1331503&r1=1331502&r2=1331503&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java Fri Apr 27 17:03:23 2012
@@ -139,4 +139,24 @@ public class IWorkParserTest extends Tes
assertTrue(content.contains("Try adding your own account transactions to this table."));
}
+ /**
+ * We don't currently support password protected Pages files, as
+ * we don't know how the encryption works (it's not regular Zip
+ * Encryption). See TIKA-903 for details
+ */
+ public void testParsePagesPasswordProtected() throws Exception {
+ // Document password is "tika", but we can't use that yet...
+ InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesPwdProtected.pages");
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+
+ iWorkParser.parse(input, handler, metadata, parseContext);
+
+ // Content will be empty
+ String content = handler.toString();
+ assertEquals("", content);
+
+ // Will have been identified as encrypted
+ assertEquals("application/x-tika-iworks-protected", metadata.get(Metadata.CONTENT_TYPE));
+ }
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesPwdProtected.pages
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesPwdProtected.pages?rev=1331503&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesPwdProtected.pages
------------------------------------------------------------------------------
svn:mime-type = application/zip