You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/02/13 02:00:31 UTC
svn commit: r1659446 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/pdf/PDFParser.java
test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Author: tallison
Date: Fri Feb 13 01:00:31 2015
New Revision: 1659446
URL: http://svn.apache.org/r1659446
Log:
TIKA-1548: improve handling of encrypted PDFs when the wrong password is offered
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1659446&r1=1659445&r2=1659446&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Fri Feb 13 01:00:31 2015
@@ -33,12 +33,14 @@ import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
+import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.io.RandomAccess;
import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.io.RandomAccessFile;
-import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.pdfbox.pdmodel.font.PDFont;
+import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.io.CloseShieldInputStream;
@@ -133,11 +135,7 @@ public class PDFParser extends AbstractP
//if using the classic parser and the doc is encrypted, we must manually decrypt
if (! localConfig.getUseNonSequentialParser() && pdfDocument.isEncrypted()) {
- try {
- pdfDocument.decrypt(password);
- } catch (Exception e) {
- // Ignore
- }
+ pdfDocument.decrypt(password);
}
metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
@@ -146,6 +144,18 @@ public class PDFParser extends AbstractP
PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
}
+ } catch (CryptographyException e) {
+ //seq parser throws CryptographyException for bad password
+ throw new EncryptedDocumentException(e);
+ } catch (IOException e) {
+ //nonseq parser throws IOException for bad password
+ //At the Tika level, we want the same exception to be thrown
+ if (e.getMessage().contains("Error (CryptographyException)")) {
+ metadata.set("pdf:encrypted", Boolean.toString(true));
+ throw new EncryptedDocumentException(e);
+ }
+ //rethrow any other IOExceptions
+ throw e;
} finally {
if (pdfDocument != null) {
pdfDocument.close();
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1659446&r1=1659445&r2=1659446&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Fri Feb 13 01:00:31 2015
@@ -16,6 +16,11 @@
*/
package org.apache.tika.parser.pdf;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
@@ -27,6 +32,7 @@ import java.util.Map;
import java.util.Set;
import org.apache.tika.TikaTest;
+import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.DocumentSelector;
@@ -48,11 +54,6 @@ import org.apache.tika.sax.ContentHandle
import org.apache.tika.sax.ToXMLContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
/**
* Test case for parsing pdf files.
*/
@@ -203,6 +204,67 @@ public class PDFParserTest extends TikaT
assertContains("RETHINKING THE FINANCIAL NETWORK", content);
assertContains("On 16 November 2002", content);
assertContains("In many important respects", content);
+
+ //now test wrong password
+ handler = new BodyContentHandler();
+ metadata = new Metadata();
+ context = new ParseContext();
+ context.set(PasswordProvider.class, new PasswordProvider() {
+ public String getPassword(Metadata metadata) {
+ return "WRONG!!!!";
+ }
+ });
+
+ stream = PDFParserTest.class.getResourceAsStream(
+ "/test-documents/testPDF_protected.pdf");
+ boolean ex = false;
+ try {
+ parser.parse(stream, handler, metadata, context);
+ } catch (EncryptedDocumentException e) {
+ ex = true;
+ } finally {
+ stream.close();
+ }
+ content = handler.toString();
+
+ assertTrue("encryption exception", ex);
+ assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("true", metadata.get("pdf:encrypted"));
+ //pdf:encrypted, X-Parsed-By and Content-Type
+ assertEquals("very little metadata should be parsed", 3, metadata.names().length);
+ assertEquals(0, content.length());
+
+ //now test wrong password with non sequential parser
+ handler = new BodyContentHandler();
+ metadata = new Metadata();
+ context = new ParseContext();
+ context.set(PasswordProvider.class, new PasswordProvider() {
+ public String getPassword(Metadata metadata) {
+ return "WRONG!!!!";
+ }
+ });
+ PDFParserConfig config = new PDFParserConfig();
+ config.setUseNonSequentialParser(true);
+ context.set(PDFParserConfig.class, config);
+
+ stream = PDFParserTest.class.getResourceAsStream(
+ "/test-documents/testPDF_protected.pdf");
+ ex = false;
+ try {
+ parser.parse(stream, handler, metadata, context);
+ } catch (EncryptedDocumentException e) {
+ ex = true;
+ } finally {
+ stream.close();
+ }
+ content = handler.toString();
+ assertTrue("encryption exception", ex);
+ assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("true", metadata.get("pdf:encrypted"));
+
+ //pdf:encrypted, X-Parsed-By and Content-Type
+ assertEquals("very little metadata should be parsed", 3, metadata.names().length);
+ assertEquals(0, content.length());
}
@Test