You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/03/03 19:51:42 UTC
svn commit: r1663764 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/exception/
tika-core/src/main/java/org/apache/tika/metadata/
tika-parsers/src/main/java/org/apache/tika/parser/pdf/
tika-parsers/src/main/resources/org/apache/tika/parser/pd...
Author: tallison
Date: Tue Mar 3 18:51:41 2015
New Revision: 1663764
URL: http://svn.apache.org/r1663764
Log:
TIKA-1489 add optional accessibility checking to PDF files
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java?rev=1663764&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java Tue Mar 3 18:51:41 2015
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.exception;
+
+/**
+ * Exception to be thrown when a document does not allow content extraction; the
+ * no-arg and cause-only constructors supply a default message. As of this writing,
+ * PDF documents are the only type of document that might cause this type of exception.
+ */
+public class AccessPermissionException extends TikaException {
+ public AccessPermissionException() {
+ super("Unable to process: content extraction is not allowed");
+ }
+
+ public AccessPermissionException(Throwable th) {
+ super("Unable to process: content extraction is not allowed", th);
+ }
+
+ public AccessPermissionException(String info) {
+ super(info);
+ }
+
+ public AccessPermissionException(String info, Throwable th) {
+ super(info, th);
+ }
+}
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java?rev=1663764&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java Tue Mar 3 18:51:41 2015
@@ -0,0 +1,71 @@
+package org.apache.tika.metadata;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Until we can find a common standard, we'll use these options. They
+ * were mostly derived from PDFBox's AccessPermission, but some can
+ * apply to other document formats, especially CAN_MODIFY and FILL_IN_FORM.
+ */
+public interface AccessPermissions {
+
+ final static String PREFIX = "access_permission"+Metadata.NAMESPACE_PREFIX_DELIMITER;
+
+ /**
+ * Can any modifications be made to the document (single-valued text, like every other permission here)
+ */
+ Property CAN_MODIFY = Property.externalText(PREFIX+"can_modify");
+
+ /**
+ * Should content be extracted, generally.
+ */
+ Property EXTRACT_CONTENT = Property.externalText(PREFIX+"extract_content");
+
+ /**
+ * Should content be extracted for the purposes
+ * of accessibility.
+ */
+ Property EXTRACT_FOR_ACCESSIBILITY = Property.externalText(PREFIX + "extract_for_accessibility");
+
+ /**
+ * Can the user insert/rotate/delete pages.
+ */
+ Property ASSEMBLE_DOCUMENT = Property.externalText(PREFIX+"assemble_document");
+
+
+ /**
+ * Can the user fill in a form
+ */
+ Property FILL_IN_FORM = Property.externalText(PREFIX+"fill_in_form");
+
+ /**
+ * Can the user modify annotations
+ */
+ Property CAN_MODIFY_ANNOTATIONS = Property.externalText(PREFIX+"modify_annotations");
+
+ /**
+ * Can the user print the document
+ */
+ Property CAN_PRINT = Property.externalText(PREFIX+"can_print");
+
+ /**
+ * Can the user print an image-degraded version of the document.
+ */
+ Property CAN_PRINT_DEGRADED = Property.externalText(PREFIX+"can_print_degraded");
+
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java?rev=1663764&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java Tue Mar 3 18:51:41 2015
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pdf;
+
+import java.io.Serializable;
+
+import org.apache.tika.exception.AccessPermissionException;
+import org.apache.tika.metadata.AccessPermissions;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Checks whether or not a document allows extraction generally
+ * or extraction for accessibility only.
+ */
+public class AccessChecker implements Serializable {
+
+ private static final long serialVersionUID = 6492570218190936986L;
+
+ private final boolean needToCheck;
+ private final boolean allowAccessibility;
+
+ /**
+ * This constructs an {@link AccessChecker} that
+ * will not perform any checking and will always return without
+ * throwing an exception.
+ * <p>
+ * This constructor is available to allow for Tika's legacy (&lt;= v1.7) behavior.
+ */
+ public AccessChecker() {
+ needToCheck = false;
+ allowAccessibility = true;
+ }
+ /**
+ * This constructs an {@link AccessChecker} that will check
+ * for whether or not content should be extracted from a document.
+ *
+ * @param allowExtractionForAccessibility if general extraction is not allowed, is extraction for accessibility allowed
+ */
+ public AccessChecker(boolean allowExtractionForAccessibility) {
+ needToCheck = true;
+ this.allowAccessibility = allowExtractionForAccessibility;
+ }
+
+ /**
+ * Checks to see if a document's content should be extracted based on its {@link AccessPermissions#EXTRACT_CONTENT}
+ * and {@link AccessPermissions#EXTRACT_FOR_ACCESSIBILITY} metadata values and the flag passed to the constructor.
+ * Only an EXTRACT_CONTENT value of exactly "false" can cause a rejection; any other value (or none) passes.
+ * @param metadata metadata holding the access permission values to inspect
+ * @throws AccessPermissionException if access is not permitted
+ */
+ public void check(Metadata metadata) throws AccessPermissionException {
+ if (!needToCheck) {
+ return;
+ }
+ if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) {
+ if (allowAccessibility) {
+ if("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) {
+ return;
+ }
+ throw new AccessPermissionException("Content extraction for accessibility is not allowed.");
+ }
+ throw new AccessPermissionException("Content extraction is not allowed.");
+ }
+ }
+}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1663764&r1=1663763&r2=1663764&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Tue Mar 3 18:51:41 2015
@@ -39,6 +39,7 @@ import org.apache.pdfbox.io.RandomAccess
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
@@ -46,6 +47,7 @@ import org.apache.tika.extractor.Embedde
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.AccessPermissions;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
@@ -140,6 +142,9 @@ public class PDFParser extends AbstractP
metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
extractMetadata(pdfDocument, metadata);
+
+ AccessChecker checker = localConfig.getAccessChecker();
+ checker.check(metadata);
if (handler != null) {
PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
}
@@ -191,6 +196,28 @@ public class PDFParser extends AbstractP
private void extractMetadata(PDDocument document, Metadata metadata)
throws TikaException {
+ //first extract AccessPermissions
+ AccessPermission ap = document.getCurrentAccessPermission();
+ metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
+ Boolean.toString(ap.canExtractForAccessibility()));
+ metadata.set(AccessPermissions.EXTRACT_CONTENT,
+ Boolean.toString(ap.canExtractContent()));
+ metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT,
+ Boolean.toString(ap.canAssembleDocument()));
+ metadata.set(AccessPermissions.FILL_IN_FORM,
+ Boolean.toString(ap.canFillInForm()));
+ metadata.set(AccessPermissions.CAN_MODIFY,
+ Boolean.toString(ap.canModify()));
+ metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS,
+ Boolean.toString(ap.canModifyAnnotations()));
+ metadata.set(AccessPermissions.CAN_PRINT,
+ Boolean.toString(ap.canPrint()));
+ metadata.set(AccessPermissions.CAN_PRINT_DEGRADED,
+ Boolean.toString(ap.canPrintDegraded()));
+
+
+
+ //now go for the XMP stuff
org.apache.jempbox.xmp.XMPMetadata xmp = null;
XMPSchemaDublinCore dcSchema = null;
try{
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java?rev=1663764&r1=1663763&r2=1663764&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java Tue Mar 3 18:51:41 2015
@@ -14,20 +14,20 @@ package org.apache.tika.parser.pdf;
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Serializable;
-import java.util.Locale;
-import java.util.Properties;
-
-import org.apache.pdfbox.util.PDFTextStripper;
-
-/**
- * Config for PDFParser.
- *
+ * limitations under the License.
+ */
+
+import org.apache.pdfbox.util.PDFTextStripper;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.Locale;
+import java.util.Properties;
+
+/**
+ * Config for PDFParser.
+ *
* This allows parameters to be set programmatically:
* <ol>
* <li>Calls to PDFParser, i.e. parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li>
@@ -77,12 +77,14 @@ public class PDFParserConfig implements
//The character width-based tolerance value used to estimate where spaces in text should be added
private Float averageCharTolerance;
- //The space width-based tolerance value used to estimate where spaces in text should be added
- private Float spacingTolerance;
-
- public PDFParserConfig() {
- init(this.getClass().getResourceAsStream("PDFParser.properties"));
- }
+ //The space width-based tolerance value used to estimate where spaces in text should be added
+ private Float spacingTolerance;
+
+ private AccessChecker accessChecker;
+
+ public PDFParserConfig() {
+ init(this.getClass().getResourceAsStream("PDFParser.properties"));
+ }
/**
* Loads properties from InputStream and then tries to close InputStream.
@@ -134,13 +136,24 @@ public class PDFParserConfig implements
setExtractInlineImages(
getProp(props.getProperty("extractInlineImages"),
getExtractInlineImages()));
- setExtractUniqueInlineImagesOnly(
- getProp(props.getProperty("extractUniqueInlineImagesOnly"),
- getExtractUniqueInlineImagesOnly()));
- }
-
- /**
- * Configures the given pdf2XHTML.
+ setExtractUniqueInlineImagesOnly(
+ getProp(props.getProperty("extractUniqueInlineImagesOnly"),
+ getExtractUniqueInlineImagesOnly()));
+
+ boolean checkExtractAccessPermission = getProp(props.getProperty("checkExtractAccessPermission"), false);
+ boolean allowExtractionForAccessibility = getProp(props.getProperty("allowExtractionForAccessibility"), true);
+
+ if (checkExtractAccessPermission == false) {
+ //checking is disabled, so allowExtractionForAccessibility has no effect; silently ignore the
+ //contradictory configuration of checkExtractAccessPermission=false with allowExtractionForAccessibility=false
+ accessChecker = new AccessChecker();
+ } else {
+ accessChecker = new AccessChecker(allowExtractionForAccessibility);
+ }
+ }
+
+ /**
+ * Configures the given pdf2XHTML.
*
* @param pdf2XHTML
*/
@@ -329,12 +342,20 @@ public class PDFParserConfig implements
/**
* See {@link PDFTextStripper#setSpacingTolerance(float)}
- */
- public void setSpacingTolerance(Float spacingTolerance) {
- this.spacingTolerance = spacingTolerance;
- }
-
- private boolean getProp(String p, boolean defaultMissing){
+ */
+ public void setSpacingTolerance(Float spacingTolerance) {
+ this.spacingTolerance = spacingTolerance;
+ }
+
+ public void setAccessChecker(AccessChecker accessChecker) {
+ this.accessChecker = accessChecker;
+ }
+
+ public AccessChecker getAccessChecker() {
+ return accessChecker;
+ }
+
+ private boolean getProp(String p, boolean defaultMissing){
if (p == null){
return defaultMissing;
}
Modified: tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties?rev=1663764&r1=1663763&r2=1663764&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties (original)
+++ tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties Tue Mar 3 18:51:41 2015
@@ -18,6 +18,8 @@ extractAnnotationText true
sortByPosition false
suppressDuplicateOverlappingText false
useNonSequentialParser false
-extractAcroFormContent true
-extractInlineImages false
-extractUniqueInlineImagesOnly true
+extractAcroFormContent true
+extractInlineImages false
+extractUniqueInlineImagesOnly true
+checkExtractAccessPermission false
+allowExtractionForAccessibility true
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java?rev=1663764&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java Tue Mar 3 18:51:41 2015
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.exception.AccessPermissionException;
+import org.apache.tika.metadata.AccessPermissions;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PropertyTypeException;
+import org.junit.Test;
+
+public class AccessCheckerTest {
+
+ @Test
+ public void testLegacy() throws AccessPermissionException{
+
+ Metadata m = getMetadata(false, false);
+ //legacy behavior; don't bother checking, so check() must return normally for every combination
+ AccessChecker checker = new AccessChecker();
+ checker.check(m);
+ assertTrue("no exception", true);
+
+ m = getMetadata(false, true);
+ checker.check(m);
+ assertTrue("no exception", true);
+
+ m = getMetadata(true, true);
+ checker.check(m);
+ assertTrue("no exception", true);
+ }
+
+ @Test
+ public void testNoExtraction() {
+
+ Metadata m = null;
+ //allow nothing
+ AccessChecker checker = new AccessChecker(false);
+ boolean ex = false;
+ try {
+ m = getMetadata(false, false);
+ checker.check(m);
+ } catch (AccessPermissionException e) {
+ ex = true;
+ }
+ assertTrue("correct exception with no extraction, no extract for accessibility", ex);
+ ex = false;
+ try {
+ //document allows extraction for accessibility
+ m = getMetadata(false, true);
+ checker.check(m);
+ } catch (AccessPermissionException e) {
+ //but application is not an accessibility application
+ ex = true;
+ }
+ assertTrue("correct exception with no extraction, checker does not allow extract for accessibility", ex);
+ }
+
+ @Test
+ public void testExtractOnlyForAccessibility() throws AccessPermissionException {
+ Metadata m = getMetadata(false, true);
+ //allow accessibility
+ AccessChecker checker = new AccessChecker(true);
+ checker.check(m);
+ assertTrue("no exception", true);
+ boolean ex = false;
+ try {
+ m = getMetadata(false, false);
+ checker.check(m);
+ } catch (AccessPermissionException e) {
+ ex = true;
+ }
+ assertTrue("correct exception", ex);
+ }
+
+ @Test
+ public void testCrazyExtractNotForAccessibility() throws AccessPermissionException {
+ Metadata m = getMetadata(true, false);
+ //allow accessibility
+ AccessChecker checker = new AccessChecker(true);
+ checker.check(m);
+ assertTrue("no exception", true);
+
+ //don't extract for accessibility
+ checker = new AccessChecker(false);
+ //if extract content is allowed, the checker shouldn't
+ //check the value of extract for accessibility
+ checker.check(m);
+ assertTrue("no exception", true);
+
+ }
+
+ @Test
+ public void testCantAddMultiplesToMetadata() {
+ Metadata m = new Metadata();
+ boolean ex = false;
+ m.add(AccessPermissions.EXTRACT_CONTENT, "true");
+ try {
+ m.add(AccessPermissions.EXTRACT_CONTENT, "false");
+ } catch (PropertyTypeException e) {
+ ex = true;
+ }
+ assertTrue("can't add multiple values", ex);
+
+ m = new Metadata();
+ ex = false;
+ m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "true");
+ try {
+ m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "false");
+ } catch (PropertyTypeException e) {
+ ex = true;
+ }
+ assertTrue("can't add multiple values", ex);
+ }
+
+ private Metadata getMetadata(boolean allowExtraction, boolean allowExtractionForAccessibility) {
+ Metadata m = new Metadata();
+ m.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(allowExtraction));
+ m.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(allowExtractionForAccessibility));
+ return m;
+ }
+}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1663764&r1=1663763&r2=1663764&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Tue Mar 3 18:51:41 2015
@@ -32,6 +32,7 @@ import java.util.Map;
import java.util.Set;
import org.apache.tika.TikaTest;
+import org.apache.tika.exception.AccessPermissionException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.ContainerExtractor;
@@ -642,17 +643,20 @@ public class PDFParserTest extends TikaT
continue;
}
- pdfs++;
-
String sequentialContent = null;
Metadata sequentialMetadata = new Metadata();
try {
sequentialContent = getText(new FileInputStream(f),
sequentialParser, seqContext, sequentialMetadata);
+ } catch (EncryptedDocumentException e) {
+ //silently skip a file that requires a user password
+ continue;
} catch (Exception e) {
throw new TikaException("Sequential Parser failed on test file " + f, e);
}
+ pdfs++;
+
String nonSequentialContent = null;
Metadata nonSequentialMetadata = new Metadata();
try {
@@ -1138,6 +1142,202 @@ public class PDFParserTest extends TikaT
assertContains("<div class=\"embedded\" id=\"Excel.xlsx\" />", xml);
}
+ //Access checker tests
+
+ @Test
+ public void testLegacyAccessChecking() throws Exception {
+ //default behavior (checkExtractAccessPermission=false): restricted files must yield text, not AccessPermissionException
+ for (String file : new String[] {
+ "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+ "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+ }) {
+ String xml = getXML(file).xml;
+ assertContains("Hello World", xml);
+ }
+
+ //now try the *_owner_user files, supplying "user" as the password
+ PasswordProvider provider = new PasswordProvider() {
+ @Override
+ public String getPassword(Metadata metadata) {
+ return "user";
+ }
+ };
+
+ ParseContext context = new ParseContext();
+ context.set(PasswordProvider.class, provider);
+ Parser parser = new AutoDetectParser();
+
+ for (String path : new String[] {
+ "testPDF_no_extract_no_accessibility_owner_user.pdf",
+ "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+ }) {
+ InputStream stream = null;
+ try {
+ stream = TikaInputStream.get(this.getClass().getResource("/test-documents/"+path));
+ String text = getText(stream, parser, context);
+ assertContains("Hello World", text);
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+ }
+ }
+
+ @Test
+ public void testAccessCheckingEmptyPassword() throws Exception {
+ PDFParserConfig config = new PDFParserConfig();
+
+ //don't allow extraction, not even for accessibility
+ config.setAccessChecker(new AccessChecker(false));
+ Parser parser = new AutoDetectParser();
+ ParseContext context = new ParseContext();
+ context.set(PDFParserConfig.class, config);
+
+ //no password is supplied: both *_owner_empty files must be rejected with AccessPermissionException
+ for (String path : new String[] {
+ "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+ "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+ }) {
+ assertException("/test-documents/"+path, parser, context, AccessPermissionException.class);
+ }
+ //now allow extraction for accessibility: the no_accessibility file is still rejected...
+ config.setAccessChecker(new AccessChecker(true));
+ assertException("/test-documents/" + "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+ parser, context, AccessPermissionException.class);
+ //...while the yes_accessibility file must yield its text
+ InputStream is = null;
+ try {
+ is = getResourceAsStream("/test-documents/"+ "testPDF_no_extract_yes_accessibility_owner_empty.pdf");
+ assertContains("Hello World", getText(is, parser, context));
+ } finally {
+ IOUtils.closeQuietly(is);
+ }
+ }
+
+ @Test
+ public void testAccessCheckingUserPassword() throws Exception {
+ ParseContext context = new ParseContext();
+
+ PDFParserConfig config = new PDFParserConfig();
+ //don't allow extraction, not even for accessibility
+ config.setAccessChecker(new AccessChecker(false));
+ PasswordProvider passwordProvider = new PasswordProvider() {
+ @Override
+ public String getPassword(Metadata metadata) {
+ return "user";
+ }
+ };
+
+ context.set(PasswordProvider.class, passwordProvider);
+ context.set(PDFParserConfig.class, config);
+
+ Parser parser = new AutoDetectParser();
+
+ //test bad passwords: "user" is expected to be rejected by the *_owner_empty files
+ for (String path : new String[] {
+ "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+ "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+ }) {
+ assertException("/test-documents/"+path, parser, context, EncryptedDocumentException.class);
+ }
+
+ //bad password is still a bad password, even when the checker allows extraction for accessibility
+ config.setAccessChecker(new AccessChecker(true));
+ for (String path : new String[] {
+ "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+ "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+ }) {
+ assertException("/test-documents/"+path, parser, context, EncryptedDocumentException.class);
+ }
+
+ //now test documents that require this "user" password
+ assertException("/test-documents/"+"testPDF_no_extract_no_accessibility_owner_user.pdf",
+ parser, context, AccessPermissionException.class);
+
+ //the checker still allows accessibility here, so the yes_accessibility file must yield its text
+ InputStream is = null;
+ try {
+ is = getResourceAsStream("/test-documents/"+ "testPDF_no_extract_yes_accessibility_owner_user.pdf");
+ assertContains("Hello World", getText(is, parser, context));
+ } finally {
+ IOUtils.closeQuietly(is);
+ }
+ //with extraction for accessibility disallowed again, both files must be rejected
+ config.setAccessChecker(new AccessChecker(false));
+ for (String path : new String[] {
+ "testPDF_no_extract_no_accessibility_owner_user.pdf",
+ "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+ }) {
+ assertException("/test-documents/"+path, parser, context, AccessPermissionException.class);
+ }
+ }
+
+ @Test
+ public void testAccessCheckingOwnerPassword() throws Exception {
+ ParseContext context = new ParseContext();
+
+ PDFParserConfig config = new PDFParserConfig();
+ //check permissions, but allow extraction for accessibility
+ config.setAccessChecker(new AccessChecker(true));
+ PasswordProvider passwordProvider = new PasswordProvider() {
+ @Override
+ public String getPassword(Metadata metadata) {
+ return "owner";
+ }
+ };
+
+ context.set(PasswordProvider.class, passwordProvider);
+ context.set(PDFParserConfig.class, config);
+
+ Parser parser = new AutoDetectParser();
+ //with owner's password, text can be extracted, no matter the AccessibilityChecker's settings
+ for (String path : new String[] {
+ "testPDF_no_extract_no_accessibility_owner_user.pdf",
+ "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+ "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+ "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+ }) {
+ //open the file named by the loop variable so that each of the four documents is exercised
+ InputStream is = null;
+ try {
+ is = getResourceAsStream("/test-documents/" + path);
+ assertContains("Hello World", getText(is, parser, context));
+ } finally {
+ IOUtils.closeQuietly(is);
+ }
+ }
+
+ //really, with owner's password, all extraction is allowed
+ config.setAccessChecker(new AccessChecker(false));
+ for (String path : new String[] {
+ "testPDF_no_extract_no_accessibility_owner_user.pdf",
+ "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+ "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+ "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+ }) {
+ //again, parse the file named by the loop variable rather than one fixed document
+ InputStream is = null;
+ try {
+ is = getResourceAsStream("/test-documents/" + path);
+ assertContains("Hello World", getText(is, parser, context));
+ } finally {
+ IOUtils.closeQuietly(is);
+ }
+ }
+ }
+
+ private void assertException(String path, Parser parser, ParseContext context, Class<?> expected) {
+ boolean noEx = false; //becomes true only if parsing completes without throwing
+ InputStream is = getResourceAsStream(path);
+ try {
+ getText(is, parser, context); //extracted text is irrelevant; only the thrown exception matters
+ noEx = true;
+ } catch (Exception e) {
+ assertEquals("Not the right exception: "+path, expected, e.getClass()); //exact class match, not instanceof
+ } finally {
+ IOUtils.closeQuietly(is);
+ }
+ assertFalse(path + " should have thrown exception", noEx);
+ }
/**
*
* Simple class to count end of document events. If functionality is useful,
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf?rev=1663764&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf Tue Mar 3 18:51:41 2015
@@ -0,0 +1,87 @@
+%PDF-1.4
+%öäüß
+1 0 obj
+<<
+/Type /Catalog
+/Version /1.4
+/Pages 2 0 R
+>>
+endobj
+2 0 obj
+<<
+/Type /Pages
+/Kids [3 0 R]
+/Count 1
+>>
+endobj
+3 0 obj
+<<
+/Type /Page
+/MediaBox [0.0 0.0 612.0 792.0]
+/Parent 2 0 R
+/Contents 4 0 R
+/Resources 5 0 R
+>>
+endobj
+4 0 obj
+<<
+/Filter [/FlateDecode]
+/Length 6 0 R
+>>
+stream
+KßZz&$ùª8^á"
:°iÏËIþ%`8etoiczª´Ð [
+endstream
+endobj
+5 0 obj
+<<
+/Font 7 0 R
+>>
+endobj
+6 0 obj
+50
+endobj
+7 0 obj
+<<
+/F1 8 0 R
+>>
+endobj
+8 0 obj
+<<
+/Subtype /Type1
+/BaseFont /Helvetica-Bold
+/Encoding /WinAnsiEncoding
+>>
+endobj
+9 0 obj
+<<
+/Filter /Standard
+/V 1
+/R 3
+/Length 40
+/P -532
+/O <92EA49CA9DCB5D63ED10DA009E9702A403138C6B0DB22EAD209FC73D70EF86F4>
+/U <A82D4E323C8FE41C5571FA0856FFD74128BF4E5E4E758A4164004E56FFFA0108>
+>>
+endobj
+xref
+0 10
+0000000000 65535 f
+0000000015 00000 n
+0000000078 00000 n
+0000000135 00000 n
+0000000247 00000 n
+0000000375 00000 n
+0000000408 00000 n
+0000000426 00000 n
+0000000457 00000 n
+0000000547 00000 n
+trailer
+<<
+/Root 1 0 R
+/ID [<768A456CFDDEA53BC3965B4569E65812> <768A456CFDDEA53BC3965B4569E65812>]
+/Encrypt 9 0 R
+/Size 10
+>>
+startxref
+755
+%%EOF
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf?rev=1663764&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf Tue Mar 3 18:51:41 2015
@@ -0,0 +1,87 @@
+%PDF-1.4
+%öäüß
+1 0 obj
+<<
+/Type /Catalog
+/Version /1.4
+/Pages 2 0 R
+>>
+endobj
+2 0 obj
+<<
+/Type /Pages
+/Kids [3 0 R]
+/Count 1
+>>
+endobj
+3 0 obj
+<<
+/Type /Page
+/MediaBox [0.0 0.0 612.0 792.0]
+/Parent 2 0 R
+/Contents 4 0 R
+/Resources 5 0 R
+>>
+endobj
+4 0 obj
+<<
+/Filter [/FlateDecode]
+/Length 6 0 R
+>>
+stream
+4æ?$7væ/â=©th;U0ªTdRLGÊÎáZϤ6aóF¯æéÃ^ªD
+endstream
+endobj
+5 0 obj
+<<
+/Font 7 0 R
+>>
+endobj
+6 0 obj
+50
+endobj
+7 0 obj
+<<
+/F1 8 0 R
+>>
+endobj
+8 0 obj
+<<
+/Subtype /Type1
+/BaseFont /Helvetica-Bold
+/Encoding /WinAnsiEncoding
+>>
+endobj
+9 0 obj
+<<
+/Filter /Standard
+/V 1
+/R 3
+/Length 40
+/P -532
+/O <CF2662E6FB01997CC7651E17056D4DFAD2C78DD5F3F4109BDFFB50433BB04670>
+/U <D803EA55DA7821D2A297F8A68387DCA028BF4E5E4E758A4164004E56FFFA0108>
+>>
+endobj
+xref
+0 10
+0000000000 65535 f
+0000000015 00000 n
+0000000078 00000 n
+0000000135 00000 n
+0000000247 00000 n
+0000000375 00000 n
+0000000408 00000 n
+0000000426 00000 n
+0000000457 00000 n
+0000000547 00000 n
+trailer
+<<
+/Root 1 0 R
+/ID [<75DB321CAFE7680CAD6FC09F51F3DDBE> <75DB321CAFE7680CAD6FC09F51F3DDBE>]
+/Encrypt 9 0 R
+/Size 10
+>>
+startxref
+755
+%%EOF
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf?rev=1663764&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf Tue Mar 3 18:51:41 2015
@@ -0,0 +1,87 @@
+%PDF-1.4
+%öäüß
+1 0 obj
+<<
+/Type /Catalog
+/Version /1.4
+/Pages 2 0 R
+>>
+endobj
+2 0 obj
+<<
+/Type /Pages
+/Kids [3 0 R]
+/Count 1
+>>
+endobj
+3 0 obj
+<<
+/Type /Page
+/MediaBox [0.0 0.0 612.0 792.0]
+/Parent 2 0 R
+/Contents 4 0 R
+/Resources 5 0 R
+>>
+endobj
+4 0 obj
+<<
+/Filter [/FlateDecode]
+/Length 6 0 R
+>>
+stream
+õBÓ0Ï6ÜYmñ¤y©mpneÊèÚ¬jÜWü®_WAÐ×D¥Yèõà Vs
+endstream
+endobj
+5 0 obj
+<<
+/Font 7 0 R
+>>
+endobj
+6 0 obj
+50
+endobj
+7 0 obj
+<<
+/F1 8 0 R
+>>
+endobj
+8 0 obj
+<<
+/Subtype /Type1
+/BaseFont /Helvetica-Bold
+/Encoding /WinAnsiEncoding
+>>
+endobj
+9 0 obj
+<<
+/Filter /Standard
+/V 1
+/R 3
+/Length 40
+/P -20
+/O <92EA49CA9DCB5D63ED10DA009E9702A403138C6B0DB22EAD209FC73D70EF86F4>
+/U <472263FD2B9B40403473D05A693D8C0428BF4E5E4E758A4164004E56FFFA0108>
+>>
+endobj
+xref
+0 10
+0000000000 65535 f
+0000000015 00000 n
+0000000078 00000 n
+0000000135 00000 n
+0000000247 00000 n
+0000000375 00000 n
+0000000408 00000 n
+0000000426 00000 n
+0000000457 00000 n
+0000000547 00000 n
+trailer
+<<
+/Root 1 0 R
+/ID [<AFAC4D6B4301475F6B6D846BEACCEA36> <AFAC4D6B4301475F6B6D846BEACCEA36>]
+/Encrypt 9 0 R
+/Size 10
+>>
+startxref
+754
+%%EOF
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf?rev=1663764&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf Tue Mar 3 18:51:41 2015
@@ -0,0 +1,87 @@
+%PDF-1.4
+%öäüß
+1 0 obj
+<<
+/Type /Catalog
+/Version /1.4
+/Pages 2 0 R
+>>
+endobj
+2 0 obj
+<<
+/Type /Pages
+/Kids [3 0 R]
+/Count 1
+>>
+endobj
+3 0 obj
+<<
+/Type /Page
+/MediaBox [0.0 0.0 612.0 792.0]
+/Parent 2 0 R
+/Contents 4 0 R
+/Resources 5 0 R
+>>
+endobj
+4 0 obj
+<<
+/Filter [/FlateDecode]
+/Length 6 0 R
+>>
+stream
+Ä3×Ö°6fîÒÒ6üòÄ)FDüxîu K^,´Ü^Ìÿ8Q¥Qý$J
+endstream
+endobj
+5 0 obj
+<<
+/Font 7 0 R
+>>
+endobj
+6 0 obj
+50
+endobj
+7 0 obj
+<<
+/F1 8 0 R
+>>
+endobj
+8 0 obj
+<<
+/Subtype /Type1
+/BaseFont /Helvetica-Bold
+/Encoding /WinAnsiEncoding
+>>
+endobj
+9 0 obj
+<<
+/Filter /Standard
+/V 1
+/R 3
+/Length 40
+/P -20
+/O <CF2662E6FB01997CC7651E17056D4DFAD2C78DD5F3F4109BDFFB50433BB04670>
+/U <067DAA91A1AC99D15ABFA0AD86050F3B28BF4E5E4E758A4164004E56FFFA0108>
+>>
+endobj
+xref
+0 10
+0000000000 65535 f
+0000000015 00000 n
+0000000078 00000 n
+0000000135 00000 n
+0000000247 00000 n
+0000000375 00000 n
+0000000408 00000 n
+0000000426 00000 n
+0000000457 00000 n
+0000000547 00000 n
+trailer
+<<
+/Root 1 0 R
+/ID [<B8090A679399BCAD86E31DE615910182> <B8090A679399BCAD86E31DE615910182>]
+/Encrypt 9 0 R
+/Size 10
+>>
+startxref
+754
+%%EOF