You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/03/02 02:59:18 UTC
tika git commit: TIKA-1857: add basic XFA extraction support via
Pascal Essiembre.
Repository: tika
Updated Branches:
refs/heads/master ed762b702 -> dbefe9830
TIKA-1857: add basic XFA extraction support via Pascal Essiembre.
This closes #74
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/dbefe983
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/dbefe983
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/dbefe983
Branch: refs/heads/master
Commit: dbefe9830b26d05f9ce53503565a069bcc63d7c1
Parents: ed762b7
Author: tballison <ta...@mitre.org>
Authored: Tue Mar 1 20:58:57 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Mar 1 20:58:57 2016 -0500
----------------------------------------------------------------------
.../org/apache/tika/parser/pdf/PDF2XHTML.java | 20 ++
.../org/apache/tika/parser/pdf/PDFParser.java | 35 +-
.../apache/tika/parser/pdf/PDFParserConfig.java | 36 ++-
.../apache/tika/parser/pdf/XFAExtractor.java | 318 +++++++++++++++++++
.../apache/tika/parser/pdf/PDFParser.properties | 3 +-
.../apache/tika/parser/pdf/PDFParserTest.java | 32 +-
.../testPDF_XFA_govdocs1_258578.pdf | Bin 0 -> 168176 bytes
7 files changed, 440 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/dbefe983/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 1ffe60c..d656d5a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -16,6 +16,8 @@
*/
package org.apache.tika.parser.pdf;
+import javax.xml.stream.XMLStreamException;
+import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
@@ -63,6 +65,7 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlin
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDXFA;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import org.apache.tika.exception.TikaException;
@@ -99,6 +102,7 @@ class PDF2XHTML extends PDFTextStripper {
private final ParseContext context;
private final XHTMLContentHandler handler;
private final PDFParserConfig config;
+ private final Metadata metadata;
/**
* This keeps track of the pdf object ids for inline
* images that have been processed.
@@ -121,6 +125,7 @@ class PDF2XHTML extends PDFTextStripper {
this.originalHandler = handler;
this.context = context;
this.handler = new XHTMLContentHandler(handler, metadata);
+ this.metadata = metadata;
}
/**
@@ -581,6 +586,21 @@ class PDF2XHTML extends PDFTextStripper {
if (form == null)
return;
+ //if it has xfa, try that.
+ //if it doesn't exist or there's an exception,
+ //go with traditional AcroForm
+ PDXFA pdxfa = form.getXFA();
+ if (pdxfa != null) {
+ XFAExtractor xfaExtractor = new XFAExtractor();
+ try {
+ xfaExtractor.extract(new BufferedInputStream(
+ new ByteArrayInputStream(pdxfa.getBytes())), handler, metadata);
+ return;
+ } catch (XMLStreamException |IOException e) {
+ //if there was an xml parse exception in xfa, try the AcroForm
+ }
+ }
+
@SuppressWarnings("rawtypes")
List fields = form.getFields();
http://git-wip-us.apache.org/repos/asf/tika/blob/dbefe983/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 01bbc8a..29ebddf 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -16,6 +16,8 @@
*/
package org.apache.tika.parser.pdf;
+import javax.xml.stream.XMLStreamException;
+import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
@@ -56,6 +58,7 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -145,7 +148,11 @@ public class PDFParser extends AbstractParser {
AccessChecker checker = localConfig.getAccessChecker();
checker.check(metadata);
if (handler != null) {
- PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
+ if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
+ handleXFAOnly(pdfDocument, handler, metadata);
+ } else {
+ PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
+ }
}
} catch (CryptographyException e) {
@@ -495,6 +502,32 @@ public class PDFParser extends AbstractParser {
}
}
+
+ /**
+  * Decides whether only the XFA content of the document should be extracted.
+  *
+  * @param pdDocument the loaded PDF
+  * @param config the configuration in effect for this parse
+  * @return true only if the config asks for XFA-only extraction and the
+  *         document has a catalog, an AcroForm and an XFA entry
+  */
+ private boolean shouldHandleXFAOnly(PDDocument pdDocument, PDFParserConfig config) {
+     //cheapest check first: the feature has to be switched on
+     if (!config.getIfXFAExtractOnlyXFA()) {
+         return false;
+     }
+     if (pdDocument.getDocumentCatalog() == null) {
+         return false;
+     }
+     if (pdDocument.getDocumentCatalog().getAcroForm() == null) {
+         return false;
+     }
+     return pdDocument.getDocumentCatalog().getAcroForm().getXFA() != null;
+ }
+
+ /**
+  * Writes an XHTML document whose body holds only what the
+  * {@link XFAExtractor} pulls out of the document's XFA bytes.
+  * Callers are expected to have checked that the XFA entry exists.
+  *
+  * @param pdDocument the loaded PDF
+  * @param handler the handler that receives the XHTML events
+  * @param metadata the metadata passed to the XHTML handler and the extractor
+  * @throws SAXException if the handler rejects an event
+  * @throws IOException if the XFA bytes cannot be read
+  * @throws TikaException if the XFA is not parseable XML
+  */
+ private void handleXFAOnly(PDDocument pdDocument, ContentHandler handler, Metadata metadata)
+         throws SAXException, IOException, TikaException {
+     XFAExtractor extractor = new XFAExtractor();
+     XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+     xhtml.startDocument();
+     try {
+         byte[] xfaBytes = pdDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes();
+         extractor.extract(new ByteArrayInputStream(xfaBytes), xhtml, metadata);
+     } catch (XMLStreamException e) {
+         throw new TikaException("XML error in XFA", e);
+     }
+     xhtml.endDocument();
+ }
+
public PDFParserConfig getPDFParserConfig() {
return defaultConfig;
}
http://git-wip-us.apache.org/repos/asf/tika/blob/dbefe983/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 74e67dd..2a650dd 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -79,6 +79,10 @@ public class PDFParserConfig implements Serializable {
//The space width-based tolerance value used to estimate where spaces in text should be added
private Float spacingTolerance;
+ //If the PDF has an XFA element, process only that and skip extracting
+ //content from elsewhere in the document.
+ private boolean ifXFAExtractOnlyXFA = false;
+
private AccessChecker accessChecker;
public PDFParserConfig() {
@@ -139,6 +143,10 @@ public class PDFParserConfig implements Serializable {
getProp(props.getProperty("extractUniqueInlineImagesOnly"),
getExtractUniqueInlineImagesOnly()));
+ setIfXFAExtractOnlyXFA(
+ getProp(props.getProperty("ifXFAExtractOnlyXFA"),
+ getIfXFAExtractOnlyXFA()));
+
boolean checkExtractAccessPermission = getProp(props.getProperty("checkExtractAccessPermission"), false);
boolean allowExtractionForAccessibility = getProp(props.getProperty("allowExtractionForAccessibility"), true);
@@ -182,7 +190,8 @@ public class PDFParserConfig implements Serializable {
/**
* If true (the default), extract content from AcroForms
- * at the end of the document.
+ * at the end of the document. If an XFA is found,
+ * try to process that; if there is no XFA, or the XFA cannot be
+ * parsed, process the AcroForm.
*
* @param extractAcroFormContent
*/
@@ -192,6 +201,26 @@ public class PDFParserConfig implements Serializable {
}
/**
+ * @see #setIfXFAExtractOnlyXFA(boolean)
+ * @return <code>true</code> if only the XFA content should be processed
+ * when the PDF contains an XFA element; <code>false</code> (the default)
+ * otherwise
+ */
+ public boolean getIfXFAExtractOnlyXFA() {
+ return ifXFAExtractOnlyXFA;
+ }
+
+ /**
+ * If false (the default), extract content from the full PDF
+ * as well as the XFA form. This will likely lead to some duplicative
+ * content.
+ * <p>
+ * If true, and the PDF has an XFA element, process only the XFA and
+ * skip extracting content from elsewhere in the document.
+ *
+ * @param ifXFAExtractOnlyXFA whether to restrict extraction to the XFA
+ * when one is present
+ */
+ public void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA) {
+ this.ifXFAExtractOnlyXFA = ifXFAExtractOnlyXFA;
+ }
+
+
+ /**
* @see #setExtractInlineImages(boolean)
*/
public boolean getExtractInlineImages() {
@@ -411,6 +440,7 @@ public class PDFParserConfig implements Serializable {
result = prime * result
+ (suppressDuplicateOverlappingText ? 1231 : 1237);
result = prime * result + (useNonSequentialParser ? 1231 : 1237);
+ result = prime * result + (ifXFAExtractOnlyXFA ? 1231 : 1237);
return result;
}
@@ -449,6 +479,9 @@ public class PDFParserConfig implements Serializable {
return false;
if (useNonSequentialParser != other.useNonSequentialParser)
return false;
+ if (ifXFAExtractOnlyXFA != other.ifXFAExtractOnlyXFA)
+ return false;
+
return true;
}
@@ -460,6 +493,7 @@ public class PDFParserConfig implements Serializable {
+ extractAnnotationText + ", sortByPosition=" + sortByPosition
+ ", useNonSequentialParser=" + useNonSequentialParser
+ ", extractAcroFormContent=" + extractAcroFormContent
+ + ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA
+ ", extractInlineImages=" + extractInlineImages
+ ", extractUniqueInlineImagesOnly="
+ extractUniqueInlineImagesOnly + ", averageCharTolerance="
http://git-wip-us.apache.org/repos/asf/tika/blob/dbefe983/tika-parsers/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
new file mode 100644
index 0000000..3c2b496
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
@@ -0,0 +1,318 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import javax.xml.namespace.QName;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLResolver;
+import javax.xml.stream.XMLStreamConstants;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * This class offers an initial capability to
+ * scrape text containing elements out of XFA, and
+ * it tries to link fields with values.
+ * <p>
+ * Some areas for improvement:
+ * <ol>
+ * <li>convert this to 2 lines of XPath</li>
+ * <li>handle metadata stored in <desc> section (govdocs1: 754282.pdf, 982106.pdf)</li>
+ * <li>handle pdf metadata (access permissions, etc.) in <pdf> element</li>
+ * <li>extract different types of uris as metadata</li>
+ * <li>add extraction of <image> data (govdocs1: 754282.pdf)</li>
+ * <li>add computation of traversal order for fields</li>
+ * <li>figure out when text extracted from xfa fields is duplicative of that
+ * extracted from the rest of the pdf...and do this efficiently and quickly</li>
+ * <li>avoid duplication with <speak> and <tooltip> elements</li>
+ * </ol>
+ */
+class XFAExtractor {
+
+ private static final Pattern XFA_TEMPLATE_ANY_VERSION = Pattern.compile("^http://www.xfa.org/schema/xfa-template");
+ private static final Pattern TEXT_PATTERN =
+ Pattern.compile("^(speak|text|contents-richtext|toolTip|exData)$");
+
+ private static final String XFA_DATA_NS = "http://www.xfa.org/schema/xfa-data/1.0/";
+
+ private static final String FIELD_LN = "field";
+ private static final QName XFA_DATA = new QName(XFA_DATA_NS, "data");
+
+ private static final XMLInputFactory factory;
+
+ static {
+ factory = XMLInputFactory.newFactory();
+ factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, true);
+ factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
+ factory.setProperty(XMLInputFactory.IS_VALIDATING, false);
+ factory.setXMLResolver(new XMLResolver() {
+ @Override
+ public Object resolveEntity(String publicID, String systemID, String baseURI, String namespace) throws XMLStreamException {
+ return null;
+ }
+ });
+ }
+ private final Matcher xfaTemplateMatcher;//namespace any version
+ private final Matcher textMatcher;
+
+ XFAExtractor() {
+ xfaTemplateMatcher = XFA_TEMPLATE_ANY_VERSION.matcher("");
+ textMatcher = TEXT_PATTERN.matcher("");
+ }
+
+ void extract(InputStream xfaIs, XHTMLContentHandler xhtml, Metadata m)
+ throws XMLStreamException, SAXException {
+ xhtml.startElement("div", "class", "xfa_content");
+
+ Map<String, String> pdfObjRToValues = new HashMap<>();
+
+ //for now, store and dump the fields in insertion order
+ Map<String, XFAField> namedFields = new LinkedHashMap<>();
+
+ //The strategy is to cache the fields in fields
+ //and cache the values in pdfObjRToValues while
+ //handling the text etc along the way.
+ //
+ //As a final step, dump the merged fields and the values.
+
+ XMLStreamReader reader = factory.createXMLStreamReader(xfaIs);
+ while (reader.hasNext()) {
+ switch (reader.next()) {
+ case XMLStreamConstants.START_ELEMENT :
+ QName name = reader.getName();
+ String localName = name.getLocalPart();
+ if (xfaTemplateMatcher.reset(name.getNamespaceURI()).find() &&
+ FIELD_LN.equals(name.getLocalPart())) {
+ handleField(reader, namedFields);
+ } else if (XFA_DATA.equals(name)) {//full qname match is important!
+ loadData(reader, pdfObjRToValues);
+ } else if (textMatcher.reset(localName).find()) {
+ scrapeTextUntil(reader, xhtml, name);
+ }
+ break;
+ case XMLStreamConstants.END_ELEMENT :
+ break;
+ }
+ }
+
+ if (namedFields.size() == 0) {
+ xhtml.endElement("xfa_content");
+ return;
+ }
+ //now dump fields and values
+ xhtml.startElement("div", "class", "xfa_form");
+ xhtml.startElement("ol");
+ StringBuilder sb = new StringBuilder();
+ for (Map.Entry<String, XFAField> e : namedFields.entrySet()) {
+ String fieldName = e.getKey();
+ XFAField field = e.getValue();
+ String fieldValue = pdfObjRToValues.get(fieldName);
+ AttributesImpl attrs = new AttributesImpl();
+ attrs.addAttribute("", "fieldName", "fieldName", "CDATA", fieldName);
+
+ String displayFieldName = (field.toolTip == null ||
+ field.toolTip.trim().length() == 0) ? fieldName : field.toolTip;
+
+ sb.append(displayFieldName).append(": ");
+ if (fieldValue != null) {
+ sb.append(fieldValue);
+ }
+
+ xhtml.startElement("li", attrs);
+ xhtml.characters(sb.toString());
+ xhtml.endElement("li");
+ sb.setLength(0);
+ }
+ xhtml.endElement("ol");
+ xhtml.endElement("div");
+ xhtml.endElement("xfa_content");
+ }
+
+ //try to scrape the text until the endElement
+ private void scrapeTextUntil(XMLStreamReader reader, XHTMLContentHandler xhtml,
+ QName endElement) throws XMLStreamException, SAXException {
+ StringBuilder buffer = new StringBuilder();
+ boolean keepGoing = true;
+ while (reader.hasNext() && keepGoing) {
+ switch (reader.next()) {
+ case XMLStreamConstants.START_ELEMENT:
+ break;
+ case XMLStreamConstants.CHARACTERS:
+ int start = reader.getTextStart();
+ int length = reader.getTextLength();
+ buffer.append(reader.getTextCharacters(),
+ start,
+ length);
+ break;
+
+ case XMLStreamConstants.CDATA:
+ start = reader.getTextStart();
+ length = reader.getTextLength();
+ buffer.append(reader.getTextCharacters(),
+ start,
+ length);
+ break;
+
+ case (XMLStreamConstants.END_ELEMENT):
+ if (reader.getName().equals(endElement)) {
+ keepGoing = false;
+ } else if ("p".equals(reader.getName().getLocalPart())) {
+ xhtml.element("p", buffer.toString());
+ buffer.setLength(0);
+ }
+ break;
+ }
+ }
+ String remainder = buffer.toString();
+ if (remainder.trim().length() > 0) {
+ xhtml.element("p", remainder);
+ }
+ }
+
+
+ private String scrapeTextUntil(XMLStreamReader reader, QName endElement) throws XMLStreamException {
+ StringBuilder buffer = new StringBuilder();
+ boolean keepGoing = true;
+ while (reader.hasNext() && keepGoing) {
+ switch (reader.next()) {
+ case XMLStreamConstants.START_ELEMENT:
+ break;
+ case XMLStreamConstants.CHARACTERS:
+ int start = reader.getTextStart();
+ int length = reader.getTextLength();
+ buffer.append(reader.getTextCharacters(),
+ start,
+ length);
+ break;
+
+ case XMLStreamConstants.CDATA:
+ start = reader.getTextStart();
+ length = reader.getTextLength();
+ buffer.append(reader.getTextCharacters(),
+ start,
+ length);
+ break;
+
+ case (XMLStreamConstants.END_ELEMENT):
+ if (reader.getName().equals(endElement)) {
+ keepGoing = false;
+ } else if ("p".equals(reader.getName().getLocalPart())) {
+ buffer.append("\n");
+ }
+ break;
+ }
+ }
+ return buffer.toString();
+ }
+
+ private void loadData(XMLStreamReader reader, Map<String, String> pdfObjRToValues)
+ throws XMLStreamException {
+ //reader is at the "xfa:data" element
+ while (reader.hasNext()) {
+ switch (reader.next()) {
+ case (XMLStreamConstants.START_ELEMENT) :
+ if ("topmostSubform".equals(reader.getLocalName())) {
+ continue;
+ }
+ String value = scrapeTextUntil(reader, reader.getName());
+ pdfObjRToValues.put(reader.getLocalName(), value);
+ break;
+ case (XMLStreamConstants.END_ELEMENT) :
+ if (XFA_DATA.equals(reader.getName())) {
+ return;
+ }
+ break;
+
+ }
+ }
+ }
+
+ private void handleField(XMLStreamReader reader, Map<String, XFAField> fields) throws XMLStreamException {
+ //reader is set to the field element
+ String fieldName = findFirstAttributeValue(reader, "name");
+ String pdfObjRef = "";
+ String toolTip = "";
+ while (reader.hasNext()) {
+ switch (reader.next()) {
+ case XMLStreamConstants.START_ELEMENT :
+ if ("toolTip".equals(reader.getName().getLocalPart())) {
+ toolTip = scrapeTextUntil(reader, reader.getName());
+ }
+ // add checkbutton, etcif (reader.getName().equals())
+ break;
+ case XMLStreamConstants.END_ELEMENT :
+ if (xfaTemplateMatcher.reset(reader.getName().getNamespaceURI()).find() &&
+ FIELD_LN.equals(reader.getName().getLocalPart())) {
+ if (fieldName != null) {
+ fields.put(fieldName, new XFAField(fieldName, toolTip, pdfObjRef));
+ }
+ return;
+ }
+ break;
+ case XMLStreamConstants.PROCESSING_INSTRUCTION:
+ if ("PDF_OBJR".equals(reader.getPITarget())) {
+ pdfObjRef = reader.getPIData();
+ }
+ break;
+
+ }
+ }
+ }
+
+ private String findFirstAttributeValue(XMLStreamReader reader, String name) {
+ for (int i = 0; i < reader.getAttributeCount(); i++) {
+ String n = reader.getAttributeLocalName(i);
+ if (name.equals(n)) {
+ return reader.getAttributeValue(i);
+ }
+ }
+ return "";
+ }
+
+ class XFAField {
+ String fieldName;
+ String toolTip;
+ String pdfObjRef;
+ String value;
+
+ public XFAField(String fieldName, String toolTip, String pdfObjRef) {
+ this.fieldName = fieldName;
+ this.toolTip = toolTip;
+ this.pdfObjRef = pdfObjRef;
+ }
+
+ @Override
+ public String toString() {
+ return "XFAField{" +
+ "fieldName='" + fieldName + '\'' +
+ ", toolTip='" + toolTip + '\'' +
+ ", pdfObjRef='" + pdfObjRef + '\'' +
+ ", value='" + value + '\'' +
+ '}';
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/dbefe983/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
index 1585f2d..bcfe1c6 100644
--- a/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-enableAutospace true
+enableAutoSpace true
extractAnnotationText true
sortByPosition false
suppressDuplicateOverlappingText false
@@ -23,3 +23,4 @@ extractInlineImages false
extractUniqueInlineImagesOnly true
checkExtractAccessPermission false
allowExtractionForAccessibility true
+ifXFAExtractOnlyXFA false
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/dbefe983/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 581faaa..04d9f2b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -24,13 +24,13 @@ import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
-
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
@@ -1328,6 +1328,36 @@ public class PDFParserTest extends TikaTest {
assertEquals("Microsoft", r.metadata.get(TikaCoreProperties.TITLE));
}
+ @Test
+ public void testXFAExtractionBasic() throws Exception {
+     //default config: the regular pdf content and the xfa are both extracted
+     String xml = getXML("testPDF_XFA_govdocs1_258578.pdf").xml;
+
+     //content that exists only in the "regular" pdf
+     assertContains("Mount Rushmore National Memorial", xml);
+
+     //an xfa field together with its data
+     String schoolField = "<li fieldName=\"School_Name\">School Name: my_school</li>";
+     assertContains(schoolField, xml);
+ }
+
+ @Test
+ public void testXFAOnly() throws Exception {
+     //ask the parser to skip everything but the XFA
+     PDFParserConfig config = new PDFParserConfig();
+     config.setIfXFAExtractOnlyXFA(true);
+     ParseContext context = new ParseContext();
+     context.set(PDFParserConfig.class, config);
+
+     ContentHandler handler = new ToXMLContentHandler(StandardCharsets.UTF_8.name());
+     Metadata metadata = new Metadata();
+     Parser parser = new AutoDetectParser();
+     try (InputStream is = getResourceAsStream("/test-documents/testPDF_XFA_govdocs1_258578.pdf")) {
+         parser.parse(is, handler, metadata, context);
+     }
+     String xml = handler.toString();
+     assertContains("<li fieldName=\"Room_1\">Room [1]: my_room1</li>", xml);
+     //the xfa wrapper is a div with class "xfa_content"; there is no
+     //"xfa_content" element, so do not pin a closing tag of that name
+     assertContains("class=\"xfa_content\"", xml);
+     //the document must still be terminated properly
+     assertContains("</body></html>", xml);
+
+     //content that exists only in the "regular" pdf must be absent
+     assertNotContained("Mount Rushmore National Memorial", xml);
+ }
+
private void assertException(String path, Parser parser, ParseContext context, Class expected) {
boolean noEx = false;
InputStream is = getResourceAsStream(path);
http://git-wip-us.apache.org/repos/asf/tika/blob/dbefe983/tika-parsers/src/test/resources/test-documents/testPDF_XFA_govdocs1_258578.pdf
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_XFA_govdocs1_258578.pdf b/tika-parsers/src/test/resources/test-documents/testPDF_XFA_govdocs1_258578.pdf
new file mode 100644
index 0000000..e3fb803
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPDF_XFA_govdocs1_258578.pdf differ