You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ti...@apache.org on 2020/12/14 04:47:02 UTC
[tika] branch branch_1x updated: TIKA-3248: avoid ClassCastException
This is an automated email from the ASF dual-hosted git repository.
tilman pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new cc87787 TIKA-3248: avoid ClassCastException
cc87787 is described below
commit cc87787f8899a1266d19b4220ed26413ccaa0a37
Author: Tilman Hausherr <ti...@snafu.de>
AuthorDate: Mon Dec 14 05:46:30 2020 +0100
TIKA-3248: avoid ClassCastException
---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 2006 ++++++++++----------
1 file changed, 1004 insertions(+), 1002 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index a079ce9..8b25304 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -1,1002 +1,1004 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pdf;
-
-import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
-
-import javax.xml.stream.XMLStreamException;
-import java.awt.image.BufferedImage;
-import java.io.BufferedInputStream;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.lang.reflect.InvocationTargetException;
-import java.lang.reflect.Method;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
-import java.util.Calendar;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.List;
-import java.util.ListIterator;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeMap;
-
-import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.commons.io.IOUtils;
-import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
-import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
-import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
-import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.pdmodel.PDPageTree;
-import org.apache.pdfbox.pdmodel.common.PDDestinationOrAction;
-import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
-import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
-import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
-import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification;
-import org.apache.pdfbox.pdmodel.common.filespecification.PDSimpleFileSpecification;
-import org.apache.pdfbox.pdmodel.font.PDFont;
-import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
-import org.apache.pdfbox.pdmodel.interactive.action.PDActionImportData;
-import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
-import org.apache.pdfbox.pdmodel.interactive.action.PDActionLaunch;
-import org.apache.pdfbox.pdmodel.interactive.action.PDActionRemoteGoTo;
-import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
-import org.apache.pdfbox.pdmodel.interactive.action.PDAnnotationAdditionalActions;
-import org.apache.pdfbox.pdmodel.interactive.action.PDDocumentCatalogAdditionalActions;
-import org.apache.pdfbox.pdmodel.interactive.action.PDFormFieldAdditionalActions;
-import org.apache.pdfbox.pdmodel.interactive.action.PDPageAdditionalActions;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget;
-import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
-import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
-import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
-import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
-import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
-import org.apache.pdfbox.pdmodel.interactive.form.PDField;
-import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
-import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
-import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource;
-import org.apache.pdfbox.rendering.PDFRenderer;
-import org.apache.pdfbox.text.PDFTextStripper;
-import org.apache.pdfbox.tools.imageio.ImageIOUtil;
-import org.apache.pdfbox.util.Matrix;
-import org.apache.pdfbox.util.Vector;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.EmbeddedDocumentUtil;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Font;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.PDF;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.ocr.TesseractOCRConfig;
-import org.apache.tika.parser.ocr.TesseractOCRParser;
-import org.apache.tika.parser.sas.SAS7BDATParser;
-import org.apache.tika.sax.EmbeddedContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-class AbstractPDF2XHTML extends PDFTextStripper {
-
- enum ActionTrigger {
- AFTER_DOCUMENT_PRINT,
- AFTER_DOCUMENT_SAVE,
- ANNOTATION_CURSOR_ENTERS,
- ANNOTATION_CURSOR_EXIT,
- ANNOTATION_LOSE_INPUT_FOCUS,
- ANNOTATION_MOUSE_CLICK,
- ANNOTATION_MOUSE_RELEASED,
- ANNOTATION_PAGE_CLOSED,
- ANNOTATION_PAGE_NO_LONGER_VISIBLE,
- ANNOTATION_PAGE_OPENED,
- ANNOTATION_PAGE_VISIBLE,
- ANNOTATION_RECEIVES_FOCUS,
- ANNOTATION_WIDGET,
- BEFORE_DOCUMENT_CLOSE,
- BEFORE_DOCUMENT_PRINT,
- BEFORE_DOCUMENT_SAVE,
- DOCUMENT_OPEN,
- FORM_FIELD,
- FORM_FIELD_FORMATTED,
- FORM_FIELD_KEYSTROKE,
- FORM_FIELD_RECALCULATE,
- FORM_FIELD_VALUE_CHANGE,
- PAGE_CLOSE,
- PAGE_OPEN, BOOKMARK,
- };
-
- /**
- * Maximum recursive depth during AcroForm processing.
- * Prevents theoretical AcroForm recursion bomb.
- */
- private final static int MAX_ACROFORM_RECURSIONS = 10;
-
- private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new TesseractOCRConfig();
-
- private static final MediaType XFA_MEDIA_TYPE = MediaType.application("vnd.adobe.xdp+xml");
- private static final MediaType XMP_MEDIA_TYPE = MediaType.application("rdf+xml");
-
- public static final String XMP_DOCUMENT_CATALOG_LOCATION = "documentCatalog";
- public static final String XMP_PAGE_LOCATION_PREFIX = "page ";
-
- /**
- * Format used for signature dates
- * TODO Make this thread-safe
- */
- private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
-
-
- final List<IOException> exceptions = new ArrayList<>();
- final PDDocument pdDocument;
- final XHTMLContentHandler xhtml;
- final ParseContext context;
- final Metadata metadata;
- final EmbeddedDocumentExtractor embeddedDocumentExtractor;
- final PDFParserConfig config;
- final TesseractOCRParser tesseractOCRParser;//can be null!
-
- //zero-based pageIndex
- int pageIndex = 0;
- int startPage = -1;//private in PDFTextStripper...must have own copy because we override processpages
- int unmappedUnicodeCharsPerPage = 0;
- int totalCharsPerPage = 0;
-
- private final Set<String> fontNames = new HashSet<>();
-
- AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context, Metadata metadata,
- PDFParserConfig config) throws IOException {
- this.pdDocument = pdDocument;
- this.xhtml = new XHTMLContentHandler(handler, metadata);
- this.context = context;
- this.metadata = metadata;
- this.config = config;
- embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
- if (config.getOcrStrategy() == NO_OCR) {
- tesseractOCRParser = null;
- } else {
- tesseractOCRParser = (TesseractOCRParser)EmbeddedDocumentUtil.tryToFindExistingLeafParser(TesseractOCRParser.class, context);
- }
- }
-
- @Override
- protected void startPage(PDPage page) throws IOException {
- try {
- xhtml.startElement("div", "class", "page");
- } catch (SAXException e) {
- throw new IOExceptionWithCause("Unable to start a page", e);
- }
- writeParagraphStart();
- }
-
- private void extractXMPXFA(PDDocument pdfDocument, Metadata parentMetadata, ParseContext context) throws IOException, SAXException {
- Set<MediaType> supportedTypes = Collections.EMPTY_SET;
- Parser embeddedParser = context.get(Parser.class);
- if (embeddedParser != null) {
- supportedTypes = embeddedParser.getSupportedTypes(context);
- }
-
- if (supportedTypes == null || supportedTypes.size() == 0) {
- return;
- }
-
- if (supportedTypes.contains(XMP_MEDIA_TYPE)) {
- //try the main metadata
- if (pdfDocument.getDocumentCatalog().getMetadata() != null) {
- try (InputStream is = pdfDocument.getDocumentCatalog().getMetadata().exportXMPMetadata()) {
- extractXMPAsEmbeddedFile(is, XMP_DOCUMENT_CATALOG_LOCATION);
- } catch (IOException e) {
- EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
- }
- }
- //now iterate through the pages
- int pageNumber = 1;
- for (PDPage page : pdfDocument.getPages()) {
- if (page.getMetadata() != null) {
- try (InputStream is = page.getMetadata().exportXMPMetadata()) {
- extractXMPAsEmbeddedFile(is, XMP_PAGE_LOCATION_PREFIX+pageNumber);
- } catch (IOException e) {
- EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
- }
- }
- pageNumber++;
- }
- }
-
- //now try the xfa
- if (pdfDocument.getDocumentCatalog().getAcroForm() != null &&
- pdfDocument.getDocumentCatalog().getAcroForm().getXFA() != null) {
-
- Metadata xfaMetadata = new Metadata();
- xfaMetadata.set(Metadata.CONTENT_TYPE, XFA_MEDIA_TYPE.toString());
- xfaMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
- if (embeddedDocumentExtractor.shouldParseEmbedded(xfaMetadata) &&
- supportedTypes.contains(XFA_MEDIA_TYPE)) {
- byte[] bytes = null;
- try {
- bytes = pdfDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes();
- } catch (IOException e) {
- EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
- }
- if (bytes != null) {
- try (InputStream is = new ByteArrayInputStream(bytes)) {
- parseMetadata(is, xfaMetadata);
- }
- }
- }
- }
- }
-
- private void extractXMPAsEmbeddedFile(InputStream is, String location) throws IOException, SAXException {
- if (is == null) {
- return;
- }
- Metadata xmpMetadata = new Metadata();
- xmpMetadata.set(Metadata.CONTENT_TYPE, XMP_MEDIA_TYPE.toString());
- xmpMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
- xmpMetadata.set(PDF.XMP_LOCATION, location);
- if (embeddedDocumentExtractor.shouldParseEmbedded(xmpMetadata)) {
- try {
- parseMetadata(is, xmpMetadata);
- } finally {
- org.apache.tika.io.IOUtils.closeQuietly(is);
- }
- }
-
- }
-
- private void parseMetadata(InputStream stream, Metadata embeddedMetadata) throws IOException, SAXException {
- try {
- embeddedDocumentExtractor.parseEmbedded(
- stream,
- new EmbeddedContentHandler(xhtml),
- embeddedMetadata, false);
- } catch (IOException e) {
- handleCatchableIOE(e);
- }
- }
-
- private void extractEmbeddedDocuments(PDDocument document)
- throws IOException, SAXException, TikaException {
- PDDocumentNameDictionary namesDictionary =
- new PDDocumentNameDictionary(document.getDocumentCatalog());
- PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
- if (efTree == null) {
- return;
- }
-
- Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames();
- //For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
- //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
- //If there is a need we could add a fully recursive search to find a non-null
- //Map<String, COSObjectable> that contains the doc info.
- if (embeddedFileNames != null) {
- processEmbeddedDocNames(embeddedFileNames);
- } else {
- List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
- if (kids == null) {
- return;
- }
- for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
- embeddedFileNames = node.getNames();
- if (embeddedFileNames != null) {
- processEmbeddedDocNames(embeddedFileNames);
- }
- }
- }
- }
-
- private void processDoc(String name, PDFileSpecification spec, AttributesImpl attributes) throws TikaException, SAXException, IOException {
- if (spec instanceof PDSimpleFileSpecification) {
- attributes.addAttribute("", "class", "class", "CDATA", "linked");
- attributes.addAttribute("", "id", "id", "CDATA", spec.getFile());
- xhtml.startElement("div", attributes);
- xhtml.endElement("div");
- } else if (spec instanceof PDComplexFileSpecification){
- if (attributes.getIndex("source") < 0) {
- attributes.addAttribute("", "source", "source", "CDATA", "attachment");
- }
- extractMultiOSPDEmbeddedFiles(name, (PDComplexFileSpecification)spec, attributes);
- }
- }
-
- private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames)
- throws IOException, SAXException, TikaException {
- if (embeddedFileNames == null || embeddedFileNames.isEmpty()) {
- return;
- }
-
- for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) {
- processDoc(ent.getKey(), ent.getValue(), new AttributesImpl());
- }
- }
-
- private void extractMultiOSPDEmbeddedFiles(String displayName,
- PDComplexFileSpecification spec, AttributesImpl attributes) throws IOException,
- SAXException, TikaException {
-
- if (spec == null) {
- return;
- }
- //current strategy is to pull all, not just first non-null
- extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
- spec.getFile(), spec.getEmbeddedFile(), attributes);
- extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
- spec.getFileMac(), spec.getEmbeddedFileMac(), attributes);
- extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
- spec.getFileDos(), spec.getEmbeddedFileDos(), attributes);
- extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
- spec.getFileUnix(), spec.getEmbeddedFileUnix(), attributes);
- }
-
- private void extractPDEmbeddedFile(String displayName, String unicodeFileName,
- String fileName, PDEmbeddedFile file, AttributesImpl attributes)
- throws SAXException, IOException, TikaException {
-
- if (file == null) {
- //skip silently
- return;
- }
-
- fileName = (fileName == null || "".equals(fileName.trim())) ? unicodeFileName : fileName;
- fileName = (fileName == null || "".equals(fileName.trim())) ? displayName : fileName;
-
- // TODO: other metadata?
- Metadata embeddedMetadata = new Metadata();
- embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
- embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
- embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
- embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
- TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
- embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
- if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
- return;
- }
- TikaInputStream stream = null;
- try {
- stream = TikaInputStream.get(file.createInputStream());
- } catch (IOException e) {
- //store this exception in the parent's metadata
- EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
- return;
- }
- try {
- embeddedDocumentExtractor.parseEmbedded(
- stream,
- new EmbeddedContentHandler(xhtml),
- embeddedMetadata, false);
-
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", fileName);
- xhtml.startElement("div", attributes);
- xhtml.endElement("div");
- } finally {
- IOUtils.closeQuietly(stream);
- }
-
- }
-
- void handleCatchableIOE(IOException e) throws IOException {
- if (config.getCatchIntermediateIOExceptions()) {
- if (e.getCause() instanceof SAXException && e.getCause().getMessage() != null &&
- e.getCause().getMessage().contains("Your document contained more than")) {
- //TODO -- is there a cleaner way of checking for:
- // WriteOutContentHandler.WriteLimitReachedException?
- throw e;
- }
-
- String msg = e.getMessage();
- if (msg == null) {
- msg = "IOException, no message";
- }
- metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, msg);
- exceptions.add(e);
- } else {
- throw e;
- }
- }
-
- void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
- if (config.getOcrStrategy().equals(NO_OCR)) {
- return;
- }
- TesseractOCRConfig tesseractConfig =
- context.get(TesseractOCRConfig.class, tesseractOCRParser.getDefaultConfig());
-
- if (! tesseractOCRParser.hasTesseract(tesseractConfig)) {
- throw new TikaException("Tesseract is not available. "+
- "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
- }
-
- PDFRenderer renderer = new PDFRenderer(pdDocument);
- TemporaryResources tmp = new TemporaryResources();
- try {
-
- int dpi = config.getOcrDPI();
- BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
- Path tmpFile = tmp.createTempFile();
- try (OutputStream os = Files.newOutputStream(tmpFile)) {
- //TODO: get output format from TesseractConfig
- ImageIOUtil.writeImage(image, config.getOcrImageFormatName(),
- os, dpi, config.getOcrImageQuality());
- }
- try (InputStream is = TikaInputStream.get(tmpFile)) {
- tesseractOCRParser.parseInline(is, xhtml, tesseractConfig);
- }
- } catch (IOException e) {
- handleCatchableIOE(e);
- } catch (SAXException e) {
- throw new IOExceptionWithCause("error writing OCR content from PDF", e);
- } finally {
- tmp.dispose();
- }
- }
-
- @Override
- protected void endPage(PDPage page) throws IOException {
- metadata.add(PDF.CHARACTERS_PER_PAGE, totalCharsPerPage);
- metadata.add(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE,
- unmappedUnicodeCharsPerPage);
-
-
- try {
- for (PDAnnotation annotation : page.getAnnotations()) {
-
- if (annotation instanceof PDAnnotationFileAttachment) {
- PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
- PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
- try {
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "source", "source", "CDATA", "annotation");
- extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes);
- } catch (SAXException e) {
- throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
- } catch (TikaException e) {
- throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
- } catch (IOException e) {
- handleCatchableIOE(e);
- }
- } else if (annotation instanceof PDAnnotationWidget) {
- handleWidget((PDAnnotationWidget)annotation);
- }
- // TODO: remove once PDFBOX-1143 is fixed:
- if (config.getExtractAnnotationText()) {
- PDActionURI uri = getActionURI(annotation);
- if (uri != null) {
- String link = uri.getURI();
- if (link != null && link.trim().length() > 0) {
- xhtml.startElement("div", "class", "annotation");
- xhtml.startElement("a", "href", link);
- xhtml.characters(link);
- xhtml.endElement("a");
- xhtml.endElement("div");
- }
- }
-
- if (annotation instanceof PDAnnotationMarkup) {
- PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
- String title = annotationMarkup.getTitlePopup();
- String subject = annotationMarkup.getSubject();
- String contents = annotationMarkup.getContents();
- // TODO: maybe also annotationMarkup.getRichContents()?
- if (title != null || subject != null || contents != null) {
- xhtml.startElement("div", "class", "annotation");
-
- if (title != null) {
- xhtml.startElement("div", "class", "annotationTitle");
- xhtml.characters(title);
- xhtml.endElement("div");
- }
-
- if (subject != null) {
- xhtml.startElement("div", "class", "annotationSubject");
- xhtml.characters(subject);
- xhtml.endElement("div");
- }
-
- if (contents != null) {
- xhtml.startElement("div", "class", "annotationContents");
- xhtml.characters(contents);
- xhtml.endElement("div");
- }
-
- xhtml.endElement("div");
- }
- }
- }
- }
- if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
- doOCROnCurrentPage();
- } else if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.AUTO)) {
- //TODO add more sophistication
- if (totalCharsPerPage < 10 || unmappedUnicodeCharsPerPage > 10) {
- doOCROnCurrentPage();
- }
- }
-
- PDPageAdditionalActions pageActions = page.getActions();
- if (pageActions != null) {
- handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE);
- handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN);
- }
- xhtml.endElement("div");
- } catch (SAXException|TikaException e) {
- throw new IOExceptionWithCause("Unable to end a page", e);
- } catch (IOException e) {
- handleCatchableIOE(e);
- } finally {
- totalCharsPerPage = 0;
- unmappedUnicodeCharsPerPage = 0;
- }
-
- if (config.getExtractFontNames()) {
-
- for (COSName n : page.getResources().getFontNames()) {
- PDFont font = page.getResources().getFont(n);
- if (font != null && font.getFontDescriptor() != null) {
- String fontName = font.getFontDescriptor().getFontName();
- if (fontName != null) {
- fontNames.add(fontName);
- }
- }
- }
- }
- }
-
- private void handleWidget(PDAnnotationWidget widget) throws TikaException, SAXException, IOException {
- if (widget == null) {
- return;
- }
- handleDestinationOrAction(widget.getAction(), ActionTrigger.ANNOTATION_WIDGET);
- PDAnnotationAdditionalActions annotationActions = widget.getActions();
- if (annotationActions != null) {
- handleDestinationOrAction(annotationActions.getBl(), ActionTrigger.ANNOTATION_LOSE_INPUT_FOCUS);
- handleDestinationOrAction(annotationActions.getD(), ActionTrigger.ANNOTATION_MOUSE_CLICK);
- handleDestinationOrAction(annotationActions.getE(), ActionTrigger.ANNOTATION_CURSOR_ENTERS);
- handleDestinationOrAction(annotationActions.getFo(), ActionTrigger.ANNOTATION_RECEIVES_FOCUS);
- handleDestinationOrAction(annotationActions.getPC(), ActionTrigger.ANNOTATION_PAGE_CLOSED);
- handleDestinationOrAction(annotationActions.getPI(), ActionTrigger.ANNOTATION_PAGE_NO_LONGER_VISIBLE);
- handleDestinationOrAction(annotationActions.getPO(), ActionTrigger.ANNOTATION_PAGE_OPENED);
- handleDestinationOrAction(annotationActions.getPV(), ActionTrigger.ANNOTATION_PAGE_VISIBLE);
- handleDestinationOrAction(annotationActions.getU(), ActionTrigger.ANNOTATION_MOUSE_RELEASED);
- handleDestinationOrAction(annotationActions.getX(), ActionTrigger.ANNOTATION_CURSOR_EXIT);
- }
-
- }
-
- @Override
- protected void startDocument(PDDocument pdf) throws IOException {
- try {
- xhtml.startDocument();
- try {
- handleDestinationOrAction(pdf.getDocumentCatalog().getOpenAction(), ActionTrigger.DOCUMENT_OPEN);
- } catch (IOException e) {
- //See PDFBOX-3773
- //swallow -- no need to report this
- }
- } catch (TikaException|SAXException e) {
- throw new IOExceptionWithCause("Unable to start a document", e);
- }
- }
-
- private void handleDestinationOrAction(PDDestinationOrAction action,
- ActionTrigger actionTrigger) throws IOException, SAXException, TikaException {
- if (action == null || ! config.getExtractActions()) {
- return;
- }
- AttributesImpl attributes = new AttributesImpl();
- String actionOrDestString = (action instanceof PDAction) ? "action" : "destination";
-
- addNonNullAttribute("class", actionOrDestString, attributes);
- addNonNullAttribute("type", action.getClass().getSimpleName(), attributes);
- addNonNullAttribute("trigger", actionTrigger.name(), attributes);
-
- if (action instanceof PDActionImportData) {
- processDoc("", ((PDActionImportData)action).getFile(), attributes);
- } else if (action instanceof PDActionLaunch) {
- PDActionLaunch pdActionLaunch = (PDActionLaunch)action;
- addNonNullAttribute("id", pdActionLaunch.getF(), attributes);
- addNonNullAttribute("defaultDirectory", pdActionLaunch.getD(), attributes);
- addNonNullAttribute("operation", pdActionLaunch.getO(), attributes);
- addNonNullAttribute("parameters", pdActionLaunch.getP(), attributes);
- processDoc(pdActionLaunch.getF(), pdActionLaunch.getFile(), attributes);
- } else if (action instanceof PDActionRemoteGoTo) {
- PDActionRemoteGoTo remoteGoTo = (PDActionRemoteGoTo)action;
- processDoc("", remoteGoTo.getFile(), attributes);
- } else if (action instanceof PDActionJavaScript) {
- PDActionJavaScript jsAction = (PDActionJavaScript)action;
- Metadata m = new Metadata();
- m.set(Metadata.CONTENT_TYPE, "application/javascript");
- m.set(Metadata.CONTENT_ENCODING, StandardCharsets.UTF_8.toString());
- m.set(PDF.ACTION_TRIGGER, actionTrigger.toString());
- m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.name());
- String js = jsAction.getAction();
- js = (js == null) ? "" : js;
- if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
- try (InputStream is = TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) {
- embeddedDocumentExtractor.parseEmbedded(is, xhtml, m, false);
- }
- }
- addNonNullAttribute("class", "javascript", attributes);
- addNonNullAttribute("type", jsAction.getType(), attributes);
- addNonNullAttribute("subtype", jsAction.getSubType(), attributes);
- xhtml.startElement("div", attributes);
- xhtml.endElement("div");
- } else {
- xhtml.startElement("div", attributes);
- xhtml.endElement("div");
- }
- }
-
- private static void addNonNullAttribute(String name, String value, AttributesImpl attributes) {
- if (name == null || value == null) {
- return;
- }
- attributes.addAttribute("", name, name, "CDATA", value);
- }
-
- @Override
- protected void endDocument(PDDocument pdf) throws IOException {
- try {
- // Extract text for any bookmarks:
- if(config.getExtractBookmarksText()) {
- extractBookmarkText();
- }
-
- try {
- extractEmbeddedDocuments(pdf);
- } catch (IOException e) {
- handleCatchableIOE(e);
- }
-
- extractXMPXFA(pdf, metadata, context);
-
- //extract acroform data at end of doc
- if (config.getExtractAcroFormContent() == true) {
- try {
- extractAcroForm(pdf);
- } catch (IOException e) {
- handleCatchableIOE(e);
- }
- }
- PDDocumentCatalogAdditionalActions additionalActions = pdf.getDocumentCatalog().getActions();
- handleDestinationOrAction(additionalActions.getDP(), ActionTrigger.AFTER_DOCUMENT_PRINT);
- handleDestinationOrAction(additionalActions.getDS(), ActionTrigger.AFTER_DOCUMENT_SAVE);
- handleDestinationOrAction(additionalActions.getWC(), ActionTrigger.BEFORE_DOCUMENT_CLOSE);
- handleDestinationOrAction(additionalActions.getWP(), ActionTrigger.BEFORE_DOCUMENT_PRINT);
- handleDestinationOrAction(additionalActions.getWS(), ActionTrigger.BEFORE_DOCUMENT_SAVE);
- xhtml.endDocument();
- } catch (TikaException e) {
- throw new IOExceptionWithCause("Unable to end a document", e);
- } catch (SAXException e) {
- throw new IOExceptionWithCause("Unable to end a document", e);
- }
- if (fontNames.size() > 0) {
- for (String fontName : fontNames) {
- metadata.add(Font.FONT_NAME, fontName);
- }
- }
- }
-
- void extractBookmarkText() throws SAXException, IOException, TikaException {
- PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline();
- if (outline != null) {
- extractBookmarkText(outline);
- }
- }
-
- void extractBookmarkText(PDOutlineNode bookmark) throws SAXException, IOException, TikaException {
- PDOutlineItem current = bookmark.getFirstChild();
-
- if (current != null) {
- xhtml.startElement("ul");
- while (current != null) {
- xhtml.startElement("li");
- xhtml.characters(current.getTitle());
- xhtml.endElement("li");
- handleDestinationOrAction(current.getAction(), ActionTrigger.BOOKMARK);
- // Recurse:
- extractBookmarkText(current);
- current = current.getNextSibling();
- }
- xhtml.endElement("ul");
- }
- }
-
- void extractAcroForm(PDDocument pdf) throws IOException,
- SAXException, TikaException {
- //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields
- //this code derives from Ben's code
- PDDocumentCatalog catalog = pdf.getDocumentCatalog();
-
- if (catalog == null)
- return;
-
- PDAcroForm form = catalog.getAcroForm();
- if (form == null)
- return;
-
- //if it has xfa, try that.
- //if it doesn't exist or there's an exception,
- //go with traditional AcroForm
- PDXFAResource pdxfa = form.getXFA();
-
- if (pdxfa != null) {
- //if successful, return
- XFAExtractor xfaExtractor = new XFAExtractor();
- InputStream is = null;
- try {
- is = new BufferedInputStream(
- new ByteArrayInputStream(pdxfa.getBytes()));
- } catch (IOException e) {
- EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
- }
- if (is != null) {
- try {
- xfaExtractor.extract(is, xhtml, metadata, context);
- return;
- } catch (XMLStreamException e) {
- //if there was an xml parse exception in xfa, try the AcroForm
- EmbeddedDocumentUtil.recordException(e, metadata);
- } finally {
- IOUtils.closeQuietly(is);
- }
- }
- }
-
- @SuppressWarnings("rawtypes")
- List fields = form.getFields();
-
- if (fields == null)
- return;
-
- @SuppressWarnings("rawtypes")
- ListIterator itr = fields.listIterator();
-
- if (itr == null)
- return;
-
- xhtml.startElement("div", "class", "acroform");
- xhtml.startElement("ol");
-
- while (itr.hasNext()) {
- Object obj = itr.next();
- if (obj != null && obj instanceof PDField) {
- processAcroField((PDField) obj, 0);
- }
- }
- xhtml.endElement("ol");
- xhtml.endElement("div");
- }
-
- private void processAcroField(PDField field, final int currentRecursiveDepth)
- throws SAXException, IOException, TikaException {
-
- if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
- return;
- }
-
- PDFormFieldAdditionalActions pdFormFieldAdditionalActions = field.getActions();
- if (pdFormFieldAdditionalActions != null) {
- handleDestinationOrAction(pdFormFieldAdditionalActions.getC(), ActionTrigger.FORM_FIELD_RECALCULATE);
- handleDestinationOrAction(pdFormFieldAdditionalActions.getF(), ActionTrigger.FORM_FIELD_FORMATTED);
- handleDestinationOrAction(pdFormFieldAdditionalActions.getK(), ActionTrigger.FORM_FIELD_KEYSTROKE);
- handleDestinationOrAction(pdFormFieldAdditionalActions.getV(), ActionTrigger.FORM_FIELD_VALUE_CHANGE);
- }
- if (field.getWidgets() != null) {
- for (PDAnnotationWidget widget : field.getWidgets()) {
- handleWidget(widget);
- }
- }
-
-
- addFieldString(field);
- if (field instanceof PDNonTerminalField) {
- int r = currentRecursiveDepth + 1;
- xhtml.startElement("ol");
- for (PDField child : ((PDNonTerminalField)field).getChildren()) {
- processAcroField(child, r);
- }
- xhtml.endElement("ol");
- }
- }
-
- private void addFieldString(PDField field) throws SAXException {
- //Pick partial name to present in content and altName for attribute
- //Ignoring FullyQualifiedName for now
- String partName = field.getPartialName();
- String altName = field.getAlternateFieldName();
-
- StringBuilder sb = new StringBuilder();
- AttributesImpl attrs = new AttributesImpl();
-
- if (partName != null) {
- sb.append(partName).append(": ");
- }
- if (altName != null) {
- attrs.addAttribute("", "altName", "altName", "CDATA", altName);
- }
- //return early if PDSignature field
- if (field instanceof PDSignatureField) {
- handleSignature(attrs, (PDSignatureField) field);
- return;
- }
- String value = field.getValueAsString();
- if (value != null && !value.equals("null")) {
- sb.append(value);
- }
-
- if (attrs.getLength() > 0 || sb.length() > 0) {
- xhtml.startElement("li", attrs);
- xhtml.characters(sb.toString());
- xhtml.endElement("li");
- }
- }
-
- private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField)
- throws SAXException {
-
- PDSignature sig = sigField.getSignature();
- if (sig == null) {
- return;
- }
- Map<String, String> vals = new TreeMap<>();
- vals.put("name", sig.getName());
- vals.put("contactInfo", sig.getContactInfo());
- vals.put("location", sig.getLocation());
- vals.put("reason", sig.getReason());
-
- Calendar cal = sig.getSignDate();
- if (cal != null) {
- dateFormat.setTimeZone(cal.getTimeZone());
- vals.put("date", dateFormat.format(cal.getTime()));
- }
- //see if there is any data
- int nonNull = 0;
- for (String val : vals.keySet()) {
- if (val != null && !val.equals("")) {
- nonNull++;
- }
- }
- //if there is, process it
- if (nonNull > 0) {
- metadata.set(TikaCoreProperties.HAS_SIGNATURE, "true");
- xhtml.startElement("li", parentAttributes);
-
- AttributesImpl attrs = new AttributesImpl();
- attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");
-
- xhtml.startElement("ol", attrs);
- for (Map.Entry<String, String> e : vals.entrySet()) {
- if (e.getValue() == null || e.getValue().equals("")) {
- continue;
- }
- attrs = new AttributesImpl();
- attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey());
- xhtml.startElement("li", attrs);
- xhtml.characters(e.getValue());
- xhtml.endElement("li");
- }
- xhtml.endElement("ol");
- xhtml.endElement("li");
- }
- }
-
-
- private static PDActionURI getActionURI(PDAnnotation annot) {
- //copied and pasted from PDFBox's PrintURLs
-
- // use reflection to catch all annotation types that have getAction()
- // If you can't use reflection, then check for classes
- // PDAnnotationLink and PDAnnotationWidget, and call getAction() and check for a
- // PDActionURI result type
- try {
- Method actionMethod = annot.getClass().getDeclaredMethod("getAction");
- if (actionMethod.getReturnType().equals(PDAction.class)) {
- PDAction action = (PDAction) actionMethod.invoke(annot);
- if (action instanceof PDActionURI) {
- return (PDActionURI) action;
- }
- }
- }
- catch (NoSuchMethodException|IllegalAccessException|InvocationTargetException e) {
- }
- return null;
- }
-
- /**
- * we need to override this because we are overriding {@link #processPages(PDPageTree)}
- * @return
- */
- @Override
- public int getCurrentPageNo() {
- return pageIndex+1;
- }
-
- /**
- * See TIKA-2845 for why we need to override this.
- *
- * @param pages
- * @throws IOException
- */
- @Override
- protected void processPages(PDPageTree pages) throws IOException {
- //we currently need this hack because we aren't able to increment
- //the private currentPageNo in PDFTextStripper,
- //and PDFTextStripper's processPage relies on that variable
- //being >= startPage when deciding whether or not to process a page
- // See:
- // if (currentPageNo >= startPage && currentPageNo <= endPage
- // && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber)
- // && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber))
- // {
- super.setStartPage(-1);
- for (PDPage page : pages) {
- if (getCurrentPageNo() >= getStartPage()
- && getCurrentPageNo() <= getEndPage()) {
- processPage(page);
- }
- pageIndex++;
- }
- }
-
- @Override
- public void setStartBookmark(PDOutlineItem pdOutlineItem) {
- throw new UnsupportedOperationException("We don't currently support this -- See PDFTextStripper's processPages() for how to implement this.");
- }
-
- @Override
- public void setEndBookmark(PDOutlineItem pdOutlineItem) {
- throw new UnsupportedOperationException("We don't currently support this -- See PDFTextStripper's processPages() for how to implement this.");
- }
-
- @Override
- public void setStartPage(int startPage) {
- this.startPage = startPage;
- }
-
- @Override
- public int getStartPage() {
- return startPage;
- }
-
- @Override
- protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode, Vector displacement) throws IOException
- {
- super.showGlyph(textRenderingMatrix, font, code, unicode, displacement);
- if (unicode == null || unicode.isEmpty()) {
- unmappedUnicodeCharsPerPage++;
- }
- totalCharsPerPage++;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
+
+import javax.xml.stream.XMLStreamException;
+import java.awt.image.BufferedImage;
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.commons.io.IOUtils;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
+import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDPageTree;
+import org.apache.pdfbox.pdmodel.common.PDDestinationOrAction;
+import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDSimpleFileSpecification;
+import org.apache.pdfbox.pdmodel.font.PDFont;
+import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionImportData;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionLaunch;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionRemoteGoTo;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
+import org.apache.pdfbox.pdmodel.interactive.action.PDAnnotationAdditionalActions;
+import org.apache.pdfbox.pdmodel.interactive.action.PDDocumentCatalogAdditionalActions;
+import org.apache.pdfbox.pdmodel.interactive.action.PDFormFieldAdditionalActions;
+import org.apache.pdfbox.pdmodel.interactive.action.PDPageAdditionalActions;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget;
+import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
+import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
+import org.apache.pdfbox.pdmodel.interactive.form.PDField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+import org.apache.pdfbox.util.Matrix;
+import org.apache.pdfbox.util.Vector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Font;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PDF;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.parser.sas.SAS7BDATParser;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+class AbstractPDF2XHTML extends PDFTextStripper {
+
+ enum ActionTrigger {
+ AFTER_DOCUMENT_PRINT,
+ AFTER_DOCUMENT_SAVE,
+ ANNOTATION_CURSOR_ENTERS,
+ ANNOTATION_CURSOR_EXIT,
+ ANNOTATION_LOSE_INPUT_FOCUS,
+ ANNOTATION_MOUSE_CLICK,
+ ANNOTATION_MOUSE_RELEASED,
+ ANNOTATION_PAGE_CLOSED,
+ ANNOTATION_PAGE_NO_LONGER_VISIBLE,
+ ANNOTATION_PAGE_OPENED,
+ ANNOTATION_PAGE_VISIBLE,
+ ANNOTATION_RECEIVES_FOCUS,
+ ANNOTATION_WIDGET,
+ BEFORE_DOCUMENT_CLOSE,
+ BEFORE_DOCUMENT_PRINT,
+ BEFORE_DOCUMENT_SAVE,
+ DOCUMENT_OPEN,
+ FORM_FIELD,
+ FORM_FIELD_FORMATTED,
+ FORM_FIELD_KEYSTROKE,
+ FORM_FIELD_RECALCULATE,
+ FORM_FIELD_VALUE_CHANGE,
+ PAGE_CLOSE,
+ PAGE_OPEN, BOOKMARK,
+ };
+
+ /**
+ * Maximum recursive depth during AcroForm processing.
+ * Prevents theoretical AcroForm recursion bomb.
+ */
+ private final static int MAX_ACROFORM_RECURSIONS = 10;
+
+ private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new TesseractOCRConfig();
+
+ private static final MediaType XFA_MEDIA_TYPE = MediaType.application("vnd.adobe.xdp+xml");
+ private static final MediaType XMP_MEDIA_TYPE = MediaType.application("rdf+xml");
+
+ public static final String XMP_DOCUMENT_CATALOG_LOCATION = "documentCatalog";
+ public static final String XMP_PAGE_LOCATION_PREFIX = "page ";
+
+ /**
+ * Format used for signature dates
+ * TODO Make this thread-safe
+ */
+ private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
+
+
+ final List<IOException> exceptions = new ArrayList<>();
+ final PDDocument pdDocument;
+ final XHTMLContentHandler xhtml;
+ final ParseContext context;
+ final Metadata metadata;
+ final EmbeddedDocumentExtractor embeddedDocumentExtractor;
+ final PDFParserConfig config;
+ final TesseractOCRParser tesseractOCRParser;//can be null!
+
+ //zero-based pageIndex
+ int pageIndex = 0;
+ int startPage = -1;//private in PDFTextStripper...must have own copy because we override processpages
+ int unmappedUnicodeCharsPerPage = 0;
+ int totalCharsPerPage = 0;
+
+ private final Set<String> fontNames = new HashSet<>();
+
+ AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context, Metadata metadata,
+ PDFParserConfig config) throws IOException {
+ this.pdDocument = pdDocument;
+ this.xhtml = new XHTMLContentHandler(handler, metadata);
+ this.context = context;
+ this.metadata = metadata;
+ this.config = config;
+ embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+ if (config.getOcrStrategy() == NO_OCR) {
+ tesseractOCRParser = null;
+ } else {
+ tesseractOCRParser = (TesseractOCRParser)EmbeddedDocumentUtil.tryToFindExistingLeafParser(TesseractOCRParser.class, context);
+ }
+ }
+
+ @Override
+ protected void startPage(PDPage page) throws IOException {
+ try {
+ xhtml.startElement("div", "class", "page");
+ } catch (SAXException e) {
+ throw new IOExceptionWithCause("Unable to start a page", e);
+ }
+ writeParagraphStart();
+ }
+
+ private void extractXMPXFA(PDDocument pdfDocument, Metadata parentMetadata, ParseContext context) throws IOException, SAXException {
+ Set<MediaType> supportedTypes = Collections.EMPTY_SET;
+ Parser embeddedParser = context.get(Parser.class);
+ if (embeddedParser != null) {
+ supportedTypes = embeddedParser.getSupportedTypes(context);
+ }
+
+ if (supportedTypes == null || supportedTypes.size() == 0) {
+ return;
+ }
+
+ if (supportedTypes.contains(XMP_MEDIA_TYPE)) {
+ //try the main metadata
+ if (pdfDocument.getDocumentCatalog().getMetadata() != null) {
+ try (InputStream is = pdfDocument.getDocumentCatalog().getMetadata().exportXMPMetadata()) {
+ extractXMPAsEmbeddedFile(is, XMP_DOCUMENT_CATALOG_LOCATION);
+ } catch (IOException e) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+ }
+ }
+ //now iterate through the pages
+ int pageNumber = 1;
+ for (PDPage page : pdfDocument.getPages()) {
+ if (page.getMetadata() != null) {
+ try (InputStream is = page.getMetadata().exportXMPMetadata()) {
+ extractXMPAsEmbeddedFile(is, XMP_PAGE_LOCATION_PREFIX+pageNumber);
+ } catch (IOException e) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+ }
+ }
+ pageNumber++;
+ }
+ }
+
+ //now try the xfa
+ if (pdfDocument.getDocumentCatalog().getAcroForm() != null &&
+ pdfDocument.getDocumentCatalog().getAcroForm().getXFA() != null) {
+
+ Metadata xfaMetadata = new Metadata();
+ xfaMetadata.set(Metadata.CONTENT_TYPE, XFA_MEDIA_TYPE.toString());
+ xfaMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
+ if (embeddedDocumentExtractor.shouldParseEmbedded(xfaMetadata) &&
+ supportedTypes.contains(XFA_MEDIA_TYPE)) {
+ byte[] bytes = null;
+ try {
+ bytes = pdfDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes();
+ } catch (IOException e) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+ }
+ if (bytes != null) {
+ try (InputStream is = new ByteArrayInputStream(bytes)) {
+ parseMetadata(is, xfaMetadata);
+ }
+ }
+ }
+ }
+ }
+
+ private void extractXMPAsEmbeddedFile(InputStream is, String location) throws IOException, SAXException {
+ if (is == null) {
+ return;
+ }
+ Metadata xmpMetadata = new Metadata();
+ xmpMetadata.set(Metadata.CONTENT_TYPE, XMP_MEDIA_TYPE.toString());
+ xmpMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
+ xmpMetadata.set(PDF.XMP_LOCATION, location);
+ if (embeddedDocumentExtractor.shouldParseEmbedded(xmpMetadata)) {
+ try {
+ parseMetadata(is, xmpMetadata);
+ } finally {
+ org.apache.tika.io.IOUtils.closeQuietly(is);
+ }
+ }
+
+ }
+
+ private void parseMetadata(InputStream stream, Metadata embeddedMetadata) throws IOException, SAXException {
+ try {
+ embeddedDocumentExtractor.parseEmbedded(
+ stream,
+ new EmbeddedContentHandler(xhtml),
+ embeddedMetadata, false);
+ } catch (IOException e) {
+ handleCatchableIOE(e);
+ }
+ }
+
+ private void extractEmbeddedDocuments(PDDocument document)
+ throws IOException, SAXException, TikaException {
+ PDDocumentNameDictionary namesDictionary =
+ new PDDocumentNameDictionary(document.getDocumentCatalog());
+ PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
+ if (efTree == null) {
+ return;
+ }
+
+ Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames();
+ //For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
+ //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
+ //If there is a need we could add a fully recursive search to find a non-null
+ //Map<String, COSObjectable> that contains the doc info.
+ if (embeddedFileNames != null) {
+ processEmbeddedDocNames(embeddedFileNames);
+ } else {
+ List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
+ if (kids == null) {
+ return;
+ }
+ for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
+ embeddedFileNames = node.getNames();
+ if (embeddedFileNames != null) {
+ processEmbeddedDocNames(embeddedFileNames);
+ }
+ }
+ }
+ }
+
+ private void processDoc(String name, PDFileSpecification spec, AttributesImpl attributes) throws TikaException, SAXException, IOException {
+ if (spec instanceof PDSimpleFileSpecification) {
+ attributes.addAttribute("", "class", "class", "CDATA", "linked");
+ attributes.addAttribute("", "id", "id", "CDATA", spec.getFile());
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+ } else if (spec instanceof PDComplexFileSpecification){
+ if (attributes.getIndex("source") < 0) {
+ attributes.addAttribute("", "source", "source", "CDATA", "attachment");
+ }
+ extractMultiOSPDEmbeddedFiles(name, (PDComplexFileSpecification)spec, attributes);
+ }
+ }
+
+ private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames)
+ throws IOException, SAXException, TikaException {
+ if (embeddedFileNames == null || embeddedFileNames.isEmpty()) {
+ return;
+ }
+
+ for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) {
+ processDoc(ent.getKey(), ent.getValue(), new AttributesImpl());
+ }
+ }
+
+ private void extractMultiOSPDEmbeddedFiles(String displayName,
+ PDComplexFileSpecification spec, AttributesImpl attributes) throws IOException,
+ SAXException, TikaException {
+
+ if (spec == null) {
+ return;
+ }
+ //current strategy is to pull all, not just first non-null
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
+ spec.getFile(), spec.getEmbeddedFile(), attributes);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
+ spec.getFileMac(), spec.getEmbeddedFileMac(), attributes);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
+ spec.getFileDos(), spec.getEmbeddedFileDos(), attributes);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
+ spec.getFileUnix(), spec.getEmbeddedFileUnix(), attributes);
+ }
+
+ private void extractPDEmbeddedFile(String displayName, String unicodeFileName,
+ String fileName, PDEmbeddedFile file, AttributesImpl attributes)
+ throws SAXException, IOException, TikaException {
+
+ if (file == null) {
+ //skip silently
+ return;
+ }
+
+ fileName = (fileName == null || "".equals(fileName.trim())) ? unicodeFileName : fileName;
+ fileName = (fileName == null || "".equals(fileName.trim())) ? displayName : fileName;
+
+ // TODO: other metadata?
+ Metadata embeddedMetadata = new Metadata();
+ embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+ embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+ embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+ embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
+ embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
+ if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
+ return;
+ }
+ TikaInputStream stream = null;
+ try {
+ stream = TikaInputStream.get(file.createInputStream());
+ } catch (IOException e) {
+ //store this exception in the parent's metadata
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
+ return;
+ }
+ try {
+ embeddedDocumentExtractor.parseEmbedded(
+ stream,
+ new EmbeddedContentHandler(xhtml),
+ embeddedMetadata, false);
+
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", fileName);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+
+ }
+
+ void handleCatchableIOE(IOException e) throws IOException {
+ if (config.getCatchIntermediateIOExceptions()) {
+ if (e.getCause() instanceof SAXException && e.getCause().getMessage() != null &&
+ e.getCause().getMessage().contains("Your document contained more than")) {
+ //TODO -- is there a cleaner way of checking for:
+ // WriteOutContentHandler.WriteLimitReachedException?
+ throw e;
+ }
+
+ String msg = e.getMessage();
+ if (msg == null) {
+ msg = "IOException, no message";
+ }
+ metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, msg);
+ exceptions.add(e);
+ } else {
+ throw e;
+ }
+ }
+
+ void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
+ if (config.getOcrStrategy().equals(NO_OCR)) {
+ return;
+ }
+ TesseractOCRConfig tesseractConfig =
+ context.get(TesseractOCRConfig.class, tesseractOCRParser.getDefaultConfig());
+
+ if (! tesseractOCRParser.hasTesseract(tesseractConfig)) {
+ throw new TikaException("Tesseract is not available. "+
+ "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
+ }
+
+ PDFRenderer renderer = new PDFRenderer(pdDocument);
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+
+ int dpi = config.getOcrDPI();
+ BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
+ Path tmpFile = tmp.createTempFile();
+ try (OutputStream os = Files.newOutputStream(tmpFile)) {
+ //TODO: get output format from TesseractConfig
+ ImageIOUtil.writeImage(image, config.getOcrImageFormatName(),
+ os, dpi, config.getOcrImageQuality());
+ }
+ try (InputStream is = TikaInputStream.get(tmpFile)) {
+ tesseractOCRParser.parseInline(is, xhtml, tesseractConfig);
+ }
+ } catch (IOException e) {
+ handleCatchableIOE(e);
+ } catch (SAXException e) {
+ throw new IOExceptionWithCause("error writing OCR content from PDF", e);
+ } finally {
+ tmp.dispose();
+ }
+ }
+
+ @Override
+ protected void endPage(PDPage page) throws IOException {
+ metadata.add(PDF.CHARACTERS_PER_PAGE, totalCharsPerPage);
+ metadata.add(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE,
+ unmappedUnicodeCharsPerPage);
+
+
+ try {
+ for (PDAnnotation annotation : page.getAnnotations()) {
+
+ if (annotation instanceof PDAnnotationFileAttachment) {
+ PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
+ if (fann.getFile() instanceof PDComplexFileSpecification) {
+ PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
+ try {
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "source", "source", "CDATA", "annotation");
+ extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes);
+ } catch (SAXException e) {
+ throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
+ } catch (TikaException e) {
+ throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
+ } catch (IOException e) {
+ handleCatchableIOE(e);
+ }
+ }
+ } else if (annotation instanceof PDAnnotationWidget) {
+ handleWidget((PDAnnotationWidget)annotation);
+ }
+ // TODO: remove once PDFBOX-1143 is fixed:
+ if (config.getExtractAnnotationText()) {
+ PDActionURI uri = getActionURI(annotation);
+ if (uri != null) {
+ String link = uri.getURI();
+ if (link != null && link.trim().length() > 0) {
+ xhtml.startElement("div", "class", "annotation");
+ xhtml.startElement("a", "href", link);
+ xhtml.characters(link);
+ xhtml.endElement("a");
+ xhtml.endElement("div");
+ }
+ }
+
+ if (annotation instanceof PDAnnotationMarkup) {
+ PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
+ String title = annotationMarkup.getTitlePopup();
+ String subject = annotationMarkup.getSubject();
+ String contents = annotationMarkup.getContents();
+ // TODO: maybe also annotationMarkup.getRichContents()?
+ if (title != null || subject != null || contents != null) {
+ xhtml.startElement("div", "class", "annotation");
+
+ if (title != null) {
+ xhtml.startElement("div", "class", "annotationTitle");
+ xhtml.characters(title);
+ xhtml.endElement("div");
+ }
+
+ if (subject != null) {
+ xhtml.startElement("div", "class", "annotationSubject");
+ xhtml.characters(subject);
+ xhtml.endElement("div");
+ }
+
+ if (contents != null) {
+ xhtml.startElement("div", "class", "annotationContents");
+ xhtml.characters(contents);
+ xhtml.endElement("div");
+ }
+
+ xhtml.endElement("div");
+ }
+ }
+ }
+ }
+ if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
+ doOCROnCurrentPage();
+ } else if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.AUTO)) {
+ //TODO add more sophistication
+ if (totalCharsPerPage < 10 || unmappedUnicodeCharsPerPage > 10) {
+ doOCROnCurrentPage();
+ }
+ }
+
+ PDPageAdditionalActions pageActions = page.getActions();
+ if (pageActions != null) {
+ handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE);
+ handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN);
+ }
+ xhtml.endElement("div");
+ } catch (SAXException|TikaException e) {
+ throw new IOExceptionWithCause("Unable to end a page", e);
+ } catch (IOException e) {
+ handleCatchableIOE(e);
+ } finally {
+ totalCharsPerPage = 0;
+ unmappedUnicodeCharsPerPage = 0;
+ }
+
+ if (config.getExtractFontNames()) {
+
+ for (COSName n : page.getResources().getFontNames()) {
+ PDFont font = page.getResources().getFont(n);
+ if (font != null && font.getFontDescriptor() != null) {
+ String fontName = font.getFontDescriptor().getFontName();
+ if (fontName != null) {
+ fontNames.add(fontName);
+ }
+ }
+ }
+ }
+ }
+
+ private void handleWidget(PDAnnotationWidget widget) throws TikaException, SAXException, IOException {
+ if (widget == null) {
+ return;
+ }
+ handleDestinationOrAction(widget.getAction(), ActionTrigger.ANNOTATION_WIDGET);
+ PDAnnotationAdditionalActions annotationActions = widget.getActions();
+ if (annotationActions != null) {
+ handleDestinationOrAction(annotationActions.getBl(), ActionTrigger.ANNOTATION_LOSE_INPUT_FOCUS);
+ handleDestinationOrAction(annotationActions.getD(), ActionTrigger.ANNOTATION_MOUSE_CLICK);
+ handleDestinationOrAction(annotationActions.getE(), ActionTrigger.ANNOTATION_CURSOR_ENTERS);
+ handleDestinationOrAction(annotationActions.getFo(), ActionTrigger.ANNOTATION_RECEIVES_FOCUS);
+ handleDestinationOrAction(annotationActions.getPC(), ActionTrigger.ANNOTATION_PAGE_CLOSED);
+ handleDestinationOrAction(annotationActions.getPI(), ActionTrigger.ANNOTATION_PAGE_NO_LONGER_VISIBLE);
+ handleDestinationOrAction(annotationActions.getPO(), ActionTrigger.ANNOTATION_PAGE_OPENED);
+ handleDestinationOrAction(annotationActions.getPV(), ActionTrigger.ANNOTATION_PAGE_VISIBLE);
+ handleDestinationOrAction(annotationActions.getU(), ActionTrigger.ANNOTATION_MOUSE_RELEASED);
+ handleDestinationOrAction(annotationActions.getX(), ActionTrigger.ANNOTATION_CURSOR_EXIT);
+ }
+
+ }
+
+ @Override
+ protected void startDocument(PDDocument pdf) throws IOException {
+ try {
+ xhtml.startDocument();
+ try {
+ handleDestinationOrAction(pdf.getDocumentCatalog().getOpenAction(), ActionTrigger.DOCUMENT_OPEN);
+ } catch (IOException e) {
+ //See PDFBOX-3773
+ //swallow -- no need to report this
+ }
+ } catch (TikaException|SAXException e) {
+ throw new IOExceptionWithCause("Unable to start a document", e);
+ }
+ }
+
+ private void handleDestinationOrAction(PDDestinationOrAction action,
+ ActionTrigger actionTrigger) throws IOException, SAXException, TikaException {
+ if (action == null || ! config.getExtractActions()) {
+ return;
+ }
+ AttributesImpl attributes = new AttributesImpl();
+ String actionOrDestString = (action instanceof PDAction) ? "action" : "destination";
+
+ addNonNullAttribute("class", actionOrDestString, attributes);
+ addNonNullAttribute("type", action.getClass().getSimpleName(), attributes);
+ addNonNullAttribute("trigger", actionTrigger.name(), attributes);
+
+ if (action instanceof PDActionImportData) {
+ processDoc("", ((PDActionImportData)action).getFile(), attributes);
+ } else if (action instanceof PDActionLaunch) {
+ PDActionLaunch pdActionLaunch = (PDActionLaunch)action;
+ addNonNullAttribute("id", pdActionLaunch.getF(), attributes);
+ addNonNullAttribute("defaultDirectory", pdActionLaunch.getD(), attributes);
+ addNonNullAttribute("operation", pdActionLaunch.getO(), attributes);
+ addNonNullAttribute("parameters", pdActionLaunch.getP(), attributes);
+ processDoc(pdActionLaunch.getF(), pdActionLaunch.getFile(), attributes);
+ } else if (action instanceof PDActionRemoteGoTo) {
+ PDActionRemoteGoTo remoteGoTo = (PDActionRemoteGoTo)action;
+ processDoc("", remoteGoTo.getFile(), attributes);
+ } else if (action instanceof PDActionJavaScript) {
+ PDActionJavaScript jsAction = (PDActionJavaScript)action;
+ Metadata m = new Metadata();
+ m.set(Metadata.CONTENT_TYPE, "application/javascript");
+ m.set(Metadata.CONTENT_ENCODING, StandardCharsets.UTF_8.toString());
+ m.set(PDF.ACTION_TRIGGER, actionTrigger.toString());
+ m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.name());
+ String js = jsAction.getAction();
+ js = (js == null) ? "" : js;
+ if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
+ try (InputStream is = TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) {
+ embeddedDocumentExtractor.parseEmbedded(is, xhtml, m, false);
+ }
+ }
+ addNonNullAttribute("class", "javascript", attributes);
+ addNonNullAttribute("type", jsAction.getType(), attributes);
+ addNonNullAttribute("subtype", jsAction.getSubType(), attributes);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+ } else {
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+ }
+ }
+
+ private static void addNonNullAttribute(String name, String value, AttributesImpl attributes) {
+ if (name == null || value == null) {
+ return;
+ }
+ attributes.addAttribute("", name, name, "CDATA", value);
+ }
+
+ @Override
+ protected void endDocument(PDDocument pdf) throws IOException {
+ try {
+ // Extract text for any bookmarks:
+ if(config.getExtractBookmarksText()) {
+ extractBookmarkText();
+ }
+
+ try {
+ extractEmbeddedDocuments(pdf);
+ } catch (IOException e) {
+ handleCatchableIOE(e);
+ }
+
+ extractXMPXFA(pdf, metadata, context);
+
+ //extract acroform data at end of doc
+ if (config.getExtractAcroFormContent() == true) {
+ try {
+ extractAcroForm(pdf);
+ } catch (IOException e) {
+ handleCatchableIOE(e);
+ }
+ }
+ PDDocumentCatalogAdditionalActions additionalActions = pdf.getDocumentCatalog().getActions();
+ handleDestinationOrAction(additionalActions.getDP(), ActionTrigger.AFTER_DOCUMENT_PRINT);
+ handleDestinationOrAction(additionalActions.getDS(), ActionTrigger.AFTER_DOCUMENT_SAVE);
+ handleDestinationOrAction(additionalActions.getWC(), ActionTrigger.BEFORE_DOCUMENT_CLOSE);
+ handleDestinationOrAction(additionalActions.getWP(), ActionTrigger.BEFORE_DOCUMENT_PRINT);
+ handleDestinationOrAction(additionalActions.getWS(), ActionTrigger.BEFORE_DOCUMENT_SAVE);
+ xhtml.endDocument();
+ } catch (TikaException e) {
+ throw new IOExceptionWithCause("Unable to end a document", e);
+ } catch (SAXException e) {
+ throw new IOExceptionWithCause("Unable to end a document", e);
+ }
+ if (fontNames.size() > 0) {
+ for (String fontName : fontNames) {
+ metadata.add(Font.FONT_NAME, fontName);
+ }
+ }
+ }
+
+ void extractBookmarkText() throws SAXException, IOException, TikaException {
+ PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline();
+ if (outline != null) {
+ extractBookmarkText(outline);
+ }
+ }
+
+ void extractBookmarkText(PDOutlineNode bookmark) throws SAXException, IOException, TikaException {
+ PDOutlineItem current = bookmark.getFirstChild();
+
+ if (current != null) {
+ xhtml.startElement("ul");
+ while (current != null) {
+ xhtml.startElement("li");
+ xhtml.characters(current.getTitle());
+ xhtml.endElement("li");
+ handleDestinationOrAction(current.getAction(), ActionTrigger.BOOKMARK);
+ // Recurse:
+ extractBookmarkText(current);
+ current = current.getNextSibling();
+ }
+ xhtml.endElement("ul");
+ }
+ }
+
+ void extractAcroForm(PDDocument pdf) throws IOException,
+ SAXException, TikaException {
+ //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields
+ //this code derives from Ben's code
+ PDDocumentCatalog catalog = pdf.getDocumentCatalog();
+
+ if (catalog == null)
+ return;
+
+ PDAcroForm form = catalog.getAcroForm();
+ if (form == null)
+ return;
+
+ //if it has xfa, try that.
+ //if it doesn't exist or there's an exception,
+ //go with traditional AcroForm
+ PDXFAResource pdxfa = form.getXFA();
+
+ if (pdxfa != null) {
+ //if successful, return
+ XFAExtractor xfaExtractor = new XFAExtractor();
+ InputStream is = null;
+ try {
+ is = new BufferedInputStream(
+ new ByteArrayInputStream(pdxfa.getBytes()));
+ } catch (IOException e) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
+ }
+ if (is != null) {
+ try {
+ xfaExtractor.extract(is, xhtml, metadata, context);
+ return;
+ } catch (XMLStreamException e) {
+ //if there was an xml parse exception in xfa, try the AcroForm
+ EmbeddedDocumentUtil.recordException(e, metadata);
+ } finally {
+ IOUtils.closeQuietly(is);
+ }
+ }
+ }
+
+ @SuppressWarnings("rawtypes")
+ List fields = form.getFields();
+
+ if (fields == null)
+ return;
+
+ @SuppressWarnings("rawtypes")
+ ListIterator itr = fields.listIterator();
+
+ if (itr == null)
+ return;
+
+ xhtml.startElement("div", "class", "acroform");
+ xhtml.startElement("ol");
+
+ while (itr.hasNext()) {
+ Object obj = itr.next();
+ if (obj != null && obj instanceof PDField) {
+ processAcroField((PDField) obj, 0);
+ }
+ }
+ xhtml.endElement("ol");
+ xhtml.endElement("div");
+ }
+
+ private void processAcroField(PDField field, final int currentRecursiveDepth)
+ throws SAXException, IOException, TikaException {
+
+ if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
+ return;
+ }
+
+ PDFormFieldAdditionalActions pdFormFieldAdditionalActions = field.getActions();
+ if (pdFormFieldAdditionalActions != null) {
+ handleDestinationOrAction(pdFormFieldAdditionalActions.getC(), ActionTrigger.FORM_FIELD_RECALCULATE);
+ handleDestinationOrAction(pdFormFieldAdditionalActions.getF(), ActionTrigger.FORM_FIELD_FORMATTED);
+ handleDestinationOrAction(pdFormFieldAdditionalActions.getK(), ActionTrigger.FORM_FIELD_KEYSTROKE);
+ handleDestinationOrAction(pdFormFieldAdditionalActions.getV(), ActionTrigger.FORM_FIELD_VALUE_CHANGE);
+ }
+ if (field.getWidgets() != null) {
+ for (PDAnnotationWidget widget : field.getWidgets()) {
+ handleWidget(widget);
+ }
+ }
+
+
+ addFieldString(field);
+ if (field instanceof PDNonTerminalField) {
+ int r = currentRecursiveDepth + 1;
+ xhtml.startElement("ol");
+ for (PDField child : ((PDNonTerminalField)field).getChildren()) {
+ processAcroField(child, r);
+ }
+ xhtml.endElement("ol");
+ }
+ }
+
+ private void addFieldString(PDField field) throws SAXException {
+ //Pick partial name to present in content and altName for attribute
+ //Ignoring FullyQualifiedName for now
+ String partName = field.getPartialName();
+ String altName = field.getAlternateFieldName();
+
+ StringBuilder sb = new StringBuilder();
+ AttributesImpl attrs = new AttributesImpl();
+
+ if (partName != null) {
+ sb.append(partName).append(": ");
+ }
+ if (altName != null) {
+ attrs.addAttribute("", "altName", "altName", "CDATA", altName);
+ }
+ //return early if PDSignature field
+ if (field instanceof PDSignatureField) {
+ handleSignature(attrs, (PDSignatureField) field);
+ return;
+ }
+ String value = field.getValueAsString();
+ if (value != null && !value.equals("null")) {
+ sb.append(value);
+ }
+
+ if (attrs.getLength() > 0 || sb.length() > 0) {
+ xhtml.startElement("li", attrs);
+ xhtml.characters(sb.toString());
+ xhtml.endElement("li");
+ }
+ }
+
+ private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField)
+ throws SAXException {
+
+ PDSignature sig = sigField.getSignature();
+ if (sig == null) {
+ return;
+ }
+ Map<String, String> vals = new TreeMap<>();
+ vals.put("name", sig.getName());
+ vals.put("contactInfo", sig.getContactInfo());
+ vals.put("location", sig.getLocation());
+ vals.put("reason", sig.getReason());
+
+ Calendar cal = sig.getSignDate();
+ if (cal != null) {
+ dateFormat.setTimeZone(cal.getTimeZone());
+ vals.put("date", dateFormat.format(cal.getTime()));
+ }
+ //see if there is any data
+ int nonNull = 0;
+ for (String val : vals.keySet()) {
+ if (val != null && !val.equals("")) {
+ nonNull++;
+ }
+ }
+ //if there is, process it
+ if (nonNull > 0) {
+ metadata.set(TikaCoreProperties.HAS_SIGNATURE, "true");
+ xhtml.startElement("li", parentAttributes);
+
+ AttributesImpl attrs = new AttributesImpl();
+ attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");
+
+ xhtml.startElement("ol", attrs);
+ for (Map.Entry<String, String> e : vals.entrySet()) {
+ if (e.getValue() == null || e.getValue().equals("")) {
+ continue;
+ }
+ attrs = new AttributesImpl();
+ attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey());
+ xhtml.startElement("li", attrs);
+ xhtml.characters(e.getValue());
+ xhtml.endElement("li");
+ }
+ xhtml.endElement("ol");
+ xhtml.endElement("li");
+ }
+ }
+
+
+ private static PDActionURI getActionURI(PDAnnotation annot) {
+ //copied and pasted from PDFBox's PrintURLs
+
+ // use reflection to catch all annotation types that have getAction()
+ // If you can't use reflection, then check for classes
+ // PDAnnotationLink and PDAnnotationWidget, and call getAction() and check for a
+ // PDActionURI result type
+ try {
+ Method actionMethod = annot.getClass().getDeclaredMethod("getAction");
+ if (actionMethod.getReturnType().equals(PDAction.class)) {
+ PDAction action = (PDAction) actionMethod.invoke(annot);
+ if (action instanceof PDActionURI) {
+ return (PDActionURI) action;
+ }
+ }
+ }
+ catch (NoSuchMethodException|IllegalAccessException|InvocationTargetException e) {
+ }
+ return null;
+ }
+
+ /**
+ * we need to override this because we are overriding {@link #processPages(PDPageTree)}
+ * @return
+ */
+ @Override
+ public int getCurrentPageNo() {
+ return pageIndex+1;
+ }
+
+ /**
+ * See TIKA-2845 for why we need to override this.
+ *
+ * @param pages
+ * @throws IOException
+ */
+ @Override
+ protected void processPages(PDPageTree pages) throws IOException {
+ //we currently need this hack because we aren't able to increment
+ //the private currentPageNo in PDFTextStripper,
+ //and PDFTextStripper's processPage relies on that variable
+ //being >= startPage when deciding whether or not to process a page
+ // See:
+ // if (currentPageNo >= startPage && currentPageNo <= endPage
+ // && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber)
+ // && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber))
+ // {
+ super.setStartPage(-1);
+ for (PDPage page : pages) {
+ if (getCurrentPageNo() >= getStartPage()
+ && getCurrentPageNo() <= getEndPage()) {
+ processPage(page);
+ }
+ pageIndex++;
+ }
+ }
+
+ @Override
+ public void setStartBookmark(PDOutlineItem pdOutlineItem) {
+ throw new UnsupportedOperationException("We don't currently support this -- See PDFTextStripper's processPages() for how to implement this.");
+ }
+
+ @Override
+ public void setEndBookmark(PDOutlineItem pdOutlineItem) {
+ throw new UnsupportedOperationException("We don't currently support this -- See PDFTextStripper's processPages() for how to implement this.");
+ }
+
+ @Override
+ public void setStartPage(int startPage) {
+ this.startPage = startPage;
+ }
+
+ @Override
+ public int getStartPage() {
+ return startPage;
+ }
+
+ @Override
+ protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode, Vector displacement) throws IOException
+ {
+ super.showGlyph(textRenderingMatrix, font, code, unicode, displacement);
+ if (unicode == null || unicode.isEmpty()) {
+ unmappedUnicodeCharsPerPage++;
+ }
+ totalCharsPerPage++;
+ }
+}